Skip to content

Commit

Permalink
Merge pull request #4 from spectrtrec/dev
Browse files Browse the repository at this point in the history
Add base test, new libs and fix document format. Change version of rust and tesseract.
  • Loading branch information
spectrtrec authored Nov 14, 2023
2 parents f5e3bdc + 052caf1 commit 02e182a
Show file tree
Hide file tree
Showing 10 changed files with 318 additions and 46 deletions.
8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,10 @@ edition = "2021"
[dependencies]
tesseract-sys = "~0.5.15"
tesseract-plumbing = "~0.9.0"
thiserror = "1.0"
thiserror = "1.0"
field_accessor = "0.5.2"
either = "1.9"
mockall = "0.11.2"
derivative = "2.2.0"
glob = "0.3.1"

4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM rust:1.72
FROM rust:1.73

ARG TAG=5.3.1
ARG TAG=5.3.3

RUN mkdir /tmp/tesseract && \
echo "Update & upgrade" && \
Expand Down
2 changes: 1 addition & 1 deletion src/constanst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ impl TesseractDefaultConstants {
pub const DEFAULT_DPI: i32 = 300;
pub const DEFAULT_PSM: u32 = 4;
pub const DEFAULT_OEM: u32 = 3;
}
}
2 changes: 1 addition & 1 deletion src/errors.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::fmt::{self, Display};
use std::error::Error;

#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum TesseractError{
TesseractInitError,
NoSuchFileException,
Expand Down
3 changes: 2 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use rusttesserast::tess_lib::TesseractApi;

fn main() {
let mut tesseract_base = TesseractApi::new(Some(String::from("/usr/local/share/tessdata").as_str()), Some(String::from("rus").as_str())).unwrap();
//let mut tesseract_base = TesseractApi{dpi: 3, psm:3, oem:3, ..Default::default()};
let mut tesseract_base = TesseractApi::new(Some(TesseractApi{dpi: 3, psm:3, ..Default::default()}), Some(String::from("/usr/local/share/tessdata").as_str()), Some(String::from("rus").as_str())).unwrap();
let image_array = vec!["/workspaces/rusttesserast/tests/resipients.png"];
let test = tesseract_base.recognize_doc(None, None, image_array, "tsv");
println!("{:?}", test);
Expand Down
104 changes: 65 additions & 39 deletions src/tess_lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,27 @@ use crate::constanst::TesseractDefaultConstants;
use crate::errors::TesseractError;
use crate::file_types::OutputFileFormat;
use crate::utils::get_current_working_dir;
use derivative::Derivative;
use either::*;
use pl::TessBaseApiInitError;
use std::ffi::CString;
use std::fs::{metadata, File};
use std::io::Write;
use std::ops::Deref;
use std::path::Path;
use std::process::Command;
use tesseract_plumbing as pl;

#[derive(Default)]
#[derive(Derivative)]
#[derivative(Default, Debug)]
pub struct TesseractApi {
dpi: i32,
psm: u32,
oem: u32,
timeout: i32,
tess_pl: pl::TessBaseApi,
#[derivative(Default(value = "300"))]
pub dpi: i32,
#[derivative(Default(value = "4"))]
pub psm: u32,
#[derivative(Default(value = "3"))]
pub oem: u32,
#[derivative(Default(value = "30"))]
pub timeout: i32,
pub tess_pl: pl::TessBaseApi,
}

impl TesseractApi {
Expand All @@ -30,10 +36,32 @@ impl TesseractApi {
};
}

pub fn new(datapath: Option<&str>, lang: Option<&str>) -> Result<TesseractApi, TesseractError> {
let mut tesseract = TesseractApi::default();
tesseract.tess_pl.set_source_resolution(tesseract.dpi);
tesseract.tess_pl.set_page_seg_mode(tesseract.psm);
/// Looks up one of the numeric settings of this instance by its field name.
///
/// The two signed (`i32`) settings, `"dpi"` and `"timeout"`, are returned in
/// the `Left` arm; the two unsigned (`u32`) settings, `"psm"` and `"oem"`,
/// are returned in the `Right` arm. Any other name produces
/// `Right(Err(..))` carrying a message that echoes the rejected name.
pub fn get_attr(
    &self,
    field_string: &str,
) -> Either<Result<&i32, String>, Result<&u32, String>> {
    // Signed settings are handled first and leave through the left arm.
    if field_string == "dpi" {
        return Left(Ok(&self.dpi));
    }
    if field_string == "timeout" {
        return Left(Ok(&self.timeout));
    }

    // Everything else, including the unknown-name error, uses the right arm.
    let unsigned = match field_string {
        "psm" => Ok(&self.psm),
        "oem" => Ok(&self.oem),
        unknown => Err(format!("invalid field name to get '{}'", unknown)),
    };
    Right(unsigned)
}

pub fn new(
tesseract: Option<TesseractApi>,
datapath: Option<&str>,
lang: Option<&str>,
) -> Result<TesseractApi, TesseractError> {
let mut tess = match tesseract {
Some(tesseract) => tesseract,
None => TesseractApi::default(),
};

tess.tess_pl.set_source_resolution(tess.dpi);
tess.tess_pl.set_page_seg_mode(tess.psm);

let datapath = match datapath {
Some(i) => Some(CString::new(i).unwrap()),
None => None,
Expand All @@ -42,19 +70,25 @@ impl TesseractApi {
Some(i) => Some(CString::new(i).unwrap()),
None => None,
};
tesseract
match tess
.tess_pl
.init_4(datapath.as_deref(), lang.as_deref(), tesseract.oem)
.ok();
Ok(tesseract)
.init_4(datapath.as_deref(), lang.as_deref(), tess.oem)
{
Ok(()) => Ok(tess),
Err(TessBaseApiInitError {}) => Err(TesseractError::TesseractInitError),
}
}

fn image_to_string(&mut self, filename: &str) -> Result<String, TesseractError> {
pub fn set_image(&mut self, filename: &str) -> Result<(), TesseractError> {
match pl::leptonica_plumbing::Pix::read(&CString::new(filename).unwrap()) {
Ok(pix) => self.tess_pl.set_image_2(&pix),
Err(PixReadError) => return Err(TesseractError::TesseractInitError),
Err(_) => return Err(TesseractError::NoSuchFileException),
};
Ok(())
}

pub fn image_to_string(&mut self, filename: &str) -> Result<String, TesseractError> {
self.set_image(filename)?;
Ok(self
.tess_pl
.get_utf8_text()
Expand All @@ -64,10 +98,8 @@ impl TesseractApi {
.into_owned())
}

fn image_to_hocr(&mut self, filename: &str) -> Result<String, TesseractError> {
self.tess_pl.set_image_2(
&pl::leptonica_plumbing::Pix::read(&CString::new(filename).unwrap()).unwrap(),
);
pub fn image_to_hocr(&mut self, filename: &str) -> Result<String, TesseractError> {
self.set_image(filename)?;
Ok(self
.tess_pl
.get_hocr_text(0)
Expand All @@ -77,10 +109,8 @@ impl TesseractApi {
.into_owned())
}

fn image_to_tsv(&mut self, filename: &str) -> Result<String, TesseractError> {
self.tess_pl.set_image_2(
&pl::leptonica_plumbing::Pix::read(&CString::new(filename).unwrap()).unwrap(),
);
pub fn image_to_tsv(&mut self, filename: &str) -> Result<String, TesseractError> {
self.set_image(filename)?;
Ok(self
.tess_pl
.get_tsv_text(0)
Expand All @@ -90,6 +120,7 @@ impl TesseractApi {
.into_owned())
}

#[allow(dead_code)]
fn get_text(&mut self) -> Result<String, TesseractError> {
Ok(self
.tess_pl
Expand All @@ -100,7 +131,7 @@ impl TesseractApi {
.into_owned())
}

fn iter_through_img(
pub fn iter_through_img(
&mut self,
api_ogject: fn(&mut TesseractApi, &str) -> Result<String, TesseractError>,
image_array: Vec<&str>,
Expand All @@ -112,26 +143,26 @@ impl TesseractApi {
rec_vec
}

fn save_doc(&mut self, path: Option<&str>, file_name: Option<&str>, doc_vec: Vec<String>) {
pub fn save_doc(&mut self, path: Option<&str>, file_name: Option<&str>, doc_vec: Vec<String>) {
let binding = get_current_working_dir();

let path = match path {
Some(path) => path,
None => binding.as_os_str().to_str().unwrap(),
};

let defaul_filename = String::from("data.txt");

let file_name = match file_name {
Some(file_name) => file_name,
None => &defaul_filename,
};

if !Path::new(path).exists() {
format!("Path {path} doesnt exist.");
panic!("Path {path} doesnt exist. Use another path.")
}

let mut data_file =
File::create(path.to_owned() + "/" + file_name).expect("creation failed");
data_file
Expand All @@ -147,7 +178,6 @@ impl TesseractApi {
output_type: &str,
) -> Result<(), TesseractError> {
let output_type = match output_type {
"pdf" => OutputFileFormat::PDF,
"txt" => OutputFileFormat::TXT,
"tsv" => OutputFileFormat::TSV,
"HOCR" => OutputFileFormat::HOCR,
Expand All @@ -163,12 +193,8 @@ impl TesseractApi {
}
_ => panic!("None existing format"),
};

self.save_doc(
save_path,
doc_name,
doc,
);

self.save_doc(save_path, doc_name, doc);
Ok(())
}
}
1 change: 0 additions & 1 deletion src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::{env::current_dir, path::PathBuf};

/// Returns the current working directory of the process as an owned path.
///
/// # Panics
///
/// Panics if the working directory cannot be determined, which is the case
/// when `std::env::current_dir` returns an error (for example the directory
/// no longer exists or the process lacks permission to access it).
pub fn get_current_working_dir() -> PathBuf {
    // `current_dir` already yields an owned `PathBuf`, so it is queried once
    // and returned as-is: no throwaway binding and no extra copy.
    current_dir().expect("current working directory should be accessible")
}
10 changes: 10 additions & 0 deletions tests/data.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 1 0 0 0 0 0 0 343 430 -1
2 1 1 0 0 0 94 4 112 25 -1
3 1 1 1 0 0 94 4 112 25 -1
4 1 1 1 1 0 101 4 83 10 -1
5 1 1 1 1 1 101 4 35 10 59.081154 Helto
5 1 1 1 1 2 143 4 41 10 18.626404 World!
4 1 1 1 2 0 94 20 112 9 -1
5 1 1 1 2 1 94 20 5 9 83.044060 I
5 1 1 1 2 2 108 22 21 7 93.075371 use
5 1 1 1 2 3 137 20 69 9 31.186249 tesseract
Binary file added tests/test_img.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 02e182a

Please sign in to comment.