-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
dc284ef
commit 4c7afc0
Showing
20 changed files
with
13,231 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/bash
# This script documents the steps taken to set up this repo for tesstutorial
# for use with a language other than eng, e.g. tam in Tamil script.
## DO NOT RERUN - some langdata files were also added manually.
#
# Files fetched, relative to ~/tess4training:
#   langdata/<SCRIPT>.unicharset and langdata/<SCRIPT>.xheights
#   langdata/<lang>/<lang>.{lstm.training_text,training_text,punc,numbers}
#   tesseract/tessdata/best/<lang>.traineddata and <SCRIPT>.traineddata

# Stop at the first failing step: every download below relies on the
# preceding cd having reached the intended directory.
set -euo pipefail

readonly SCRIPT=Tamil
# Deliberately not named LANG: LANG is the locale environment variable, and
# reassigning it would change the locale seen by every command started below.
readonly LANG_CODE=tam

readonly RAW_BASE=https://raw.githubusercontent.com/tesseract-ocr
readonly BEST_BASE=https://github.com/tesseract-ocr/tessdata_best/raw/master

# Script-level files live directly in langdata/.
cd ~/tess4training
cd langdata

wget -O "${SCRIPT}.unicharset" "${RAW_BASE}/langdata_lstm/master/${SCRIPT}.unicharset"
wget -O "${SCRIPT}.xheights" "${RAW_BASE}/langdata_lstm/master/${SCRIPT}.xheights"

# Language-level files live in langdata/<lang>/; -p tolerates an existing dir.
mkdir -p "${LANG_CODE}"
cd "${LANG_CODE}"
# The langdata_lstm training text is saved under a distinct local name so it
# does not collide with the langdata training text fetched right after it.
wget -O "${LANG_CODE}.lstm.training_text" "${RAW_BASE}/langdata_lstm/master/${LANG_CODE}/${LANG_CODE}.training_text"
wget -O "${LANG_CODE}.training_text" "${RAW_BASE}/langdata/master/${LANG_CODE}/${LANG_CODE}.training_text"
wget -O "${LANG_CODE}.punc" "${RAW_BASE}/langdata/master/${LANG_CODE}/${LANG_CODE}.punc"
wget -O "${LANG_CODE}.numbers" "${RAW_BASE}/langdata/master/${LANG_CODE}/${LANG_CODE}.numbers"

# Both the language model and the script model are stored under tessdata/best.
cd ~/tess4training
cd tesseract/tessdata/best
wget -O "${LANG_CODE}.traineddata" "${BEST_BASE}/${LANG_CODE}.traineddata"
wget -O "${SCRIPT}.traineddata" "${BEST_BASE}/script/${SCRIPT}.traineddata"
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
|
||
***** Run lstmtraining with debug output for first 100 iterations. | ||
|
||
Loaded file ../tesstutorial/trainplusminustheta/eng.lstm, unpacking... | ||
Warning: LSTMTrainer deserialized an LSTMRecognizer! | ||
Code range changed from 111 to 113! | ||
Num (Extended) outputs,weights in Series: | ||
1,36,0,1:1, 0 | ||
Num (Extended) outputs,weights in Series: | ||
C3,3:9, 0 | ||
Ft16:16, 160 | ||
Total weights = 160 | ||
[C3,3Ft16]:16, 160 | ||
Mp3,3:16, 0 | ||
Lfys64:64, 20736 | ||
Lfx96:96, 61824 | ||
Lrx96:96, 74112 | ||
Lfx512:512, 1247232 | ||
Fc113:113, 57969 | ||
Total weights = 1462033 | ||
Previous null char=110 mapped to 112 | ||
Continuing from ../tesstutorial/trainplusminustheta/eng.lstm | ||
Loaded 169/169 lines (1-169) of document ../tesstutorial/trainplusminustheta/eng.Arial_Bold.exp0.lstmf | ||
Loaded 169/169 lines (1-169) of document ../tesstutorial/trainplusminustheta/eng.Arial_Bold_Italic.exp0.lstmf | ||
Iteration 0: GROUND TRUTH : Ø TRADEMARKS §120.871 Gilmore, FREE More Number Low trying AWARD, ('Beaver | ||
Iteration 0: BEST OCR TEXT : @ TRADEMARKS §120.871 Gilmore, FREE More Number Low trying AWARD, ('Beaver | ||
File /tmp/eng-2020-05-25.fhP/eng.Arial_Bold.exp0.lstmf line 33 : | ||
Mean rms=0.734%, delta=0.811%, train=2.703%(9.091%), skip ratio=0% | ||
Iteration 1: GROUND TRUTH : or SC used By October Technology City And Business could Services (1) in Services 12 for | ||
File ../tesstutorial/trainplusminustheta/eng.Arial_Bold.exp0.lstmf line 1 (Perfect): | ||
Mean rms=0.434%, delta=0.405%, train=1.351%(4.545%), skip ratio=0% | ||
Iteration 2: GROUND TRUTH : does YOU OH 30 them its 1 comments are November URL Reply of a San'a' I've some The to: | ||
File ../tesstutorial/trainplusminustheta/eng.Arial_Bold.exp0.lstmf line 2 : | ||
Mean rms=0.424%, delta=0.366%, train=0.901%(3.03%), skip ratio=0% |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
#!/bin/bash
# Prepares the layertamil run: clears earlier generated output and builds a
# shuffled training text from three Tamil source texts.

# Abort on the first failure. In particular, a failed cd must never be
# followed by the relative rm -rf calls below.
set -euo pipefail

# Every relative path used from here on (../langdata, ../tesstutorial,
# src/training, ./tessdata) is resolved from inside ./tesseract.
cd ./tesseract

# Remove generated data from any earlier run.
# NOTE(review): ../tesstutorial/testlayertamil, which is written further down,
# is not cleared here - confirm whether that is intentional.
rm -rf ../tesstutorial/trainlayertamil
rm -rf ../tesstutorial/evallayertamil

# Concatenate the three source texts into one training text.
cat \
  ../langdata/tam/tam.langdata.training_text \
  ../langdata/tam/tam.splcases.training_text \
  ../langdata/tam/tam.ta.wikisource.1jz9.training_text \
  >../langdata/tam/tam.layertamil.training_text

# Randomise the line order in place. shuf reads all of its input before it
# opens the -o file, so using the same path for input and output is safe.
shuf -o ../langdata/tam/tam.layertamil.training_text \
  <../langdata/tam/tam.layertamil.training_text
|
||
# Fonts passed to tesstrain.sh for the training set. Names are kept exactly
# as written, including the entries with trailing or embedded commas.
train_fonts=(
  "AdSriTamilSans"
  "Akshar Unicode"
  "Arial Unicode MS"
  "Arima Madurai"
  "Arima Madurai Bold"
  "Arima Madurai Heavy"
  "Arima Madurai Light"
  "Arima Madurai Medium"
  "Arima Madurai Ultra-Bold"
  "Baloo Thambi"
  "Catamaran"
  "Catamaran Bold"
  "Catamaran Heavy"
  "Catamaran Light"
  "Catamaran Medium"
  "Catamaran Semi-Bold"
  "Catamaran Ultra-Bold"
  "Coiny"
  "Droid Sans Tamil"
  "Droid Sans Tamil Bold"
  "ETTamilNew"
  "FreeSans"
  "FreeSerif"
  "FreeSerif Bold"
  "GIST-TMOTAbhirami Bold"
  "GIST-TMOTAbhirami Ultra-Heavy Italic"
  "GIST-TMOTAmala Bold"
  "GIST-TMOTAmala Ultra-Heavy Italic"
  "GIST-TMOTAppar Bold"
  "GIST-TMOTAppar Ultra-Heavy Italic"
  "GIST-TMOTChanakya"
  "GIST-TMOTChanakya Bold"
  "GIST-TMOTChanakya Italic"
  "GIST-TMOTChanakya Ultra-Heavy Italic"
  "GIST-TMOTChandra Bold"
  "GIST-TMOTChandra Ultra-Heavy Italic"
  "GIST-TMOTHeena Bold"
  "GIST-TMOTHeena Ultra-Heavy Italic"
  "GIST-TMOTIlango"
  "GIST-TMOTIlango Bold"
  "GIST-TMOTKalyani Bold"
  "GIST-TMOTKalyani Ultra-Heavy Italic"
  "GIST-TMOTKamal"
  "GIST-TMOTKamal Bold"
  "GIST-TMOTKamal Italic"
  "GIST-TMOTKamal Ultra-Heavy Italic"
  "GIST-TMOTKannadasan"
  "GIST-TMOTKannadasan Italic"
  "GIST-TMOTKannagi Bold"
  "GIST-TMOTKannagi Ultra-Heavy Italic"
  "GIST-TMOTKomala Bold"
  "GIST-TMOTKomala Ultra-Heavy Italic"
  "GIST-TMOTKrishnan Bold"
  "GIST-TMOTKumudam"
  "GIST-TMOTLalitha"
  "GIST-TMOTLalitha Bold"
  "GIST-TMOTLalitha Italic"
  "GIST-TMOTLalitha Ultra-Heavy Italic"
  "GIST-TMOTMadhura Bold"
  "GIST-TMOTMina Bold"
  "GIST-TMOTNambi"
  "GIST-TMOTNambi Bold"
  "GIST-TMOTNambi Italic"
  "GIST-TMOTNambi Ultra-Heavy Italic"
  "GIST-TMOTPadma"
  "GIST-TMOTPadma Bold"
  "GIST-TMOTParvathi Bold"
  "GIST-TMOTPattinathar"
  "GIST-TMOTPattinathar Bold"
  "GIST-TMOTPattinathar Bold Italic"
  "GIST-TMOTPattinathar Italic"
  "GIST-TMOTSuman Bold"
  "Hind Madurai"
  "Hind Madurai Bold"
  "Hind Madurai Light"
  "Hind Madurai Medium"
  "Hind Madurai Semi-Bold"
  "Karla Tamil Inclined Bold Italic"
  "Karla Tamil Inclined Italic"
  "Karla Tamil Upright"
  "Karla Tamil Upright Bold"
  "Kavivanar"
  "Latha"
  "Latha Bold"
  "Lohit Tamil"
  "Lohit Tamil Classical"
  "Meera Inimai"
  "Mukta Malar"
  "Mukta Malar Bold"
  "Mukta Malar Light"
  "Mukta Malar Medium"
  "Mukta Malar Semi-Bold"
  "Mukta Malar Ultra-Bold"
  "Nirmala UI"
  "Nirmala UI Bold"
  "Nirmala UI Semi-Light"
  "Noto Sans Tamil"
  "Noto Sans Tamil Bold"
  "Noto Sans Tamil UI"
  "Noto Sans Tamil UI Bold"
  "Noto Serif Tamil"
  "Noto Serif Tamil Bold"
  "Pavanam"
  "Post No Bills Jaffna"
  "Post No Bills Jaffna Bold"
  "Post No Bills Jaffna ExtraBold, Ultra-Bold"
  "Post No Bills Jaffna Light, Light"
  "Post No Bills Jaffna Medium, Medium"
  "Post No Bills Jaffna SemiBold, Semi-Bold"
  "SUNDARAM-0806"
  "SUNDARAM-0807"
  "SUNDARAM-0808"
  "SUNDARAM-0810"
  "SUNDARAM-0812"
  "SUNDARAM-0819"
  "SUNDARAM-0820"
  "SUNDARAM-0821"
  "SUNDARAM-0823"
  "SUNDARAM-0824"
  "SUNDARAM-0827"
  "SUNDARAM-0830"
  "SUNDARAM-0831"
  "SUNDARAM-1341"
  "SUNDARAM-1342"
  "SUNDARAM-1351"
  "SUNDARAM-1352"
  "SUNDARAM-2852"
  "SUNDARAM-2865"
  "SUNDARAM-3811"
  "SakalBharati"
  "Sri Tamil"
  "Sri Tamil Bold"
  "Sri Tamil Bold Oblique"
  "Sri Tamil Oblique"
  "Sri Tamil Sans"
  "Sri Tamil Sans Oblique"
  "TABUni-Tamil021"
  "TABUni-Tamil032"
  "TAMUni-Tamil042"
  "TAMUni-Tamil046"
  "TAMUni-Tamil150"
  "TAMUni-Tamil195"
  "TAMu_Kadambri"
  "TAMu_Kalyani"
  "TAMu_Maduram"
  "TAU-Achu"
  "TAU-Achu Italic,"
  "TAU-Barathi"
  "TAU-Barathi Bold"
  "TAU-Barathi Bold Italic"
  "TAU-Barathi Italic"
  "TAU-Ezhil"
  "TAU-Ezhil Bold, Bold"
  "TAU-Ezhil Italic, Italic"
  "TAU-Kabilar"
  "TAU-Kabilar Bold"
  "TAU-Kabilar Bold Italic"
  "TAU-Kabilar Italic"
  "TAU-Kambar"
  "TAU-Kambar Bold"
  "TAU-Kambar Bold Italic"
  "TAU-Kambar Italic"
  "TAU-Kaveri"
  "TAU-Kaveri Bold"
  "TAU-Kaveri Bold Italic"
  "TAU-Kaveri Italic"
  "TAU-Kurinji"
  "TAU-Kurinji Bold, Bold"
  "TAU-Kurinji Italic, Medium Italic"
  "TAU-Malar"
  "TAU-Malar Bold, Bold"
  "TAU-Malar Italic, Italic"
  "TAU-Marutham"
  "TAU-Marutham Bold,"
  "TAU-Marutham Italic,"
  "TAU-Mullai Bold, Bold"
  "TAU-Mullai Italic"
  "TAU-Mullai Italic, Italic"
  "TAU-Neythal"
  "TAU-Neythal Bold, Bold"
  "TAU-Neythal Italic, Italic"
  "TAU-Nilavu Bold, Bold"
  "TAU-Nilavu Italic"
  "TAU-Nilavu Italic, Italic"
  "TAU-Paalai"
  "TAU-Paalai Bold, Bold"
  "TAU-Paalai Italic, Italic"
  "TAU-Urai"
  "TAU-Urai Bold,"
  "TAU-Urai Italic, Italic"
  "TAU-Valluvar"
  "TAU-Valluvar Bold"
  "TAU-Valluvar Bold Italic"
  "TAU-Valluvar Italic"
  "TAU_Elango_"
  "Vijaya"
  "Vijaya Bold"
  "e-Grantamil"
)

echo -e "\n***** Making training data for trainlayertamil set for layertamil training."
# Run tesstrain.sh on the layertamil training text, writing its output to
# ../tesstutorial/trainlayertamil; each font name is passed as its own
# argument after --fontlist.
bash src/training/tesstrain.sh \
  --fonts_dir /home/ubuntu/.fonts \
  --lang tam \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ../langdata \
  --training_text ../langdata/tam/tam.layertamil.training_text \
  --tessdata_dir ./tessdata \
  --output_dir ../tesstutorial/trainlayertamil \
  --fontlist "${train_fonts[@]}"
|
||
# Fonts passed to tesstrain.sh for the test set.
test_fonts=(
  "Arial Unicode MS"
  "FreeSerif"
  "Lohit Tamil"
  "Lohit Tamil Classical"
  "Vijaya"
  "e-Grantamil"
)

echo -e "\n***** Making test data for testlayertamil set for layertamil training."
# Same tesstrain.sh invocation shape as the training set, but reading the
# layertamiltest text, writing to ../tesstutorial/testlayertamil, and adding
# --save_box_tiff.
bash src/training/tesstrain.sh \
  --fonts_dir /home/ubuntu/.fonts \
  --lang tam \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ../langdata \
  --training_text ../langdata/tam/tam.layertamiltest.training_text \
  --tessdata_dir ./tessdata \
  --output_dir ../tesstutorial/testlayertamil \
  --save_box_tiff \
  --fontlist "${test_fonts[@]}"
|
||
|
||
# Fonts passed to tesstrain.sh for the evaluation set.
eval_fonts=(
  "Lohit Tamil Classical"
  "e-Grantamil"
)

# The message names the fonts actually passed below; it previously claimed
# the Impact font, which is not in the list.
echo -e "\n***** Making evaluation data for evallayertamil set for layertamil training using Lohit Tamil Classical and e-Grantamil fonts."
# Run tesstrain.sh on the layertamileval text, writing its output to
# ../tesstutorial/evallayertamil.
bash src/training/tesstrain.sh \
  --fonts_dir /home/ubuntu/.fonts \
  --lang tam \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ../langdata \
  --training_text ../langdata/tam/tam.layertamileval.training_text \
  --tessdata_dir ./tessdata \
  --output_dir ../tesstutorial/evallayertamil \
  --fontlist "${eval_fonts[@]}"
|
||
# Directory that receives the extracted model; recreated empty on every run.
model_dir=../tesstutorial/tam_layertamil_tam
rm -rf "${model_dir}"
mkdir -p "${model_dir}"

echo -e "\n***** Extract LSTM model from best traineddata for Tamil. \n"
# Source is the script-level Tamil.traineddata under tessdata/best; the
# result is written as tam.lstm inside the model directory.
combine_tessdata -e ./tessdata/best/Tamil.traineddata "${model_dir}/tam.lstm"
|
Oops, something went wrong.