26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__ 27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ 29 #include "allheaders.h" 39 class BLOB_CHOICE_LIST_CLIST;
101 class CubeLineObject;
103 class CubeRecoContext;
105 class EquationDetect;
107 #ifndef NO_CUBE_BUILD 108 class TesseractCubeCombiner;
// Default constructor: leaves word, row, block and prev_word all NULL.
145 WordData() : word(NULL), row(NULL), block(NULL), prev_word(NULL) {}
147 : word(page_res_it.word()), row(page_res_it.row()->row),
148 block(page_res_it.block()->block), prev_word(NULL) {}
150 : word(word_res), row(row_in), block(block_in), prev_word(NULL) {}
179 void ResetAdaptiveClassifier();
181 void ResetDocumentDictionary();
192 pixDestroy(&pix_binary_);
202 pixDestroy(&pix_grey_);
203 pix_grey_ = grey_pix;
208 pixDestroy(&pix_original_);
209 pix_original_ = original_pix;
// Accessor for the page image held in pix_original_. Returns the stored
// pointer itself (no copy is made here) and does not modify the object.
218 Pix*
BestPix()
const {
return pix_original_; }
220 pixDestroy(&pix_thresholds_);
221 pix_thresholds_ = thresholds;
224 return source_resolution_;
227 source_resolution_ = ppi;
230 return pixGetWidth(pix_binary_);
233 return pixGetHeight(pix_binary_);
236 return scaled_color_;
239 return scaled_factor_;
242 scaled_factor_ = factor;
243 scaled_color_ = color;
253 return right_to_left_;
256 return sub_langs_.size();
259 return sub_langs_[index];
264 for (
int i = 0; i < sub_langs_.size(); ++i) {
265 if (sub_langs_[i]->tessedit_ocr_engine_mode !=
OEM_CUBE_ONLY)
271 void SetBlackAndWhitelist();
277 void PrepareForPageseg();
284 void PrepareForTessOCR(BLOCK_LIST* block_list,
287 int SegmentPage(
const STRING* input_file, BLOCK_LIST* blocks,
289 void SetupWordScripts(BLOCK_LIST* blocks);
290 int AutoPageSeg(
PageSegMode pageseg_mode, BLOCK_LIST* blocks,
291 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
295 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
296 Pix** music_mask_pix);
301 bool ProcessTargetWord(
const TBOX& word_box,
const TBOX& target_word_box,
302 const char* word_config,
int pass);
304 void SetupAllWordsPassN(
int pass_n,
305 const TBOX* target_word_box,
306 const char* word_config,
310 void SetupWordPassN(
int pass_n,
WordData* word);
312 bool RecogAllWordsPassN(
int pass_n,
ETEXT_DESC* monitor,
315 bool recog_all_words(
PAGE_RES* page_res,
317 const TBOX* target_word_box,
318 const char* word_config,
320 void rejection_passes(
PAGE_RES* page_res,
322 const TBOX* target_word_box,
323 const char* word_config);
324 void bigram_correction_pass(
PAGE_RES *page_res);
325 void blamer_pass(
PAGE_RES* page_res);
327 void script_pos_pass(
PAGE_RES* page_res);
331 int RetryWithLanguage(
const WordData& word_data,
338 bool ReassignDiacritics(
int pass,
PAGE_RES_IT* pr_it,
339 bool* make_next_word_fuzzy);
344 void AssignDiacriticsToOverlappingBlobs(
358 bool SelectGoodDiacriticOutlines(
int pass,
float certainty_threshold,
373 STRING* best_str,
float* c2);
374 void classify_word_and_language(
int pass_n,
PAGE_RES_IT* pr_it,
376 void classify_word_pass1(
const WordData& word_data,
379 void recog_pseudo_word(
PAGE_RES* page_res,
380 TBOX &selection_box);
386 const char *lengths);
388 void classify_word_pass2(
const WordData& word_data,
391 void ReportXhtFixResult(
bool accept_new_word,
float new_x_ht,
397 bool TestNewNormalization(
int original_misfits,
float baseline_shift,
403 void set_word_fonts(
WERD_RES *word);
404 void font_recognition_pass(
PAGE_RES* page_res);
405 void dictionary_correction_pass(
PAGE_RES* page_res);
409 bool SubAndSuperscriptFix(
WERD_RES *word_res);
410 void GetSubAndSuperscriptCandidates(
const WERD_RES *word,
411 int *num_rebuilt_leading,
413 float *leading_certainty,
414 int *num_rebuilt_trailing,
416 float *trailing_certainty,
417 float *avg_certainty,
418 float *unlikely_threshold);
419 WERD_RES *TrySuperscriptSplits(
int num_chopped_leading,
420 float leading_certainty,
422 int num_chopped_trailing,
423 float trailing_certainty,
428 int *retry_trailing);
429 bool BelievableSuperscript(
bool debug,
431 float certainty_threshold,
433 int *right_ok)
const;
436 #ifndef NO_CUBE_BUILD 437 bool init_cube_objects(
bool load_combiner,
441 void run_cube_combiner(
PAGE_RES *page_res);
455 void fill_werd_res(
const BoxWord& cube_box_word,
456 const char* cube_best_str,
458 bool extract_cube_state(
CubeObject* cube_obj,
int* num_chars,
459 Boxa** char_boxes,
CharSamp*** char_samples);
460 bool create_cube_box_word(Boxa *char_boxes,
int num_chars,
465 void output_pass(
PAGE_RES_IT &page_res_it,
const TBOX *target_word_box);
470 void set_unlv_suspects(
WERD_RES *word);
472 BOOL8 acceptable_number_string(
const char *s,
473 const char *lengths);
482 int init_tesseract(
const char *arg0,
483 const char *textbase,
484 const char *language,
490 bool set_only_init_params);
492 const char *language,
494 return init_tesseract(datapath, NULL, language, oem,
495 NULL, 0, NULL, NULL,
false);
513 int init_tesseract_internal(
const char *arg0,
514 const char *textbase,
515 const char *language,
521 bool set_only_init_params);
525 void SetupUniversalFontIds();
527 int init_tesseract_lm(
const char *arg0,
528 const char *textbase,
529 const char *language);
531 void recognize_page(
STRING& image_name);
532 void end_tesseract();
534 bool init_tesseract_lang_data(
const char *arg0,
535 const char *textbase,
536 const char *language,
542 bool set_only_init_params);
544 void ParseLanguageString(
const char* lang_str,
550 #ifndef GRAPHICS_DISABLED 551 void pgeditor_main(
int width,
int height,
PAGE_RES* page_res);
552 #endif // GRAPHICS_DISABLED 553 void process_image_event(
555 BOOL8 process_cmd_win_event(
559 void debug_word(
PAGE_RES* page_res,
const TBOX &selection_box);
569 void blob_feature_display(
PAGE_RES* page_res,
const TBOX& selection_box);
574 inT16 first_alphanum_index(
const char *word,
575 const char *word_lengths);
576 inT16 first_alphanum_offset(
const char *word,
577 const char *word_lengths);
578 inT16 alpha_count(
const char *word,
579 const char *word_lengths);
581 const char *word_lengths);
583 inT16 count_alphanums(
599 void reject_edge_blobs(
WERD_RES *word);
600 void reject_mostly_rejects(
WERD_RES *word);
602 BOOL8 word_adaptable(
607 void recog_word_recursive(
WERD_RES* word);
609 void split_and_recog_word(
WERD_RES* word);
618 BOOL8 digit_or_numeric_punct(
WERD_RES *word,
int char_position);
619 inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
620 void match_current_words(WERD_RES_LIST &words,
ROW *row,
BLOCK* block);
621 inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
622 void fix_noisy_space_list(WERD_RES_LIST &best_perm,
ROW *row,
BLOCK* block);
623 void fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
ROW *row,
BLOCK* block);
624 void fix_sp_fp_word(WERD_RES_IT &word_res_it,
ROW *row,
BLOCK* block);
625 void fix_fuzzy_spaces(
629 void dump_words(WERD_RES_LIST &perm,
inT16 score,
632 inT16 worst_noise_blob(
WERD_RES *word_res,
float *worst_noise_score);
633 float blob_noise_score(
TBLOB *blob);
634 void break_noisiest_blob_word(WERD_RES_LIST &words);
641 void unrej_good_quality_words(
643 void doc_and_block_rejection(
645 BOOL8 good_quality_doc);
646 void quality_based_rejection(
PAGE_RES_IT &page_res_it,
647 BOOL8 good_quality_doc);
648 void convert_bad_unlv_chs(
WERD_RES *word_res);
652 inT16 *accepted_match_count);
654 inT16 count_outline_errs(
char c,
inT16 outline_count);
662 process_selected_words (
665 TBOX & selection_box,
668 void tess_add_doc_word(
671 void tess_segment_pass_n(
int pass_n,
WERD_RES *word);
672 bool tess_acceptable_word(
WERD_RES *word);
696 BLOCK_LIST *block_list);
700 void PreenXHeights(BLOCK_LIST *block_list);
705 BLOCK_LIST *block_list);
719 bool ResegmentCharBox(
PAGE_RES* page_res,
const TBOX *prev_box,
720 const TBOX& box,
const TBOX& next_box,
721 const char* correct_text);
728 bool ResegmentWordBox(BLOCK_LIST *block_list,
729 const TBOX& box,
const TBOX& next_box,
730 const char* correct_text);
733 void ReSegmentByClassification(
PAGE_RES* page_res);
736 bool ConvertStringToUnichars(
const char* utf8,
753 int choices_pos,
int choices_length,
764 void ReportFailedBox(
int boxfile_lineno,
TBOX box,
const char *box_ch,
765 const char *err_msg);
767 void CorrectClassifyWords(
PAGE_RES* page_res);
770 void ApplyBoxTraining(
const STRING& fontname,
PAGE_RES* page_res);
774 int CountMisfitTops(
WERD_RES *word_res);
779 float ComputeCompatibleXheight(
WERD_RES *word_res,
float* baseline_shift);
782 BOOL_VAR_H(tessedit_resegment_from_boxes,
false,
783 "Take segmentation and labeling from box file");
784 BOOL_VAR_H(tessedit_resegment_from_line_boxes,
false,
785 "Conversion of word/line box file to char box file");
787 "Generate training data from boxed chars");
788 BOOL_VAR_H(tessedit_make_boxes_from_boxes,
false,
789 "Generate more boxes from boxed chars");
790 BOOL_VAR_H(tessedit_dump_pageseg_images,
false,
791 "Dump intermediate images made during page segmentation");
793 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 794 " 5=line, 6=word, 7=char" 795 " (Values from PageSegMode enum in publictypes.h)");
797 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" 798 " to loading and running only Tesseract (no Cube, no combiner)." 799 " (Values from OcrEngineMode enum in tesseractclass.h)");
801 "Blacklist of chars not to recognize");
803 "Whitelist of chars to recognize");
805 "List of chars to override tessedit_char_blacklist");
807 "Perform training for ambiguities");
808 INT_VAR_H(pageseg_devanagari_split_strategy,
810 "Whether to use the top-line splitting process for Devanagari " 811 "documents while performing page-segmentation.");
814 "Whether to use the top-line splitting process for Devanagari " 815 "documents while performing ocr.");
817 "Write all parameters to the given file.");
819 "Generate and print debug information for adaption");
822 INT_VAR_H(applybox_page, 0,
"Page number to apply boxes from");
824 "Exposure value follows this pattern in the image" 825 " filename. The name of the image files are expected" 826 " to be in the form [lang].[fontname].exp[num].tif");
827 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode,
false,
828 "Learn both character fragments (as is done in the" 829 " special low exposure mode) as well as unfragmented" 832 "Each bounding box is assumed to contain ngrams. Only" 833 " learn the ngrams whose outlines overlap horizontally.");
834 BOOL_VAR_H(tessedit_display_outwords,
false,
"Draw output words");
835 BOOL_VAR_H(tessedit_dump_choices,
false,
"Dump char choices");
836 BOOL_VAR_H(tessedit_timing_debug,
false,
"Print timing stats");
838 "Try to improve fuzzy spaces");
840 "Don't bother with word plausibility");
841 BOOL_VAR_H(tessedit_fix_hyphens,
true,
"Crunch double hyphens?");
842 BOOL_VAR_H(tessedit_redo_xheight,
true,
"Check/Correct x-height");
844 "Add words to the document dictionary");
845 BOOL_VAR_H(tessedit_debug_fonts,
false,
"Output font info per char");
846 BOOL_VAR_H(tessedit_debug_block_rejection,
false,
"Block and Row stats");
847 BOOL_VAR_H(tessedit_enable_bigram_correction,
true,
848 "Enable correction based on the word bigram dictionary.");
849 BOOL_VAR_H(tessedit_enable_dict_correction,
false,
850 "Enable single word correction based on the dictionary.");
851 INT_VAR_H(tessedit_bigram_debug, 0,
"Amount of debug output for bigram " 854 "Remove and conditionally reassign small outlines when they" 855 " confuse layout analysis, determining diacritics vs noise");
856 INT_VAR_H(debug_noise_removal, 0,
"Debug reassignment of small outlines");
859 double_VAR_H(noise_cert_basechar, -8.0,
"Hingepoint for base char certainty");
862 double_VAR_H(noise_cert_disjoint, -2.5,
"Hingepoint for disjoint certainty");
865 double_VAR_H(noise_cert_punc, -2.5,
"Threshold for new punc char certainty");
868 "Scaling on certainty diff from Hingepoint");
869 INT_VAR_H(noise_maxperblob, 8,
"Max diacritics to apply to a blob");
870 INT_VAR_H(noise_maxperword, 16,
"Max diacritics to apply to a word");
872 BOOL_VAR_H(debug_acceptable_wds,
false,
"Dump word pass/fail chk");
874 STRING_VAR_H(chs_trailing_punct1,
").,;:?!",
"1st Trailing punctuation");
875 STRING_VAR_H(chs_trailing_punct2,
")'`\"",
"2nd Trailing punctuation");
876 double_VAR_H(quality_rej_pc, 0.08,
"good_quality_doc lte rejection limit");
877 double_VAR_H(quality_blob_pc, 0.0,
"good_quality_doc gte good blobs limit");
879 "good_quality_doc lte outline error limit");
880 double_VAR_H(quality_char_pc, 0.95,
"good_quality_doc gte good char limit");
881 INT_VAR_H(quality_min_initial_alphas_reqd, 2,
"alphas in a good word");
882 INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
883 "Adaptation decision algorithm for tess");
885 "Do minimal rejection on pass 1 output");
886 BOOL_VAR_H(tessedit_test_adaption,
false,
"Test adaption criteria");
887 BOOL_VAR_H(tessedit_matcher_log,
false,
"Log matcher activity");
888 INT_VAR_H(tessedit_test_adaption_mode, 3,
889 "Adaptation decision algorithm for tess");
893 INT_VAR_H(paragraph_debug_level, 0,
"Print paragraph debug info.");
895 "Run paragraph detection on the post-text-recognition " 897 INT_VAR_H(cube_debug_level, 1,
"Print cube debug info.");
899 STRING_VAR_H(outlines_2,
"ij!?%\":;",
"Non standard number of outlines");
900 BOOL_VAR_H(docqual_excuse_outline_errs,
false,
901 "Allow outline errs in unrejection?");
903 "Reduce rejection on good docs");
904 BOOL_VAR_H(tessedit_use_reject_spaces,
true,
"Reject spaces?");
906 "%rej allowed before rej whole doc");
908 "%rej allowed before rej whole block");
910 "%rej allowed before rej whole row");
912 "Number of row rejects in whole word rejects" 913 "which prevents whole row rejection");
914 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds,
true,
915 "Only rej partially rejected words in block rejection");
916 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds,
true,
917 "Only rej partially rejected words in row rejection");
918 BOOL_VAR_H(tessedit_dont_blkrej_good_wds,
false,
919 "Use word segmentation quality metric");
920 BOOL_VAR_H(tessedit_dont_rowrej_good_wds,
false,
921 "Use word segmentation quality metric");
922 INT_VAR_H(tessedit_preserve_min_wd_len, 2,
923 "Only preserve wds longer than this");
925 "Apply row rejection to good docs");
927 "rej good doc wd if more than this fraction rejected");
928 BOOL_VAR_H(tessedit_reject_bad_qual_wds,
true,
929 "Reject all bad quality wds");
930 BOOL_VAR_H(tessedit_debug_doc_rejection,
false,
"Page stats");
931 BOOL_VAR_H(tessedit_debug_quality_metrics,
false,
932 "Output data to debug file");
933 BOOL_VAR_H(bland_unrej,
false,
"unrej potential with no chekcs");
935 "good_quality_doc gte good char limit");
937 "Mark v.bad words for tilde crunch");
939 "Add font info to hocr output");
940 BOOL_VAR_H(crunch_early_merge_tess_fails,
true,
"Before word crunch?");
941 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs,
false,
"Take out ~^ early?");
945 "crunch garbage cert lt this");
946 double_VAR_H(crunch_poor_garbage_rate, 60,
"crunch garbage rating lt this");
947 double_VAR_H(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this");
948 double_VAR_H(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this");
949 BOOL_VAR_H(crunch_pot_garbage,
true,
"POTENTIAL crunch garbage");
950 double_VAR_H(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this");
954 double_VAR_H(crunch_del_min_width, 3.0,
"Del if word width lt xht x this");
956 "Del if word gt xht x this above bl");
957 double_VAR_H(crunch_del_low_word, 0.5,
"Del if word gt xht x this below bl");
958 double_VAR_H(crunch_small_outlines_size, 0.6,
"Small if lt xht x this");
959 INT_VAR_H(crunch_rating_max, 10,
"For adj length in rating per ch");
960 INT_VAR_H(crunch_pot_indicators, 1,
"How many potential indicators needed");
961 BOOL_VAR_H(crunch_leave_ok_strings,
true,
"Don't touch sensible strings");
962 BOOL_VAR_H(crunch_accept_ok,
true,
"Use acceptability in okstring");
963 BOOL_VAR_H(crunch_leave_accept_strings,
false,
964 "Don't pot crunch sensible strings");
965 BOOL_VAR_H(crunch_include_numerals,
false,
"Fiddle alpha figures");
967 "Don't crunch words with long lower case strings");
969 "Don't crunch words with long lower case strings");
970 INT_VAR_H(crunch_long_repetitions, 3,
"Crunch words with long repetitions");
973 "How many non-noise blbs either side?");
974 double_VAR_H(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this");
975 BOOL_VAR_H(tessedit_prefer_joined_punct,
false,
"Reward punctation joins");
976 INT_VAR_H(fixsp_done_mode, 1,
"What constitues done for spacing");
977 INT_VAR_H(debug_fix_space_level, 0,
"Contextual fixspace debug");
979 "Punct. chs expected WITHIN numbers");
981 "Max allowed deviation of blob top outside of font data");
982 INT_VAR_H(x_ht_min_change, 8,
"Min change in xht before actually trying it");
983 INT_VAR_H(superscript_debug, 0,
"Debug level for sub & superscript fixer");
984 double_VAR_H(superscript_worse_certainty, 2.0,
"How many times worse " 985 "certainty does a superscript position glyph need to be for us " 986 "to try classifying it as a char with a different baseline?");
987 double_VAR_H(superscript_bettered_certainty, 0.97,
"What reduction in " 988 "badness do we think sufficient to choose a superscript over " 989 "what we'd thought. For example, a value of 0.6 means we want " 990 "to reduce badness of certainty by 40%");
992 "A superscript scaled down more than this is unbelievably " 993 "small. For example, 0.3 means we expect the font size to " 994 "be no smaller than 30% of the text line font size.");
996 "Maximum top of a character measured as a multiple of x-height " 997 "above the baseline for us to reconsider whether it's a " 1000 "Minimum bottom of a character measured as a multiple of " 1001 "x-height above the baseline for us to reconsider whether it's " 1003 BOOL_VAR_H(tessedit_write_block_separators,
false,
1004 "Write block separators in output");
1006 "Write repetition char code");
1007 BOOL_VAR_H(tessedit_write_unlv,
false,
"Write .unlv output file");
1008 BOOL_VAR_H(tessedit_create_txt,
false,
"Write .txt output file");
1009 BOOL_VAR_H(tessedit_create_hocr,
false,
"Write .html hOCR output file");
1010 BOOL_VAR_H(tessedit_create_tsv,
false,
"Write .tsv output file");
1011 BOOL_VAR_H(tessedit_create_pdf,
false,
"Write .pdf output file");
1013 "Output char for unidentified blobs");
1016 "Min suspect level for rejecting spaces");
1017 INT_VAR_H(suspect_short_words, 2,
"Don't Suspect dict wds longer than this");
1018 BOOL_VAR_H(suspect_constrain_1Il,
false,
"UNLV keep 1Il chars rejected");
1019 double_VAR_H(suspect_rating_per_ch, 999.9,
"Don't touch bad rating limit");
1021 BOOL_VAR_H(tessedit_minimal_rejection,
false,
"Only reject tess failures");
1022 BOOL_VAR_H(tessedit_zero_rejection,
false,
"Don't reject ANYTHING");
1024 "Make output have exactly one word per WERD");
1025 BOOL_VAR_H(tessedit_zero_kelvin_rejection,
false,
1026 "Don't reject ANYTHING AT ALL");
1027 BOOL_VAR_H(tessedit_consistent_reps,
true,
"Force all rep chars the same");
1032 "Aspect ratio dot/hyphen test");
1034 "Aspect ratio dot/hyphen test");
1035 BOOL_VAR_H(rej_trust_doc_dawg,
false,
"Use DOC dawg in 11l conf. detector");
1037 BOOL_VAR_H(rej_1Il_trust_permuter_type,
true,
"Don't double check");
1038 BOOL_VAR_H(rej_use_tess_accepted,
true,
"Individual rejection control");
1039 BOOL_VAR_H(rej_use_tess_blanks,
true,
"Individual rejection control");
1040 BOOL_VAR_H(rej_use_good_perm,
true,
"Individual rejection control");
1042 BOOL_VAR_H(rej_alphas_in_number_perm,
false,
"Extend permuter check");
1043 double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85,
"if >this fract");
1044 INT_VAR_H(tessedit_image_border, 2,
"Rej blbs near image edge limit");
1045 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds,
"-?*\075",
1046 "Allow NN to unrej");
1048 INT_VAR_H(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this");
1049 BOOL_VAR_H(tessedit_create_boxfile,
false,
"Output text with boxes");
1051 "-1 -> All pages, else specifc page to process");
1052 BOOL_VAR_H(tessedit_write_images,
false,
"Capture the image from the IPE");
1053 BOOL_VAR_H(interactive_display_mode,
false,
"Run interactively?");
1055 BOOL_VAR_H(tessedit_override_permuter,
true,
"According to dict_word");
1056 INT_VAR_H(tessdata_manager_debug_level, 0,
1057 "Debug level for TessdataManager functions.");
1059 "List of languages to load with this one");
1060 BOOL_VAR_H(tessedit_use_primary_params_model,
false,
1061 "In multilingual mode use params model of the primary language");
1065 "Min acceptable orientation margin");
1066 BOOL_VAR_H(textord_tabfind_show_vlines,
false,
"Debug line finding");
1069 "Allow feature extractors to see the original outline");
1071 "Only initialize with the config file. Useful if the instance is " 1072 "not going to be used for OCR but say only for layout analysis.");
1073 BOOL_VAR_H(textord_equation_detect,
false,
"Turn on equation detector");
1074 BOOL_VAR_H(textord_tabfind_vertical_text,
true,
"Enable vertical detection");
1075 BOOL_VAR_H(textord_tabfind_force_vertical_text,
false,
1076 "Force using vertical text page mode");
1078 "Fraction of textlines deemed vertical to use vertical page " 1080 double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
1081 "Fraction of height used as a minimum gap for aligned blobs.");
1082 INT_VAR_H(tessedit_parallelize, 0,
"Run in parallel where possible");
1084 "Preserve multiple interword spaces");
1086 "Include page separator string in output text after each " 1089 "Page separator (default is form feed control character)");
1099 BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix,
true,
1100 "find horizontal lines such as headers in vertical page mode");
1101 INT_VAR_H(tessedit_ok_mode, 5,
"Acceptance decision algorithm");
1102 BOOL_VAR_H(load_fixed_length_dawgs,
true,
"Load fixed length" 1103 " dawgs (e.g. for non-space delimited languages)");
1104 INT_VAR_H(segment_debug, 0,
"Debug the whole segmentation process");
1106 double_VAR_H(bestrate_pruning_factor, 2.0,
"Multiplying factor of" 1107 " current best rate to prune other hypotheses");
1109 "Turn on word script consistency permuter");
1111 "incorporate segmentation cost in word rating?");
1113 "Score multipler for script consistency within a word. " 1114 "Being a 'reward' factor, it should be <= 1. " 1115 "Smaller value implies bigger reward.");
1117 "Turn on fixed-length phrasebook search permuter");
1119 "Turn on character type (property) consistency permuter");
1121 "Score multipler for char type consistency within a word. ");
1123 "Score multipler for ngram permuter's best choice" 1124 " (only used in the Han script path).");
1126 "Activate character-level n-gram-based permuter");
1127 BOOL_VAR_H(permute_only_top,
false,
"Run only the top choice permuter");
1128 INT_VAR_H(language_model_fixed_length_choices_depth, 3,
1129 "Depth of blob choice lists to explore" 1130 " when fixed length dawgs are on");
1132 "use new state cost heuristics for segmentation state evaluation");
1134 "base factor for adding segmentation cost into word rating." 1135 "It's a multiplying factor, the larger the value above 1, " 1136 "the bigger the effect of segmentation cost.");
1138 "weight associated with char rating in combined cost of state");
1140 "weight associated with width evidence in combined cost of" 1143 "weight associated with seam cut in combined cost of state");
1145 "max char width-to-height ratio allowed in segmentation");
1147 "Enable new segmentation search path.");
1148 double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
1149 "Maximum character width-to-height ratio for" 1150 "fixed pitch fonts");
1154 FILE *init_recog_training(
const STRING &fname);
1155 void recog_training_segmented(
const STRING &fname,
1159 void ambigs_classify_and_output(
const char *label,
1163 #ifndef NO_CUBE_BUILD 1171 const char* backup_config_file_;
1184 Pix* pix_thresholds_;
1187 int source_resolution_;
1194 bool right_to_left_;
1206 int font_table_size_;
1207 #ifndef NO_CUBE_BUILD 1219 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void SetScaledColor(int factor, Pix *color)
Pix * scaled_color() const
bool right_to_left() const
int source_resolution() const
void set_pix_thresholds(Pix *thresholds)
#define STRING_VAR_H(name, val, comment)
const Textord & textord() const
void flip_0O(WERD_RES *word)
const FCOORD & reskew() const
void set_pix_grey(Pix *grey_pix)
#define INT_VAR_H(name, val, comment)
bool tilde_crunch_written
Textord * mutable_textord()
Pix ** mutable_pix_binary()
void dont_allow_1Il(WERD_RES *word)
inT32 adaption_word_number
void flip_hyphens(WERD_RES *word)
void set_source_resolution(int ppi)
#define BOOL_VAR_H(name, val, comment)
CubeRecoContext * GetCubeRecoContext()
bool write_results_empty_block
inT16 doc_good_char_quality
bool last_char_was_newline
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
PointerVector< WERD_RES > lang_words
inT16 word_blob_quality(WERD_RES *word, ROW *row)
BOOL8 non_0_digit(const char *str, int length)
WordData(const PAGE_RES_IT &page_res_it)
int scaled_factor() const
Assume a single uniform block of text. (Default.)
void set_pix_original(Pix *original_pix)
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
Pix * pix_original() const
Tesseract * get_sub_lang(int index) const
#define double_VAR_H(name, val, comment)
int num_sub_langs() const