44 #include "allheaders.h"
59 "Take segmentation and labeling from box file",
61 BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
62 "Conversion of word/line box file to char box file",
65 "Generate training data from boxed chars", this->params()),
67 "Generate more boxes from boxed chars", this->params()),
69 "Dump intermediate images made during page segmentation",
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
76 " 5=line, 6=word, 7=char"
77 " (Values from PageSegMode enum in publictypes.h)",
80 "Which OCR engine(s) to run (Tesseract, Cube, both)."
81 " Defaults to loading and running only Tesseract"
82 " (no Cube,no combiner)."
83 " Values from OcrEngineMode enum in tesseractclass.h)",
86 "Blacklist of chars not to recognize", this->params()),
88 "Whitelist of chars to recognize", this->params()),
90 "List of chars to override tessedit_char_blacklist",
93 "Perform training for ambiguities", this->params()),
96 "Whether to use the top-line splitting process for Devanagari "
97 "documents while performing page-segmentation.",
101 "Whether to use the top-line splitting process for Devanagari "
102 "documents while performing ocr.",
105 "Write all parameters to the given file.", this->params()),
107 "Generate and print debug"
108 " information for adaption",
110 INT_MEMBER(bidi_debug, 0,
"Debug level for BiDi", this->params()),
111 INT_MEMBER(applybox_debug, 1,
"Debug level", this->params()),
112 INT_MEMBER(applybox_page, 0,
"Page number to apply boxes from",
115 "Exposure value follows"
116 " this pattern in the image filename. The name of the image"
117 " files are expected to be in the form"
118 " [lang].[fontname].exp[num].tif",
120 BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
121 "Learn both character fragments (as is done in the"
122 " special low exposure mode) as well as unfragmented"
127 " is assumed to contain ngrams. Only learn the ngrams"
128 " whose outlines overlap horizontally.",
130 BOOL_MEMBER(tessedit_display_outwords, false,
"Draw output words",
132 BOOL_MEMBER(tessedit_dump_choices, false,
"Dump char choices",
134 BOOL_MEMBER(tessedit_timing_debug, false,
"Print timing stats",
137 "Try to improve fuzzy spaces", this->params()),
139 "Don't bother with word plausibility", this->params()),
140 BOOL_MEMBER(tessedit_fix_hyphens, true,
"Crunch double hyphens?",
142 BOOL_MEMBER(tessedit_redo_xheight, true,
"Check/Correct x-height",
145 "Add words to the document dictionary", this->params()),
146 BOOL_MEMBER(tessedit_debug_fonts, false,
"Output font info per char",
148 BOOL_MEMBER(tessedit_debug_block_rejection, false,
"Block and Row stats",
150 BOOL_MEMBER(tessedit_enable_bigram_correction, true,
151 "Enable correction based on the word bigram dictionary.",
153 BOOL_MEMBER(tessedit_enable_dict_correction, false,
154 "Enable single word correction based on the dictionary.",
157 "Amount of debug output for bigram correction.",
160 "Remove and conditionally reassign small outlines when they"
161 " confuse layout analysis, determining diacritics vs noise",
163 INT_MEMBER(debug_noise_removal, 0,
"Debug reassignment of small outlines",
169 "Hingepoint for base char certainty", this->params()),
173 "Hingepoint for disjoint certainty", this->params()),
177 "Threshold for new punc char certainty", this->params()),
180 "Scaling on certainty diff from Hingepoint",
182 INT_MEMBER(noise_maxperblob, 8,
"Max diacritics to apply to a blob",
184 INT_MEMBER(noise_maxperword, 16,
"Max diacritics to apply to a word",
186 INT_MEMBER(debug_x_ht_level, 0,
"Reestimate debug", this->params()),
187 BOOL_MEMBER(debug_acceptable_wds, false,
"Dump word pass/fail chk",
189 STRING_MEMBER(chs_leading_punct,
"('`\"",
"Leading punctuation",
191 STRING_MEMBER(chs_trailing_punct1,
").,;:?!",
"1st Trailing punctuation",
193 STRING_MEMBER(chs_trailing_punct2,
")'`\"",
"2nd Trailing punctuation",
196 "good_quality_doc lte rejection limit", this->params()),
198 "good_quality_doc gte good blobs limit", this->params()),
200 "good_quality_doc lte outline error limit", this->params()),
202 "good_quality_doc gte good char limit", this->params()),
203 INT_MEMBER(quality_min_initial_alphas_reqd, 2,
"alphas in a good word",
206 "Adaptation decision algorithm for tess", this->params()),
208 "Do minimal rejection on pass 1 output", this->params()),
209 BOOL_MEMBER(tessedit_test_adaption, false,
"Test adaption criteria",
211 BOOL_MEMBER(tessedit_matcher_log, false,
"Log matcher activity",
214 "Adaptation decision algorithm for tess", this->params()),
215 BOOL_MEMBER(test_pt, false,
"Test for point", this->params()),
216 double_MEMBER(test_pt_x, 99999.99,
"xcoord", this->params()),
217 double_MEMBER(test_pt_y, 99999.99,
"ycoord", this->params()),
218 INT_MEMBER(paragraph_debug_level, 0,
"Print paragraph debug info.",
221 "Run paragraph detection on the post-text-recognition "
224 INT_MEMBER(cube_debug_level, 0,
"Print cube debug info.", this->params()),
225 STRING_MEMBER(outlines_odd,
"%| ",
"Non standard number of outlines",
227 STRING_MEMBER(outlines_2,
"ij!?%\":;",
"Non standard number of outlines",
230 "Allow outline errs in unrejection?", this->params()),
232 "Reduce rejection on good docs", this->params()),
233 BOOL_MEMBER(tessedit_use_reject_spaces, true,
"Reject spaces?",
236 "%rej allowed before rej whole doc", this->params()),
238 "%rej allowed before rej whole block", this->params()),
240 "%rej allowed before rej whole row", this->params()),
242 "Number of row rejects in whole word rejects"
243 "which prevents whole row rejection",
245 BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
246 "Only rej partially rejected words in block rejection",
248 BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
249 "Only rej partially rejected words in row rejection",
252 "Use word segmentation quality metric", this->params()),
254 "Use word segmentation quality metric", this->params()),
256 "Only preserve wds longer than this", this->params()),
258 "Apply row rejection to good docs", this->params()),
260 "rej good doc wd if more than this fraction rejected",
263 "Reject all bad quality wds", this->params()),
264 BOOL_MEMBER(tessedit_debug_doc_rejection, false,
"Page stats",
267 "Output data to debug file", this->params()),
268 BOOL_MEMBER(bland_unrej, false,
"unrej potential with no chekcs",
271 "good_quality_doc gte good char limit", this->params()),
273 "Mark v.bad words for tilde crunch", this->params()),
274 BOOL_MEMBER(hocr_font_info, false,
"Add font info to hocr output",
276 BOOL_MEMBER(crunch_early_merge_tess_fails, true,
"Before word crunch?",
278 BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
279 "Take out ~^ early?", this->params()),
280 double_MEMBER(crunch_terrible_rating, 80.0,
"crunch rating lt this",
282 BOOL_MEMBER(crunch_terrible_garbage, true,
"As it says", this->params()),
284 "crunch garbage cert lt this", this->params()),
286 "crunch garbage rating lt this", this->params()),
287 double_MEMBER(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this",
289 double_MEMBER(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this",
291 BOOL_MEMBER(crunch_pot_garbage, true,
"POTENTIAL crunch garbage",
293 double_MEMBER(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this",
295 double_MEMBER(crunch_del_cert, -10.0,
"POTENTIAL crunch cert lt this",
297 double_MEMBER(crunch_del_min_ht, 0.7,
"Del if word ht lt xht x this",
299 double_MEMBER(crunch_del_max_ht, 3.0,
"Del if word ht gt xht x this",
302 "Del if word width lt xht x this", this->params()),
304 "Del if word gt xht x this above bl", this->params()),
306 "Del if word gt xht x this below bl", this->params()),
307 double_MEMBER(crunch_small_outlines_size, 0.6,
"Small if lt xht x this",
309 INT_MEMBER(crunch_rating_max, 10,
"For adj length in rating per ch",
312 "How many potential indicators needed", this->params()),
313 BOOL_MEMBER(crunch_leave_ok_strings, true,
"Don't touch sensible strings",
315 BOOL_MEMBER(crunch_accept_ok, true,
"Use acceptability in okstring",
318 "Don't pot crunch sensible strings", this->params()),
319 BOOL_MEMBER(crunch_include_numerals, false,
"Fiddle alpha figures",
322 "Don't crunch words with long lower case strings",
325 "Don't crunch words with long lower case strings",
328 "Crunch words with long repetitions", this->params()),
329 INT_MEMBER(crunch_debug, 0,
"As it says", this->params()),
331 "How many non-noise blbs either side?", this->params()),
332 double_MEMBER(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this",
335 "Reward punctation joins", this->params()),
336 INT_MEMBER(fixsp_done_mode, 1,
"What constitues done for spacing",
338 INT_MEMBER(debug_fix_space_level, 0,
"Contextual fixspace debug",
341 "Punct. chs expected WITHIN numbers", this->params()),
343 "Max allowed deviation of blob top outside of font data",
346 "Min change in xht before actually trying it", this->params()),
348 "Debug level for sub & superscript fixer", this->params()),
350 superscript_worse_certainty, 2.0,
351 "How many times worse "
352 "certainty does a superscript position glyph need to be for "
353 "us to try classifying it as a char with a different "
357 superscript_bettered_certainty, 0.97,
359 "badness do we think sufficient to choose a superscript "
360 "over what we'd thought. For example, a value of 0.6 means "
361 "we want to reduce badness of certainty by at least 40%",
364 "A superscript scaled down more than this is unbelievably "
365 "small. For example, 0.3 means we expect the font size to "
366 "be no smaller than 30% of the text line font size.",
369 "Maximum top of a character measured as a multiple of "
370 "x-height above the baseline for us to reconsider whether "
374 "Minimum bottom of a character measured as a multiple of "
375 "x-height above the baseline for us to reconsider whether "
376 "it's a superscript.",
378 BOOL_MEMBER(tessedit_write_block_separators, false,
379 "Write block separators in output", this->params()),
380 BOOL_MEMBER(tessedit_write_rep_codes, false,
"Write repetition char code",
382 BOOL_MEMBER(tessedit_write_unlv, false,
"Write .unlv output file",
384 BOOL_MEMBER(tessedit_create_txt, false,
"Write .txt output file",
386 BOOL_MEMBER(tessedit_create_hocr, false,
"Write .html hOCR output file",
388 BOOL_MEMBER(tessedit_create_tsv, false,
"Write .tsv output file",
390 BOOL_MEMBER(tessedit_create_pdf, false,
"Write .pdf output file",
393 "Output char for unidentified blobs", this->params()),
394 INT_MEMBER(suspect_level, 99,
"Suspect marker level", this->params()),
396 "Min suspect level for rejecting spaces", this->params()),
398 "Don't suspect dict wds longer than this", this->params()),
399 BOOL_MEMBER(suspect_constrain_1Il, false,
"UNLV keep 1Il chars rejected",
402 "Don't touch bad rating limit", this->params()),
403 double_MEMBER(suspect_accept_rating, -999.9,
"Accept good rating limit",
406 "Only reject tess failures", this->params()),
407 BOOL_MEMBER(tessedit_zero_rejection, false,
"Don't reject ANYTHING",
410 "Make output have exactly one word per WERD", this->params()),
412 "Don't reject ANYTHING AT ALL", this->params()),
414 "Force all rep chars the same", this->params()),
415 INT_MEMBER(tessedit_reject_mode, 0,
"Rejection algorithm",
417 BOOL_MEMBER(tessedit_rejection_debug, false,
"Adaption debug",
419 BOOL_MEMBER(tessedit_flip_0O, true,
"Contextual 0O O0 flips",
422 "Aspect ratio dot/hyphen test", this->params()),
424 "Aspect ratio dot/hyphen test", this->params()),
426 "Use DOC dawg in 11l conf. detector", this->params()),
427 BOOL_MEMBER(rej_1Il_use_dict_word, false,
"Use dictword test",
429 BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
"Don't double check",
431 BOOL_MEMBER(rej_use_tess_accepted, true,
"Individual rejection control",
433 BOOL_MEMBER(rej_use_tess_blanks, true,
"Individual rejection control",
435 BOOL_MEMBER(rej_use_good_perm, true,
"Individual rejection control",
437 BOOL_MEMBER(rej_use_sensible_wd, false,
"Extend permuter check",
439 BOOL_MEMBER(rej_alphas_in_number_perm, false,
"Extend permuter check",
442 "if >this fract", this->params()),
443 INT_MEMBER(tessedit_image_border, 2,
"Rej blbs near image edge limit",
446 "Allow NN to unrej", this->params()),
447 STRING_MEMBER(conflict_set_I_l_1,
"Il1[]",
"Il1 conflict set",
449 INT_MEMBER(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this",
451 BOOL_MEMBER(tessedit_create_boxfile, false,
"Output text with boxes",
455 " , else specifc page to process",
458 "Capture the image from the IPE", this->params()),
459 BOOL_MEMBER(interactive_display_mode, false,
"Run interactively?",
461 STRING_MEMBER(file_type,
".tif",
"Filename extension", this->params()),
462 BOOL_MEMBER(tessedit_override_permuter, true,
"According to dict_word",
466 " TessdataManager functions.",
469 "List of languages to load with this one", this->params()),
470 BOOL_MEMBER(tessedit_use_primary_params_model, false,
471 "In multilingual mode use params model of the"
475 "Min acceptable orientation margin", this->params()),
476 BOOL_MEMBER(textord_tabfind_show_vlines, false,
"Debug line finding",
481 "Allow feature extractors to see the original outline",
484 "Only initialize with the config file. Useful if the "
485 "instance is not going to be used for OCR but say only "
486 "for layout analysis.",
488 BOOL_MEMBER(textord_equation_detect, false,
"Turn on equation detector",
491 "Enable vertical detection", this->params()),
492 BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
493 "Force using vertical text page mode", this->params()),
495 textord_tabfind_vertical_text_ratio, 0.5,
496 "Fraction of textlines deemed vertical to use vertical page "
500 textord_tabfind_aligned_gap_fraction, 0.75,
501 "Fraction of height used as a minimum gap for aligned blobs.",
503 INT_MEMBER(tessedit_parallelize, 0,
"Run in parallel where possible",
506 "Preserve multiple interword spaces", this->params()),
508 "Include page separator string in output text after each "
512 "Page separator (default is form feed control character)",
524 BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
525 "find horizontal lines such as headers in vertical page mode",
527 INT_MEMBER(tessedit_ok_mode, 5,
"Acceptance decision algorithm",
530 "Load fixed length dawgs"
531 " (e.g. for non-space delimited languages)",
533 INT_MEMBER(segment_debug, 0,
"Debug the whole segmentation process",
535 BOOL_MEMBER(permute_debug, 0,
"Debug char permutation process",
538 "Multiplying factor of"
539 " current best rate to prune other hypotheses",
542 "Turn on word script consistency permuter", this->params()),
544 "incorporate segmentation cost in word rating?",
547 "Score multipler for script consistency within a word. "
548 "Being a 'reward' factor, it should be <= 1. "
549 "Smaller value implies bigger reward.",
552 "Turn on fixed-length phrasebook search permuter",
555 "Turn on character type (property) consistency permuter",
558 "Score multipler for char type consistency within a word. ",
561 "Score multipler for ngram permuter's best choice"
562 " (only used in the Han script path).",
565 "Activate character-level n-gram-based permuter",
567 BOOL_MEMBER(permute_only_top, false,
"Run only the top choice permuter",
569 INT_MEMBER(language_model_fixed_length_choices_depth, 3,
570 "Depth of blob choice lists to explore"
571 " when fixed length dawgs are on",
574 "use new state cost heuristics for segmentation state"
578 "base factor for adding segmentation cost into word rating."
579 "It's a multiplying factor, the larger the value above 1, "
580 "the bigger the effect of segmentation cost.",
583 "weight associated with char rating in combined cost of"
587 "weight associated with width evidence in combined cost of"
591 "weight associated with seam cut in combined cost of state",
594 "max char width-to-height ratio allowed in segmentation",
597 "Enable new segmentation search path.", this->params()),
599 "Maximum character width-to-height ratio for"
600 " fixed-pitch fonts",
604 backup_config_file_(NULL),
609 pix_thresholds_(NULL),
610 source_resolution_(0),
612 right_to_left_(false),
617 most_recently_used_(this),
619 #ifndef NO_CUBE_BUILD
621 tess_cube_combiner_(NULL),
628 pixDestroy(&pix_original_);
630 sub_langs_.delete_data_pointers();
631 #ifndef NO_CUBE_BUILD
633 if (cube_cntxt_ != NULL) {
637 if (tess_cube_combiner_ != NULL) {
638 delete tess_cube_combiner_;
639 tess_cube_combiner_ = NULL;
645 pixDestroy(&pix_binary_);
646 pixDestroy(&cube_binary_);
647 pixDestroy(&pix_grey_);
648 pixDestroy(&pix_thresholds_);
649 pixDestroy(&scaled_color_);
650 deskew_ =
FCOORD(1.0f, 0.0f);
651 reskew_ =
FCOORD(1.0f, 0.0f);
654 for (
int i = 0; i < sub_langs_.size(); ++i)
655 sub_langs_[i]->
Clear();
659 equ_detect_ = detector;
666 for (
int i = 0; i < sub_langs_.size(); ++i) {
667 sub_langs_[i]->ResetAdaptiveClassifierInternal();
674 for (
int i = 0; i < sub_langs_.size(); ++i) {
675 sub_langs_[i]->getDict().ResetDocumentDictionary();
685 for (
int i = 0; i < sub_langs_.size(); ++i) {
686 sub_langs_[i]->unicharset.set_black_and_whitelist(
696 pixDestroy(&cube_binary_);
702 for (
int i = 0; i < sub_langs_.size(); ++i) {
705 static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
706 if (pageseg_strategy > max_pageseg_strategy)
707 max_pageseg_strategy = pageseg_strategy;
709 pixDestroy(&sub_langs_[i]->cube_binary_);
710 sub_langs_[i]->cube_binary_ = pixClone(
pix_binary());
711 pixDestroy(&sub_langs_[i]->pix_binary_);
712 sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
718 if (splitter_.
Split(
true)) {
720 pixDestroy(&pix_binary_);
736 for (
int i = 0; i < sub_langs_.size(); ++i) {
739 static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
740 if (ocr_strategy > max_ocr_strategy)
741 max_ocr_strategy = ocr_strategy;
747 bool split_for_ocr = splitter_.
Split(
false);
750 pixDestroy(&pix_binary_);
751 pix_binary_ = pixClone(splitter_.
orig_pix());
756 BLOCK block(
"",
TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
757 pixGetHeight(pix_binary_));
void set_pageseg_split_strategy(SplitStrategy strategy)
void ResetDocumentDictionary()
#define STRING_MEMBER(name, val, comment, vec)
void set_use_cjk_fp_model(bool flag)
Assume a single uniform block of text. (Default.)
void set_segmentation_block_list(BLOCK_LIST *block_list)
void SetEquationDetect(EquationDetect *detector)
char * tessedit_char_blacklist
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
bool textord_use_cjk_fp_model
void SetLangTesseract(Tesseract *lang_tesseract)
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
bool HasDifferentSplitStrategies() const
#define BOOL_MEMBER(name, val, comment, vec)
void SetBlackAndWhitelist()
#define BOOL_INIT_MEMBER(name, val, comment, vec)
bool Split(bool split_for_pageseg)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
void ResetDocumentDictionary()
void ResetAdaptiveClassifierInternal()
char * tessedit_char_unblacklist
C_BLOB_LIST * blob_list()
get blobs
#define INT_INIT_MEMBER(name, val, comment, vec)
void set_ocr_split_strategy(SplitStrategy strategy)
void set_orig_pix(Pix *pix)
void ResetAdaptiveClassifier()
int ocr_devanagari_split_strategy
char * tessedit_char_whitelist
#define double_MEMBER(name, val, comment, vec)
#define INT_MEMBER(name, val, comment, vec)
void extract_edges(Pix *pix, BLOCK *block)
int pageseg_devanagari_split_strategy