19 #ifndef TESSERACT_DICT_DICT_H_ 20 #define TESSERACT_DICT_DICT_H_ 24 #include "dawg_cache.h" 29 #include "unicharset.h" 30 #include "params_training_featdef.h" 35 #define CHARS_PER_LINE 500 36 #define MAX_WERD_LENGTH (int64_t) 128 79 : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
99 return getCCUtil()->unicharset;
102 return getCCUtil()->unicharset;
105 return getCCUtil()->unichar_ambigs;
111 getUnicharset().normed_ids(unichar_id);
112 return normed_ids.
size() == 1 &&
113 (normed_ids[0] == hyphen_unichar_id_ ||
114 normed_ids[0] == slash_unichar_id_);
120 getUnicharset().normed_ids(unichar_id);
121 return normed_ids.
size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
128 !last_word_on_line_ && hyphen_word_;
132 return this->hyphenated() ? hyphen_word_->length() : 0;
138 if (this->hyphenated()) {
139 *word = *hyphen_word_;
140 if (hyphen_debug_level) word->
print(
"copy_hyphen_info: ");
145 if (!last_word_on_line_ || first_pos)
148 getUnicharset().normed_ids(unichar_id);
149 return normed_ids.
size() == 1 && normed_ids[0] == hyphen_unichar_id_;
153 int word_index = word.
length() - 1;
154 return has_hyphen_end(word.
unichar_id(word_index), word_index == 0);
159 void reset_hyphen_vars(
bool last_word_on_line);
181 bool ambigs_mode)
const;
184 bool suppress_patterns)
const;
197 void go_deeper_dawg_fxn(
200 bool word_ending,
WERD_CHOICE *word,
float certainties[],
201 float *limit,
WERD_CHOICE *best_choice,
int *attempts_left,
202 void *void_more_args);
205 void (
Dict::*go_deeper_fxn_)(
const char *debug,
207 int char_choice_index,
210 float certainties[],
float *limit,
212 void *void_more_args);
216 void permute_choices(
219 int char_choice_index,
232 int char_choice_index,
241 bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
242 float curr_rating,
float curr_certainty,
244 const char *debug,
int word_ending,
250 bool fix_replaceable,
259 void ReplaceAmbig(
int wrong_ngram_begin_index,
int wrong_ngram_size,
260 UNICHAR_ID correct_ngram_id,
WERD_CHOICE *werd_choice,
264 int LengthOfShortestAlphaRun(
const WERD_CHOICE &WordChoice)
const;
274 bool AcceptableChoice(
const WERD_CHOICE& best_choice,
279 bool AcceptableResult(
WERD_RES *word)
const;
280 void EndDangerousAmbigs();
282 void DebugWordChoices();
284 void SettupStopperPass1();
286 void SettupStopperPass2();
300 void SetupForLoad(
DawgCache *dawg_cache);
312 if (pending_words_ !=
nullptr)
313 pending_words_->clear();
314 if (document_words_ !=
nullptr)
315 document_words_->clear();
354 int def_letter_is_okay(
void* void_dawg_args,
const UNICHARSET& unicharset,
357 int (
Dict::*letter_is_okay_)(
void* void_dawg_args,
362 UNICHAR_ID unichar_id,
bool word_end)
const {
363 return (this->*letter_is_okay_)(void_dawg_args,
369 double (
Dict::*probability_in_context_)(
const char* lang,
372 const char* character,
373 int character_bytes);
377 const char* character,
378 int character_bytes) {
379 return (this->*probability_in_context_)(
380 getCCUtil()->lang.string(),
381 context, context_bytes,
382 character, character_bytes);
387 const char* lang,
const char* context,
int context_bytes,
388 const char* character,
int character_bytes) {
393 (void)character_bytes;
396 double ngram_probability_in_context(
const char* lang,
399 const char* character,
400 int character_bytes);
403 float (
Dict::*params_model_classify_)(
const char *lang,
void *path);
404 float ParamsModelClassify(
const char *lang,
void *path);
407 ASSERT_HOST(params_model_classify_ !=
nullptr);
408 return (this->*params_model_classify_)(
409 getCCUtil()->lang.string(), path);
413 inline UNICHAR_ID
WildcardID()
const {
return wildcard_unichar_id_; }
415 inline int NumDawgs()
const {
return dawgs_.size(); }
417 inline const Dawg *
GetDawg(
int index)
const {
return dawgs_[index]; }
424 if (edge_ref == NO_EDGE)
return 0;
425 NODE_REF node = dawg->
next_node(edge_ref);
426 if (node == 0) node = NO_EDGE;
434 const Dawg *dawg)
const {
435 if (!dawg)
return ch;
436 switch (dawg->
type()) {
438 return unicharset.
get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
450 UNICHAR_ID unichar_id,
bool word_end,
452 PermuterType *current_permuter)
const;
460 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
461 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
462 perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
463 (numbers_ok && perm == NUMBER_PERM));
465 int valid_word(
const WERD_CHOICE &word,
bool numbers_ok)
const;
467 return valid_word(word,
false);
470 return valid_word(word,
true);
475 return valid_word(word);
487 void add_document_word(
const WERD_CHOICE &best_choice);
491 float additional_adjust,
496 wordseg_rating_adjust_factor_ = f;
499 bool IsSpaceDelimitedLang()
const;
561 STRING_VAR_H(user_words_file,
"",
"A filename of user-provided words.");
562 STRING_VAR_H(user_words_suffix,
"",
563 "A suffix of user-provided words located in tessdata.");
564 STRING_VAR_H(user_patterns_file,
"",
565 "A filename of user-provided patterns.");
566 STRING_VAR_H(user_patterns_suffix,
"",
567 "A suffix of user-provided patterns located in tessdata.");
568 BOOL_VAR_H(load_system_dawg,
true,
"Load system word dawg.");
569 BOOL_VAR_H(load_freq_dawg,
true,
"Load frequent word dawg.");
570 BOOL_VAR_H(load_unambig_dawg,
true,
"Load unambiguous word dawg.");
571 BOOL_VAR_H(load_punc_dawg,
true,
572 "Load dawg with punctuation patterns.");
573 BOOL_VAR_H(load_number_dawg,
true,
"Load dawg with number patterns.");
574 BOOL_VAR_H(load_bigram_dawg,
true,
575 "Load dawg with special word bigrams.");
576 double_VAR_H(xheight_penalty_subscripts, 0.125,
577 "Score penalty (0.1 = 10%) added if there are subscripts " 578 "or superscripts in a word, but it is otherwise OK.");
579 double_VAR_H(xheight_penalty_inconsistent, 0.25,
580 "Score penalty (0.1 = 10%) added if an xheight is " 582 double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
583 "Score multiplier for word matches which have good case and" 584 "are frequent in the given language (lower is better).");
586 double_VAR_H(segment_penalty_dict_case_ok, 1.1,
587 "Score multiplier for word matches that have good case " 588 "(lower is better).");
590 double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
591 "Default score multiplier for word matches, which may have " 592 "case issues (lower is better).");
594 double_VAR_H(segment_penalty_dict_nonword, 1.25,
595 "Score multiplier for glyph fragment segmentations which " 596 "do not match a dictionary word (lower is better).");
598 double_VAR_H(segment_penalty_garbage, 1.50,
599 "Score multiplier for poorly cased strings that are not in" 600 " the dictionary and generally look like garbage (lower is" 602 STRING_VAR_H(output_ambig_words_file,
"",
603 "Output file for ambiguities found in the dictionary");
604 INT_VAR_H(dawg_debug_level, 0,
"Set to 1 for general debug info" 605 ", to 2 for more details, to 3 to see all the debug messages");
606 INT_VAR_H(hyphen_debug_level, 0,
"Debug level for hyphenated words.");
607 INT_VAR_H(max_viterbi_list_size, 10,
"Maximum size of viterbi list.");
608 BOOL_VAR_H(use_only_first_uft8_step,
false,
609 "Use only the first UTF8 step of the given string" 610 " when computing log probabilities.");
611 double_VAR_H(certainty_scale, 20.0,
"Certainty scaling factor");
612 double_VAR_H(stopper_nondict_certainty_base, -2.50,
613 "Certainty threshold for non-dict words");
614 double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
615 "Reject certainty offset");
616 INT_VAR_H(stopper_smallword_size, 2,
617 "Size of dict word to be treated as non-dict word");
618 double_VAR_H(stopper_certainty_per_char, -0.50,
619 "Certainty to add for each dict char above small word size.");
620 double_VAR_H(stopper_allowable_character_badness, 3.0,
621 "Max certaintly variation allowed in a word (in sigma)");
622 INT_VAR_H(stopper_debug_level, 0,
"Stopper debug level");
623 BOOL_VAR_H(stopper_no_acceptable_choices,
false,
624 "Make AcceptableChoice() always return false. Useful" 625 " when there is a need to explore all segmentations");
626 INT_VAR_H(tessedit_truncate_wordchoice_log, 10,
"Max words to keep in list");
627 STRING_VAR_H(word_to_debug,
"",
"Word for which stopper debug information" 628 " should be printed to stdout");
629 STRING_VAR_H(word_to_debug_lengths,
"",
630 "Lengths of unichars in word_to_debug");
631 INT_VAR_H(fragments_debug, 0,
"Debug character fragments");
632 BOOL_VAR_H(segment_nonalphabetic_script,
false,
633 "Don't use any alphabetic-specific tricks." 634 "Set to true in the traineddata config file for" 635 " scripts that are cursive or inherently fixed-pitch");
636 BOOL_VAR_H(save_doc_words, 0,
"Save Document Words");
637 double_VAR_H(doc_dict_pending_threshold, 0.0,
638 "Worst certainty for using pending dictionary");
639 double_VAR_H(doc_dict_certainty_threshold, -2.25,
"Worst certainty" 640 " for words that can be inserted into the document dictionary");
641 INT_VAR_H(max_permuter_attempts, 10000,
"Maximum number of different" 642 " character choices to consider during permutation." 643 " This limit is especially useful when user patterns" 644 " are specified, since overly generic patterns can result in" 645 " dawg search exploring an overly large number of options.");
649 #endif // TESSERACT_DICT_DICT_H_
FILE * output_ambig_words_file_
Definition: dict.h:555
DawgCache * dawg_cache_
Definition: dict.h:531
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
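Returns true if perm is one of the dictionary permuter types (system, frequent-word, document, user, user-pattern or compound), or NUMBER_PERM when numbers_ok is set.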
Definition: dict.h:459
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:419
Dawg * freq_dawg_
Definition: dict.h:547
static const int kDocDictMaxRepChars
Definition: dict.h:67
UNICHAR_ID wildcard_unichar_id_
Definition: dict.h:516
Dawg * unambig_dawg_
Definition: dict.h:548
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:386
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:375
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:415
static const char kQuestionSymbol[]
Definition: dict.h:62
UnicharAmbigs * dang_ambigs_table_
Definition: dict.h:510
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:473
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:109
DawgPositionVector hyphen_active_dawgs_
Definition: dict.h:523
static const char kSlashSymbol[]
Definition: dict.h:61
Definition: unicharset.h:146
UNICHAR_ID apostrophe_unichar_id_
Definition: dict.h:517
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
static const float kSimilarityFloor
Definition: dict.h:66
bool last_word_on_line_
Definition: dict.h:524
void ResetDocumentDictionary()
Definition: dict.h:311
DawgType type() const
Definition: dawg.h:128
static const int kRatingPad
Definition: dict.h:55
float rating
Definition: dict.h:44
Definition: baseapi.cpp:94
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
DawgPositionVector * updated_dawgs
Definition: dict.h:82
float certainty
Definition: dict.h:45
int num_fragments
Definition: dict.h:43
static const char kApostropheSymbol[]
Definition: dict.h:63
UNICHARSET & getUnicharset()
Definition: dict.h:101
UNICHAR_ID slash_unichar_id_
Definition: dict.h:519
const CCUtil * getCCUtil() const
Definition: dict.h:92
static const float kSimCertaintyScale
Definition: dict.h:64
PermuterType permuter
Definition: dict.h:83
int length() const
Definition: ratngs.h:303
CCUtil * getCCUtil()
Definition: dict.h:95
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:131
WERD_CHOICE * hyphen_word_
Definition: dict.h:522
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:469
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:118
Definition: tessdatamanager.h:126
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423
UNICHAR_ID question_unichar_id_
Definition: dict.h:518
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:412
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:417
bool dawg_cache_is_ours_
Definition: dict.h:532
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:433
UnicharAmbigs * replace_ambigs_table_
Definition: dict.h:512
Dawg * punc_dawg_
Definition: dict.h:549
Trie * document_words_
Definition: dict.h:550
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:171
DawgVector dawgs_
Definition: dict.h:534
static const int kDictMaxWildcards
Definition: dict.h:57
GenericVector< GenericVectorEqEq< UNICHAR_ID > > equivalent_symbols_
Definition: dict.h:529
static const char kDictWildcard[]
Definition: dict.h:56
UNICHAR_ID hyphen_unichar_id_
Definition: dict.h:520
Dawg * bigram_dawg_
Definition: dict.h:544
CCUtil * ccutil_
Definition: dict.h:503
Definition: pageres.h:169
float CallParamsModelClassify(void *path)
Definition: dict.h:406
int size() const
Definition: genericvector.h:71
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:466
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:137
static const float kSimCertaintyOffset
Definition: dict.h:65
SuccessorListsVector successors_
Definition: dict.h:535
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
UNICHAR_ID WildcardID() const
Definition: dict.h:413
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:495
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:361
XHeightConsistencyEnum
Definition: dict.h:75
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void print() const
Definition: ratngs.h:580
bool valid_end
Definition: dict.h:85
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:421
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
Definition: unicharset.h:49
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:78
static const char kHyphenSymbol[]
Definition: dict.h:60
DawgPositionVector * active_dawgs
Definition: dict.h:81
Definition: dawg_cache.h:30
float rating() const
Definition: ratngs.h:327
float reject_offset_
Definition: dict.h:514
Trie * pending_words_
Definition: dict.h:536
UNICHAR_ID unichar_id
Definition: dict.h:41
float wordseg_rating_adjust_factor_
Definition: dict.h:553
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:152