22 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 23 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 26 #include "associate.h" 29 #include "lm_consistency.h" 32 #include "params_model.h" 78 bool fixed_pitch,
float max_char_wh_ratio,
79 float rating_cert_scale);
95 int curr_col,
int curr_row,
96 BLOB_CHOICE_LIST *curr_list,
114 if (language_model_use_sigmoidal_certainty) {
118 cert = -cert /
dict_->certainty_scale;
119 return 1.0f / (1.0f + exp(10.0f * cert));
121 return (-1.0f / cert);
126 if (num_problems == 0)
return 0.0f;
127 if (num_problems == 1)
return penalty;
128 return (penalty + (language_model_penalty_increment *
129 static_cast<float>(num_problems-1)));
139 if (dawg_info !=
nullptr) {
141 language_model_penalty_case) +
143 language_model_penalty_script : 0.0f);
146 language_model_penalty_punc) +
148 language_model_penalty_case) +
150 language_model_penalty_chartype) +
152 language_model_penalty_spacing) +
154 language_model_penalty_script : 0.0f) +
156 language_model_penalty_font : 0.0f));
184 bool just_classified,
bool mixed_alnum,
187 ViterbiStateEntry_IT* vse_it,
215 int curr_col,
int curr_row,
227 const char *unichar,
float certainty,
float denom,
228 int curr_col,
int curr_row,
float outline_length,
239 const char *context,
int *unichar_step_len,
240 bool *found_small_prob,
float *ngram_prob);
282 float max_char_wh_ratio,
289 (parent_vse !=
nullptr) ? parent_vse->
length : 0,
291 word_res, language_model_debug_level > 2, associate_stats);
317 INT_VAR_H(language_model_debug_level, 0,
"Language model debug level");
319 "Turn on/off the use of character ngram model");
321 "Maximum order of the character ngram model");
322 INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10,
323 "Maximum number of prunable (those for which PrunablePath() is" 324 " true) entries in each viterbi list recorded in BLOB_CHOICEs");
325 INT_VAR_H(language_model_viterbi_list_max_size, 500,
326 "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
328 "To avoid overly small denominators use this as the floor" 329 " of the probability returned by the ngram model");
330 double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
331 "Average classifier score of a non-matching unichar");
332 BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step,
false,
333 "Use only the first UTF8 step of the given string" 334 " when computing log probabilities");
336 "Strength of the character ngram model relative to the" 337 " character classifier ");
339 "Factor to bring log-probs into the same range as ratings" 340 " when multiplied by outline length ");
341 BOOL_VAR_H(language_model_ngram_space_delimited_language,
true,
342 "Words are delimited by space");
343 INT_VAR_H(language_model_min_compound_length, 3,
344 "Minimum length of compound words");
346 double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
347 "Penalty for words not in the frequent word dictionary");
348 double_VAR_H(language_model_penalty_non_dict_word, 0.15,
349 "Penalty for non-dictionary words");
351 "Penalty for inconsistent punctuation");
353 "Penalty for inconsistent case");
355 "Penalty for inconsistent script");
357 "Penalty for inconsistent character type");
359 "Penalty for inconsistent font");
361 "Penalty for inconsistent spacing");
362 double_VAR_H(language_model_penalty_increment, 0.01,
"Penalty increment");
363 INT_VAR_H(wordrec_display_segmentations, 0,
"Display Segmentations");
364 BOOL_VAR_H(language_model_use_sigmoidal_certainty,
false,
365 "Use sigmoidal score for certainty");
427 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_ Definition: lm_state.h:63
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:184
static const LanguageModelFlagsType kDigitFlag
Definition: language_model.h:57
PermuterType permuter
Definition: lm_state.h:67
Definition: lm_pain_points.h:57
static const LanguageModelFlagsType kSmallestRatingFlag
Definition: language_model.h:54
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
Definition: language_model.cpp:1018
bool acceptable_choice_found_
Definition: language_model.h:417
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:34
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
Definition: language_model.cpp:1341
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:565
void SetAcceptableChoiceFound(bool val)
Definition: language_model.h:105
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
Definition: language_model.cpp:54
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
Definition: language_model.cpp:937
float max_char_wh_ratio_
Definition: language_model.h:394
float rating_cert_scale_
Definition: language_model.h:375
int NumInconsistentChartype() const
Definition: lm_consistency.h:91
Definition: lm_state.h:93
bool inconsistent_font
Definition: lm_consistency.h:130
static const LanguageModelFlagsType kXhtConsistentFlag
Definition: language_model.h:58
Struct to store information maintained by various language model components.
Definition: lm_state.h:195
Definition: unicharset.h:146
bool Consistent() const
Definition: lm_state.h:135
float CertaintyScore(float cert)
Definition: language_model.h:113
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
Definition: language_model.cpp:387
Definition: lm_consistency.h:39
Definition: baseapi.cpp:94
bool AcceptablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:310
int length
Definition: lm_state.h:169
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
Definition: language_model.cpp:504
bool inconsistent_script
Definition: lm_consistency.h:128
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:176
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Definition: language_model.h:281
INT_VAR_H(language_model_debug_level, 0, "Language model debug level")
DawgPositionVector very_beginning_active_dawgs_
Definition: language_model.h:404
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:880
const UnicityTable< FontInfo > * fontinfo_table_
Definition: language_model.h:380
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:217
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:1241
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:789
int NumInconsistentPunc() const
Definition: lm_consistency.h:85
Definition: fontinfo.h:30
bool pruned
Definition: lm_state.h:84
int NumInconsistentCase() const
Definition: lm_consistency.h:88
float ComputeAdjustment(int num_problems, float penalty)
Definition: language_model.h:125
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
Definition: language_model.cpp:997
DawgPositionVector beginning_active_dawgs_
Definition: language_model.h:405
AssociateStats associate_stats
Definition: lm_state.h:172
bool fixed_pitch_
Definition: language_model.h:391
DawgArgs dawg_args_
Definition: language_model.h:373
Dict * dict_
Definition: language_model.h:384
BOOL_VAR_H(language_model_ngram_on, false, "Turn on/off the use of character ngram model")
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
Definition: language_model.cpp:773
~LanguageModel()
Definition: language_model.cpp:138
static const float kMaxAvgNgramCost
Definition: language_model.h:62
Definition: pageres.h:169
Definition: params_model.h:32
STRING prev_word_str_
Definition: language_model.h:401
static const LanguageModelFlagsType kUpperCaseFlag
Definition: language_model.h:56
bool correct_segmentation_explored_
Definition: language_model.h:419
Definition: language_model.h:51
int prev_word_unichar_step_len_
Definition: language_model.h:402
double_VAR_H(language_model_ngram_small_prob, 0.000001, "To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model")
bool AcceptableChoiceFound()
Definition: language_model.h:104
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
Definition: language_model.h:136
ParamsModel & getParamsModel()
Definition: language_model.h:109
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
Definition: language_model.cpp:140
bool PrunablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:300
static const LanguageModelFlagsType kLowerCaseFlag
Definition: language_model.h:55
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
Definition: language_model.cpp:1199
ParamsModel params_model_
Definition: language_model.h:422
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
Definition: language_model.cpp:427
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:39
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:257
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: language_model.cpp:1390
int NumInconsistentSpaces() const
Definition: lm_consistency.h:100
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:180
Definition: lm_state.h:72
Definition: associate.h:36