19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H_ 20 #define TESSERACT_CLASSIFY_CLASSIFY_H_ 24 #include "config_auto.h" 28 #ifdef DISABLED_LEGACY_ENGINE 35 class Classify :
public CCStruct {
45 INT_VAR_H(classify_debug_level, 0,
"Classify debug level");
48 "Assume the input is numbers [0-9].");
51 "Veto ratio between classifier ratings");
54 "Veto difference between classifier certainties");
63 #else // DISABLED_LEGACY_ENGINE not defined 70 #include "imagedata.h" 72 #include "intmatcher.h" 75 #include "ocrfeatures.h" 76 #include "unicity_table.h" 84 static const int kUnknownFontinfoId = -1;
85 static const int kBlankFontinfoId = -2;
89 class ShapeClassifier;
145 const uint8_t* normalization_factors,
146 const uint16_t* expected_num_features,
179 void LearnPieces(
const char* fontname,
int start,
int length,
float threshold,
193 UNICHAR_ID *ambiguities,
196 int16_t num_features,
198 const uint8_t* norm_factors,
201 int matcher_multiplier,
202 const TBOX& blob_box,
216 int matcher_multiplier,
217 const uint8_t* cn_factors,
224 double im_rating,
int feature_misses,
226 int blob_length,
int matcher_multiplier,
227 const uint8_t* cn_factors);
230 BLOB_CHOICE_LIST *Choices);
233 INT_FEATURE_ARRAY IntFeatures,
236 #ifndef GRAPHICS_DISABLED 242 FEATURE_ID BadFeat[],
250 INT_FEATURE_ARRAY Features,
266 int class_id,
int config_id)
const;
278 int int_result_config)
const;
312 uint8_t* pruner_norm_array,
313 uint8_t* char_norm_array);
319 uint8_t* char_norm_array,
320 uint8_t* pruner_array);
331 int y_offset,
const TBOX &wbox);
376 uint8_t* char_norm_array);
383 bool* pretrained_on,
int* shape_id);
423 BOOL_VAR_H(allow_blob_division,
true,
"Use divisible blobs chopping");
428 "Prioritize blob division over chopping");
429 INT_VAR_H(tessedit_single_match, FALSE,
"Top choice only from CP");
430 BOOL_VAR_H(classify_enable_learning,
true,
"Enable adaptive classifier");
431 INT_VAR_H(classify_debug_level, 0,
"Classify debug level");
435 INT_VAR_H(classify_norm_method, character,
"Normalization Method ...");
437 "Character Normalization Range ...");
438 double_VAR_H(classify_min_norm_scale_x, 0.0,
"Min char x-norm scale ...");
439 double_VAR_H(classify_max_norm_scale_x, 0.325,
"Max char x-norm scale ...");
440 double_VAR_H(classify_min_norm_scale_y, 0.0,
"Min char y-norm scale ...");
441 double_VAR_H(classify_max_norm_scale_y, 0.325,
"Max char y-norm scale ...");
443 "Veto ratio between classifier ratings");
445 "Veto difference between classifier certainties");
448 BOOL_VAR_H(tess_cn_matching, 0,
"Character Normalized Matching");
449 BOOL_VAR_H(tess_bn_matching, 0,
"Baseline Normalized Matching");
450 BOOL_VAR_H(classify_enable_adaptive_matcher, 1,
"Enable adaptive classifier");
451 BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
452 "Use pre-adapted classifier templates");
453 BOOL_VAR_H(classify_save_adapted_templates, 0,
454 "Save adapted templates to a file");
455 BOOL_VAR_H(classify_enable_adaptive_debugger, 0,
"Enable match debugger");
457 "Non-linear stroke-density normalization");
458 INT_VAR_H(matcher_debug_level, 0,
"Matcher Debug Level");
459 INT_VAR_H(matcher_debug_flags, 0,
"Matcher Debug Flags");
460 INT_VAR_H(classify_learning_debug_level, 0,
"Learning Debug Level: ");
461 double_VAR_H(matcher_good_threshold, 0.125,
"Good Match (0-1)");
462 double_VAR_H(matcher_reliable_adaptive_result, 0.0,
"Great Match (0-1)");
463 double_VAR_H(matcher_perfect_threshold, 0.02,
"Perfect Match (0-1)");
464 double_VAR_H(matcher_bad_match_pad, 0.15,
"Bad Match Pad (0-1)");
465 double_VAR_H(matcher_rating_margin, 0.1,
"New template margin (0-1)");
466 double_VAR_H(matcher_avg_noise_size, 12.0,
"Avg. noise blob length: ");
467 INT_VAR_H(matcher_permanent_classes_min, 1,
"Min # of permanent classes");
468 INT_VAR_H(matcher_min_examples_for_prototyping, 3,
469 "Reliable Config Threshold");
470 INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
471 "Enable adaption even if the ambiguities have not been seen");
473 "Maximum angle delta for prototype clustering");
475 "Penalty to apply when a non-alnum is vertically out of " 476 "its expected textline position");
477 double_VAR_H(rating_scale, 1.5,
"Rating scaling factor");
478 double_VAR_H(certainty_scale, 20.0,
"Certainty scaling factor");
480 "Scale factor for features not used");
482 "Prune poor adapted results this much worse than best result");
484 "Threshold at which classify_adapted_pruning_factor starts");
485 INT_VAR_H(classify_adapt_proto_threshold, 230,
486 "Threshold for good protos during adaptive 0-255");
487 INT_VAR_H(classify_adapt_feature_threshold, 230,
488 "Threshold for good features during adaptive 0-255");
490 "Do not include character fragments in the" 491 " results of the classifier");
492 double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0,
493 "Exclude fragments that do not match any whole character" 494 " with at least this certainty");
495 BOOL_VAR_H(classify_debug_character_fragments, FALSE,
496 "Bring up graphical debugging windows for fragments training");
497 BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
498 "Use two different windows for debugging the matching: " 499 "One for the protos and one for the features.");
500 STRING_VAR_H(classify_learn_debug_str,
"",
"Class str to debug learning");
503 INT_VAR_H(classify_class_pruner_threshold, 229,
504 "Class Pruner Threshold 0-255");
505 INT_VAR_H(classify_class_pruner_multiplier, 15,
506 "Class Pruner Multiplier 0-255: ");
507 INT_VAR_H(classify_cp_cutoff_strength, 7,
508 "Class Pruner CutoffStrength: ");
509 INT_VAR_H(classify_integer_matcher_multiplier, 10,
510 "Integer Matcher Multiplier 0-255: ");
539 INT_VAR_H(il1_adaption_test, 0,
"Don't adapt to i/I at beginning of word");
541 "Assume the input is numbers [0-9].");
542 double_VAR_H(speckle_large_max_size, 0.30,
"Max large speckle size");
544 "Penalty to add to worst rating for noise");
583 #endif // DISABLED_LEGACY_ENGINE 585 #endif // TESSERACT_CLASSIFY_CLASSIFY_H_ double_VAR_H(classify_char_norm_range, 0.2, "Character Normalization Range ...")
void ReadNewCutoffs(TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:46
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
Definition: adaptmatch.cpp:2199
Definition: classify.h:99
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:444
bool LooksLikeGarbage(TBLOB *blob)
Definition: adaptmatch.cpp:1637
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
Definition: adaptmatch.cpp:2278
BIT_VECTOR AllProtosOn
Definition: classify.h:521
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:513
Definition: adaptive.h:39
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
Definition: adaptmatch.cpp:2241
int NumAdaptationsFailed
Definition: classify.h:561
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
Definition: adaptmatch.cpp:1839
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
uint8_t NumPermClasses
Definition: adaptive.h:78
BIT_VECTOR TempProtoMask
Definition: classify.h:524
Classify()
Definition: classify.cpp:60
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1030
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
Definition: adaptmatch.cpp:1333
GenericVector< uint16_t > shapetable_cutoffs_
Definition: classify.h:576
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
Definition: adaptmatch.cpp:1206
Definition: intproto.h:118
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:235
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:728
virtual Dict & getDict()
Definition: classify.h:107
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1269
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
Definition: adaptmatch.cpp:1315
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:333
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:256
const ShapeTable * shape_table() const
Definition: classify.h:111
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:496
ScrollView * learn_fragments_debug_win_
Definition: classify.h:579
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:83
void SetStaticClassifier(ShapeClassifier *static_classifier)
Definition: classify.cpp:225
Definition: classify.h:103
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
void SettupPass2()
Definition: adaptmatch.cpp:670
Definition: unicharset.h:146
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP")
Definition: classify.h:98
uint16_t BaselineCutoffs[MAX_NUM_CLASSES]
Definition: classify.h:575
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:192
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:599
Definition: intmatcher.h:83
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
Definition: adaptmatch.cpp:1682
void SetAdaptiveThreshold(float Threshold)
Definition: adaptmatch.cpp:2146
Definition: serialis.h:77
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:42
Definition: adaptive.h:74
Definition: baseapi.cpp:94
void ClassifyAsNoise(ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1403
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:548
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:694
UnicityTable< FontSet > & get_fontset_table()
Definition: classify.h:392
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
Definition: adaptmatch.cpp:1925
Definition: classify.h:97
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:950
void FreeNormProtos()
Definition: normmatch.cpp:157
Definition: ocrfeatures.h:60
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:245
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:375
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
Definition: adaptmatch.cpp:1132
Definition: classify.h:100
Definition: ccstruct.h:25
ScrollView * learn_debug_win_
Definition: classify.h:577
uint16_t CharNormCutoffs[MAX_NUM_CLASSES]
Definition: classify.h:574
Definition: intproto.h:105
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
Definition: adaptmatch.cpp:1092
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:249
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:857
bool EnableLearning
Definition: classify.h:525
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:535
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
Definition: adaptmatch.cpp:1744
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1501
ShapeClassifier * static_classifier_
Definition: classify.h:558
NORM_PROTOS * NormProtos
Definition: classify.h:527
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:233
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:102
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:227
Definition: adaptive.h:62
Definition: fontinfo.h:30
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:219
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
Definition: normmatch.cpp:35
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:284
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:460
Definition: featdefs.h:46
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:1417
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
Definition: adaptmatch.cpp:1702
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
Definition: tessdatamanager.h:126
STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning")
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:174
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:2098
STRING tr_file_data_
Definition: classify.h:564
IntegerMatcher im_
Definition: classify.h:547
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
Definition: shapetable.h:262
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
Definition: adaptmatch.cpp:2018
CharSegmentationType
Definition: classify.h:96
Definition: normalis.h:50
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
Definition: adaptmatch.cpp:1596
Definition: pageres.h:169
Definition: shapeclassifier.h:43
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
Dict dict_
Definition: classify.h:556
Definition: ocrfeatures.h:66
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:998
Definition: oldlist.h:124
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:787
void SettupPass1()
Definition: adaptmatch.cpp:653
virtual ~Classify()
Definition: classify.cpp:215
Definition: intproto.h:132
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:251
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
Definition: adaptmatch.cpp:2164
void ShowMatchDisplay()
Definition: intproto.cpp:973
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:454
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:823
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
Definition: adaptmatch.cpp:2212
void RemoveBadMatches(ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:2038
Definition: trainingsample.h:53
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1534
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:528
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:152
ShapeTable * shape_table_
Definition: classify.h:553
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:74
Definition: adaptmatch.cpp:92
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:1049
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
Definition: shapetable.h:41
int ShapeIDToClassID(int shape_id) const
Definition: adaptmatch.cpp:2225
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:131
ScrollView * learn_fragmented_word_debug_win_
Definition: classify.h:578
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:64
const UnicityTable< FontInfo > & get_fontinfo_table() const
Definition: classify.h:389
BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping")
CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
Definition: intproto.cpp:1274