doxygen/tesseract/dict_8h_source.html

 // File:        dict.h
 // Description: dict class.
 // Author:      Samuel Charron
 //
 // (C) Copyright 2006, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 #ifndef TESSERACT_DICT_DICT_H_
 #define TESSERACT_DICT_DICT_H_

 #include "ambigs.h"
 #include "dawg.h"
 #include "dawg_cache.h"
 #include "host.h"
 #include "ratngs.h"
 #include "stopper.h"
 #include "trie.h"
 #include "unicharset.h"
 #include "params_training_featdef.h"

 class MATRIX;
 class WERD_RES;

 // Fixed limits and sentinel values.
 // Each replacement list that is more than a single token is wrapped in
 // parentheses so the macro always expands to one primary expression; the
 // previous unparenthesized cast made e.g. `sizeof MAX_WERD_LENGTH` ill-formed.
 #define CHARS_PER_LINE 500
 #define MAX_WERD_LENGTH (static_cast<int64_t>(128))
 #define NO_RATING (-1)

 // Plain data record describing the fragment state of one character.
 // Within this header it is only ever passed by pointer (see the
 // prev_char_frag_info / char_frag_info parameters of the permuter helpers
 // declared in Dict).
 struct CHAR_FRAGMENT_INFO {
   UNICHAR_ID unichar_id;
   // NOTE(review): raw pointer to const; presumably non-owning — confirm
   // against the code that fills this struct in.
   const CHAR_FRAGMENT *fragment;
   int num_fragments;
   float rating;
   float certainty;
 };

 namespace tesseract {

 using DawgVector = GenericVector<Dawg *>;

 //
 // Constants
 //

 // Integer limits.
 static const int kRatingPad = 4;
 static const int kDictMaxWildcards = 2;  // max wildcards for a word
 static const int kDocDictMaxRepChars = 4;

 // Symbols with a special meaning for the dictionary.
 static const char kDictWildcard[] = "\u2606";  // WHITE STAR
 // TODO(daria): If hyphens are different in different languages and can be
 // inferred from training data we should load their values dynamically.
 static const char kHyphenSymbol[] = "-";
 static const char kSlashSymbol[] = "/";
 static const char kQuestionSymbol[] = "?";
 static const char kApostropheSymbol[] = "'";

 // Similarity matcher tuning.
 static const float kSimCertaintyScale = -10.0f;   // similarity matcher scaling
 static const float kSimCertaintyOffset = -10.0f;  // similarity matcher offset
 static const float kSimilarityFloor = 100.0f;  // worst E*L product to stop on

 // Describes how consistent the x-height of a word is.
 enum XHeightConsistencyEnum {
   // Everything is good.
   XH_GOOD = 0,
   // There are one or two secondary (but consistent) baselines
   // [think subscript and superscript], or there is an oversized
   // first character.
   XH_SUBNORMAL = 1,
   // The word is inconsistent.
   XH_INCONSISTENT = 2
 };

 // Bundle of dawg-search state: the two position vectors, the permuter type
 // and a flag telling whether the current position ends a valid word.
 struct DawgArgs {
   // Stores the three arguments as given; valid_end starts out false.
   DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
       : active_dawgs(d), updated_dawgs(up), permuter(p) {}

   DawgPositionVector *active_dawgs;
   DawgPositionVector *updated_dawgs;
   PermuterType permuter;
   // True if the current position is a valid word end.
   bool valid_end = false;
 };

 class Dict {
  public:
   Dict(CCUtil* image_ptr);
   ~Dict();
   // Access to the ccutil_ member, in const and mutable flavours.
   const CCUtil* getCCUtil() const { return ccutil_; }
   CCUtil* getCCUtil() { return ccutil_; }
   // The unicharset member of that CCUtil.
   const UNICHARSET& getUnicharset() const { return ccutil_->unicharset; }
   UNICHARSET& getUnicharset() { return ccutil_->unicharset; }
   // The unichar_ambigs member of that CCUtil (read-only access).
   const UnicharAmbigs &getUnicharAmbigs() const {
     return ccutil_->unichar_ambigs;
   }

   // Returns true if unichar_id is a word compounding character like - or /.
   // The id qualifies when it normalizes to exactly one unichar id and that
   // id equals the cached hyphen or slash id.
   // Declared const because it only reads state, matching has_hyphen_end(),
   // so it can also be called through a const Dict.
   inline bool compound_marker(UNICHAR_ID unichar_id) const {
     const GenericVector<UNICHAR_ID>& normed_ids =
         getUnicharset().normed_ids(unichar_id);
     if (normed_ids.size() != 1) return false;
     return normed_ids[0] == hyphen_unichar_id_ ||
            normed_ids[0] == slash_unichar_id_;
   }
   // Returns true if unichar_id is an apostrophe-like character that may
   // separate prefix/suffix words from a main body word.
   // The id qualifies when it normalizes to exactly one unichar id and that
   // id equals the cached apostrophe id.
   // Declared const because it only reads state, matching has_hyphen_end(),
   // so it can also be called through a const Dict.
   inline bool is_apostrophe(UNICHAR_ID unichar_id) const {
     const GenericVector<UNICHAR_ID>& normed_ids =
         getUnicharset().normed_ids(unichar_id);
     return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
   }

   /* hyphen.cpp ************************************************************/

   // Returns true if we've recorded the beginning of a hyphenated word, i.e.
   // a hyphen word is stored and the current word is not the last on its line.
   inline bool hyphenated() const {
     return hyphen_word_ != nullptr && !last_word_on_line_;
   }
   inline int hyphen_base_size() const {
     return this->hyphenated() ? hyphen_word_->length() : 0;
   }
   // If a hyphenated word is in progress, overwrites *word with the stored
   // hyphen word (and prints it when hyphen_debug_level is set). Otherwise
   // leaves *word untouched.
   inline void copy_hyphen_info(WERD_CHOICE *word) const {
     if (!hyphenated()) {
       return;
     }
     *word = *hyphen_word_;
     if (hyphen_debug_level) {
       word->print("copy_hyphen_info: ");
     }
   }
   // Check whether the word has a hyphen at the end.
   // Only the last word on a line can qualify, and never via its first
   // character; beyond that, unichar_id must normalize to exactly the cached
   // hyphen id.
   inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
     const bool eligible = last_word_on_line_ && !first_pos;
     if (!eligible) {
       return false;
     }
     const GenericVector<UNICHAR_ID>& normed =
         getUnicharset().normed_ids(unichar_id);
     if (normed.size() != 1) {
       return false;
     }
     return normed[0] == hyphen_unichar_id_;
   }
   // Same check as the (UNICHAR_ID, bool) overload, applied to the last
   // unichar of word.
   // An empty word has no last unichar; it is reported as "no hyphen end"
   // instead of indexing the word at -1.
   inline bool has_hyphen_end(const WERD_CHOICE &word) const {
     const int last_index = word.length() - 1;
     if (last_index < 0) return false;
     return has_hyphen_end(word.unichar_id(last_index), last_index == 0);
   }
   void reset_hyphen_vars(bool last_word_on_line);
   void set_hyphen_word(const WERD_CHOICE &word,
                        const DawgPositionVector &active_dawgs);

   /* permdawg.cpp ************************************************************/
   // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
   // When this function is refactored, permdawg.cpp can be removed.

   // Copies word into *best_choice when word has the strictly lower rating;
   // otherwise *best_choice is left as it is.
   inline void update_best_choice(const WERD_CHOICE &word,
                                  WERD_CHOICE *best_choice) {
     const bool improves = word.rating() < best_choice->rating();
     if (!improves) {
       return;
     }
     *best_choice = word;
   }
   void init_active_dawgs(DawgPositionVector *active_dawgs,
                          bool ambigs_mode) const;
   // Fill the given vector with the default collection of any-length dawgs
   void default_dawgs(DawgPositionVector *anylength_dawgs,
                                bool suppress_patterns) const;


   WERD_CHOICE *dawg_permute_and_select(
       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
   void go_deeper_dawg_fxn(
       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       bool word_ending, WERD_CHOICE *word, float certainties[],
       float *limit, WERD_CHOICE *best_choice, int *attempts_left,
       void *void_more_args);

   void (Dict::*go_deeper_fxn_)(const char *debug,
                                const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                int char_choice_index,
                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                                bool word_ending, WERD_CHOICE *word,
                                float certainties[], float *limit,
                                WERD_CHOICE *best_choice, int *attempts_left,
                                void *void_more_args);
   //
   // Helper functions for dawg_permute_and_select().
   //
   void permute_choices(
       const char *debug,
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       WERD_CHOICE *word,
       float certainties[],
       float *limit,
       WERD_CHOICE *best_choice,
       int *attempts_left,
       void *more_args);

   void append_choices(
       const char *debug,
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       const BLOB_CHOICE &blob_choice,
       int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       WERD_CHOICE *word,
       float certainties[],
       float *limit,
       WERD_CHOICE *best_choice,
       int *attempts_left,
       void *more_args);

     bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
                              float curr_rating, float curr_certainty,
                              const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                              const char *debug, int word_ending,
                              CHAR_FRAGMENT_INFO *char_frag_info);

   /* stopper.cpp *************************************************************/
   bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
                         DANGERR *fixpt,
                         bool fix_replaceable,
                         MATRIX* ratings);
   // Replaces the corresponding wrong ngram in werd_choice with the correct
   // one. The whole correct n-gram is inserted into the ratings matrix and
   // the werd_choice: no more fragments!. Rating and certainty of new entries
   // in matrix and werd_choice are the sum and mean of the wrong ngram
   // respectively.
   // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
   // mystring", with a new entry in the ratings matrix for ".
   void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                     UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                     MATRIX *ratings);

   int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
   int UniformCertainties(const WERD_CHOICE& word);
   bool AcceptableChoice(const WERD_CHOICE& best_choice,
                         XHeightConsistencyEnum xheight_consistency);
   bool AcceptableResult(WERD_RES *word) const;
   void EndDangerousAmbigs();
   void DebugWordChoices();
   void SettupStopperPass1();
   void SettupStopperPass2();
   /* context.cpp *************************************************************/
   int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const;
   bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);

   /* dict.cpp ****************************************************************/

   static DawgCache *GlobalDawgCache();
   // Sets up ready for a Load or LoadLSTM.
   void SetupForLoad(DawgCache *dawg_cache);
   // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
   void Load(const STRING &lang, TessdataManager *data_file);
   // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
   void LoadLSTM(const STRING &lang, TessdataManager *data_file);
   // Completes the loading process after Load() and/or LoadLSTM().
   // Returns false if no dictionaries were loaded.
   bool FinishLoad();
   void End();

   // Resets the document dictionary analogous to ResetAdaptiveClassifier.
   // Clears the pending-words and document-words tries, skipping whichever
   // of the two has not been allocated.
   void ResetDocumentDictionary() {
     if (pending_words_) {
       pending_words_->clear();
     }
     if (document_words_) {
       document_words_->clear();
     }
   }

   //
   int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
                          UNICHAR_ID unichar_id, bool word_end) const;

   int (Dict::*letter_is_okay_)(void* void_dawg_args,
                                const UNICHARSET& unicharset,
                                UNICHAR_ID unichar_id, bool word_end) const;
   // Calls letter_is_okay_ member function, forwarding every argument
   // unchanged and returning its result.
   int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
                    UNICHAR_ID unichar_id, bool word_end) const {
     const int verdict = (this->*letter_is_okay_)(
         void_dawg_args, unicharset, unichar_id, word_end);
     return verdict;
   }


   double (Dict::*probability_in_context_)(const char* lang,
                                           const char* context,
                                           int context_bytes,
                                           const char* character,
                                           int character_bytes);
   // Calls probability_in_context_ member function, supplying the language
   // string held by the CCUtil object as its first argument.
   double ProbabilityInContext(const char* context,
                               int context_bytes,
                               const char* character,
                               int character_bytes) {
     const char* const lang_name = getCCUtil()->lang.string();
     return (this->*probability_in_context_)(lang_name, context, context_bytes,
                                             character, character_bytes);
   }

   // Default (no-op) implementation of probability in context function.
   // Ignores all of its arguments and always returns 0.0.
   double def_probability_in_context(
       const char* lang, const char* context, int context_bytes,
       const char* character, int character_bytes) {
     // Mark every parameter as intentionally unused.
     static_cast<void>(lang);
     static_cast<void>(context);
     static_cast<void>(context_bytes);
     static_cast<void>(character);
     static_cast<void>(character_bytes);
     return 0.0;
   }
   double ngram_probability_in_context(const char* lang,
                                       const char* context,
                                       int context_bytes,
                                       const char* character,
                                       int character_bytes);

   // Interface with params model.
   float (Dict::*params_model_classify_)(const char *lang, void *path);
   float ParamsModelClassify(const char *lang, void *path);
   // Call params_model_classify_ member function.
   // Calls the params_model_classify_ member function with the language
   // string held by the CCUtil object and the given path. The function
   // pointer must have been set: this is checked with ASSERT_HOST.
   float CallParamsModelClassify(void *path) {
     ASSERT_HOST(params_model_classify_ != nullptr);
     const char *const lang_name = getCCUtil()->lang.string();
     return (this->*params_model_classify_)(lang_name, path);
   }

   // Stores id as the wildcard unichar id.
   inline void SetWildcardID(UNICHAR_ID id) {
     wildcard_unichar_id_ = id;
   }
   // Returns the stored wildcard unichar id.
   inline UNICHAR_ID WildcardID() const {
     return wildcard_unichar_id_;
   }
   // Returns the number of dawgs in the dawgs_ vector.
   inline int NumDawgs() const {
     return dawgs_.size();
   }
   // Returns the index-th dawg pointer recorded in the dawgs_ vector.
   inline const Dawg *GetDawg(int index) const {
     return dawgs_[index];
   }
   // Returns the pointer to the punctuation dawg.
   inline const Dawg *GetPuncDawg() const {
     return punc_dawg_;
   }
   // Returns the pointer to the unambiguous words dawg.
   inline const Dawg *GetUnambigDawg() const {
     return unambig_dawg_;
   }
   // Returns the appropriate next node given the EDGE_REF:
   //  - 0 when edge_ref is NO_EDGE (beginning to explore the dawg);
   //  - NO_EDGE when the edge leads to node 0 (end of word);
   //  - otherwise the node that dawg reports for edge_ref.
   static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
     if (edge_ref == NO_EDGE) {
       return 0;
     }
     const NODE_REF next = dawg->next_node(edge_ref);
     if (next != 0) {
       return next;
     }
     return NO_EDGE;
   }

   // Maps a unichar taken from a string to the unichar that should be used
   // for matching in the given dawg. For a number dawg every digit becomes
   // Dawg::kPatternUnicharID; in all other cases (including a null dawg) ch
   // is returned unchanged.
   UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
                            const Dawg *dawg) const {
     if (dawg == nullptr) {
       return ch;
     }
     if (dawg->type() == DAWG_TYPE_NUMBER) {
       return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
     }
     return ch;
   }

   void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
                            UNICHAR_ID unichar_id, bool word_end,
                            DawgArgs *dawg_args,
                            PermuterType *current_permuter) const;


   // Returns true if perm is one of the permuter codes accepted as a valid
   // word: the system, frequent, document, user, user-pattern and compound
   // codes always count; NUMBER_PERM counts only when numbers_ok is set.
   inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
     if (numbers_ok && perm == NUMBER_PERM) {
       return true;
     }
     const bool from_word_dawg =
         perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
         perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM;
     const bool from_pattern_or_compound =
         perm == USER_PATTERN_PERM || perm == COMPOUND_PERM;
     return from_word_dawg || from_pattern_or_compound;
   }
   int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
   int valid_word(const WERD_CHOICE &word) const {
     return valid_word(word, false);  // return NO_PERM for words with digits
   }
   int valid_word_or_number(const WERD_CHOICE &word) const {
     return valid_word(word, true);  // return NUMBER_PERM for valid numbers
   }
   int valid_word(const char *string) const {
     WERD_CHOICE word(string, getUnicharset());
     return valid_word(word);
   }
   // Do the two WERD_CHOICEs form a meaningful bigram?
   bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
   bool valid_punctuation(const WERD_CHOICE &word);
   int good_choice(const WERD_CHOICE &choice);
   void add_document_word(const WERD_CHOICE &best_choice);
   void adjust_word(WERD_CHOICE *word,
                    bool nonword, XHeightConsistencyEnum xheight_consistency,
                    float additional_adjust,
                    bool modify_rating,
                    bool debug);
   // Set wordseg_rating_adjust_factor_ to the given value.
   inline void SetWordsegRatingAdjustFactor(float f) { wordseg_rating_adjust_factor_ = f; }
   bool IsSpaceDelimitedLang() const;

  private:
   CCUtil* ccutil_;
   UnicharAmbigs *dang_ambigs_table_;
   UnicharAmbigs *replace_ambigs_table_;
   float reject_offset_;
   // Cached UNICHAR_IDs:
   UNICHAR_ID wildcard_unichar_id_;    // kDictWildcard.
   UNICHAR_ID apostrophe_unichar_id_;  // kApostropheSymbol.
   UNICHAR_ID question_unichar_id_;    // kQuestionSymbol.
   UNICHAR_ID slash_unichar_id_;       // kSlashSymbol.
   UNICHAR_ID hyphen_unichar_id_;      // kHyphenSymbol.
   // Hyphen-related variables.
   WERD_CHOICE *hyphen_word_;
   DawgPositionVector hyphen_active_dawgs_;
   bool last_word_on_line_;
   // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
   // matching.  The first member of each list is taken as canonical.  For
   // example, the first list contains hyphens and dashes with the first symbol
   // being the ASCII hyphen minus.
   GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
   // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
   DawgCache *dawg_cache_;
   bool dawg_cache_is_ours_;  // we should delete our own dawg_cache_
   // Dawgs.
   DawgVector dawgs_;
   SuccessorListsVector successors_;
   Trie *pending_words_;
   // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
   // any of them are present on the best choices list for a word pair.
   // the bigrams are stored as space-separated words where:
   // (1) leading and trailing punctuation has been removed from each word and
   // (2) any digits have been replaced with '?' marks.
   Dawg *bigram_dawg_;
   // TODO(daria): need to support multiple languages in the future,
   // so maybe will need to maintain a list of dawgs of each kind.
   Dawg *freq_dawg_;
   Dawg *unambig_dawg_;
   Dawg *punc_dawg_;
   Trie *document_words_;
   float wordseg_rating_adjust_factor_;
   // File for recording ambiguities discovered during dictionary search.
   FILE *output_ambig_words_file_;

  public:
   STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
   STRING_VAR_H(user_words_suffix, "",
                "A suffix of user-provided words located in tessdata.");
   STRING_VAR_H(user_patterns_file, "",
                "A filename of user-provided patterns.");
   STRING_VAR_H(user_patterns_suffix, "",
                "A suffix of user-provided patterns located in tessdata.");
   BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
   BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
   BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
   BOOL_VAR_H(load_punc_dawg, true,
              "Load dawg with punctuation patterns.");
   BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
   BOOL_VAR_H(load_bigram_dawg, true,
              "Load dawg with special word bigrams.");
   double_VAR_H(xheight_penalty_subscripts, 0.125,
                "Score penalty (0.1 = 10%) added if there are subscripts "
                "or superscripts in a word, but it is otherwise OK.");
   double_VAR_H(xheight_penalty_inconsistent, 0.25,
                "Score penalty (0.1 = 10%) added if an xheight is "
                "inconsistent.");
   // Help text fix: the adjacent string literals are concatenated, so the
   // first one needs a trailing space (it previously read "...case andare...").
   double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
                "Score multiplier for word matches which have good case and "
                "are frequent in the given language (lower is better).");

   double_VAR_H(segment_penalty_dict_case_ok, 1.1,
                "Score multiplier for word matches that have good case "
                "(lower is better).");

   double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
                "Default score multiplier for word matches, which may have "
                "case issues (lower is better).");

   double_VAR_H(segment_penalty_dict_nonword, 1.25,
                "Score multiplier for glyph fragment segmentations which "
                "do not match a dictionary word (lower is better).");

   double_VAR_H(segment_penalty_garbage, 1.50,
                "Score multiplier for poorly cased strings that are not in"
                " the dictionary and generally look like garbage (lower is"
                " better).");
   STRING_VAR_H(output_ambig_words_file, "",
                "Output file for ambiguities found in the dictionary");
   INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
             ", to 2 for more details, to 3 to see all the debug messages");
   INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
   INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
   BOOL_VAR_H(use_only_first_uft8_step, false,
              "Use only the first UTF8 step of the given string"
              " when computing log probabilities.");
   double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
   double_VAR_H(stopper_nondict_certainty_base, -2.50,
                "Certainty threshold for non-dict words");
   double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
                "Reject certainty offset");
   INT_VAR_H(stopper_smallword_size, 2,
             "Size of dict word to be treated as non-dict word");
   double_VAR_H(stopper_certainty_per_char, -0.50,
                "Certainty to add for each dict char above small word size.");
   // Help text fix: "certaintly" -> "certainty".
   double_VAR_H(stopper_allowable_character_badness, 3.0,
                "Max certainty variation allowed in a word (in sigma)");
   INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
   BOOL_VAR_H(stopper_no_acceptable_choices, false,
              "Make AcceptableChoice() always return false. Useful"
              " when there is a need to explore all segmentations");
   INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
   STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
                " should be printed to stdout");
   STRING_VAR_H(word_to_debug_lengths, "",
                "Lengths of unichars in word_to_debug");
   INT_VAR_H(fragments_debug, 0, "Debug character fragments");
   // Help text fix: the adjacent string literals are concatenated, so the
   // first sentence needs a trailing space (it previously read "tricks.Set").
   BOOL_VAR_H(segment_nonalphabetic_script, false,
              "Don't use any alphabetic-specific tricks. "
              "Set to true in the traineddata config file for"
              " scripts that are cursive or inherently fixed-pitch");
   BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
   double_VAR_H(doc_dict_pending_threshold, 0.0,
                "Worst certainty for using pending dictionary");
   double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
                " for words that can be inserted into the document dictionary");
   INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
               " character choices to consider during permutation."
               " This limit is especially useful when user patterns"
               " are specified, since overly generic patterns can result in"
               " dawg search exploring an overly large number of options.");
 };
 }  // namespace tesseract

 #endif  // TESSERACT_DICT_DICT_H_
tesseract::Dict::output_ambig_words_file_
FILE * output_ambig_words_file_
Definition: dict.h:555

tesseract::Dict::dawg_cache_
DawgCache * dawg_cache_
Definition: dict.h:531

tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459

tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Returns the pointer to the punctuation dawg.
Definition: dict.h:419

tesseract::Dict::freq_dawg_
Dawg * freq_dawg_
Definition: dict.h:547

tesseract::kDocDictMaxRepChars
static const int kDocDictMaxRepChars
Definition: dict.h:67

tesseract::Dict::wildcard_unichar_id_
UNICHAR_ID wildcard_unichar_id_
Definition: dict.h:516

tesseract::Dict::unambig_dawg_
Dawg * unambig_dawg_
Definition: dict.h:548

tesseract::Dict::def_probability_in_context
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:386

tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:375

tesseract::Dawg
Definition: dawg.h:119

tesseract::Dict::NumDawgs
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:415

tesseract::kQuestionSymbol
static const char kQuestionSymbol[]
Definition: dict.h:62

tesseract::Dict::dang_ambigs_table_
UnicharAmbigs * dang_ambigs_table_
Definition: dict.h:510

tesseract::Dict::valid_word
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:473

tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:109

tesseract::XH_SUBNORMAL
Definition: dict.h:75

tesseract::XH_GOOD
Definition: dict.h:75

tesseract::Dict::hyphen_active_dawgs_
DawgPositionVector hyphen_active_dawgs_
Definition: dict.h:523

tesseract::kSlashSymbol
static const char kSlashSymbol[]
Definition: dict.h:61

UNICHARSET
Definition: unicharset.h:146

tesseract::Dict::apostrophe_unichar_id_
UNICHAR_ID apostrophe_unichar_id_
Definition: dict.h:517

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144

tesseract::kSimilarityFloor
static const float kSimilarityFloor
Definition: dict.h:66

MATRIX
Definition: matrix.h:575

tesseract::Dict::last_word_on_line_
bool last_word_on_line_
Definition: dict.h:524

tesseract::Dict::ResetDocumentDictionary
void ResetDocumentDictionary()
Definition: dict.h:311

tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:128

tesseract::kRatingPad
static const int kRatingPad
Definition: dict.h:55

CHAR_FRAGMENT_INFO::rating
float rating
Definition: dict.h:44

tesseract
Definition: baseapi.cpp:94

tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127

CHAR_FRAGMENT_INFO
Definition: dict.h:40

WERD_CHOICE
Definition: ratngs.h:273

tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104

tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:82

CHAR_FRAGMENT_INFO::certainty
float certainty
Definition: dict.h:45

CHAR_FRAGMENT_INFO::num_fragments
int num_fragments
Definition: dict.h:43

tesseract::kApostropheSymbol
static const char kApostropheSymbol[]
Definition: dict.h:63

tesseract::Dict::getUnicharset
UNICHARSET & getUnicharset()
Definition: dict.h:101

tesseract::Dict::slash_unichar_id_
UNICHAR_ID slash_unichar_id_
Definition: dict.h:519

tesseract::DAWG_TYPE_NUMBER
Definition: dawg.h:75

tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:92

tesseract::kSimCertaintyScale
static const float kSimCertaintyScale
Definition: dict.h:64

tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:83

WERD_CHOICE::length
int length() const
Definition: ratngs.h:303

tesseract::CCUtil
Definition: ccutil.h:51

tesseract::Dict::getCCUtil
CCUtil * getCCUtil()
Definition: dict.h:95

tesseract::Dict
Definition: dict.h:88

CHAR_FRAGMENT_INFO::fragment
const CHAR_FRAGMENT * fragment
Definition: dict.h:42

tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:131

tesseract::DawgPositionVector
Definition: dawg.h:381

tesseract::Dict::hyphen_word_
WERD_CHOICE * hyphen_word_
Definition: dict.h:522

tesseract::Dict::valid_word_or_number
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:469

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507

tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:118

tesseract::TessdataManager
Definition: tessdatamanager.h:126

GenericVector
Definition: baseapi.h:37

tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423

tesseract::Dict::question_unichar_id_
UNICHAR_ID question_unichar_id_
Definition: dict.h:518

tesseract::Dict::SetWildcardID
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:412

tesseract::UnicharAmbigs
Definition: ambigs.h:143

STRING
Definition: strngs.h:45

tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:417

tesseract::Dict::dawg_cache_is_ours_
bool dawg_cache_is_ours_
Definition: dict.h:532

tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:433

tesseract::Dict::replace_ambigs_table_
UnicharAmbigs * replace_ambigs_table_
Definition: dict.h:512

tesseract::Dict::punc_dawg_
Dawg * punc_dawg_
Definition: dict.h:549

tesseract::Dict::document_words_
Trie * document_words_
Definition: dict.h:550

tesseract::Dict::update_best_choice
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:171

tesseract::Dict::dawgs_
DawgVector dawgs_
Definition: dict.h:534

tesseract::kDictMaxWildcards
static const int kDictMaxWildcards
Definition: dict.h:57

tesseract::Dict::equivalent_symbols_
GenericVector< GenericVectorEqEq< UNICHAR_ID > > equivalent_symbols_
Definition: dict.h:529

tesseract::kDictWildcard
static const char kDictWildcard[]
Definition: dict.h:56

tesseract::Dict::hyphen_unichar_id_
UNICHAR_ID hyphen_unichar_id_
Definition: dict.h:520

tesseract::Dict::bigram_dawg_
Dawg * bigram_dawg_
Definition: dict.h:544

tesseract::Dict::ccutil_
CCUtil * ccutil_
Definition: dict.h:503

WERD_RES
Definition: pageres.h:169

tesseract::Dict::CallParamsModelClassify
float CallParamsModelClassify(void *path)
Definition: dict.h:406

GenericVector::size
int size() const
Definition: genericvector.h:71

tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:466

tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:137

tesseract::kSimCertaintyOffset
static const float kSimCertaintyOffset
Definition: dict.h:65

tesseract::Dict::successors_
SuccessorListsVector successors_
Definition: dict.h:535

tesseract::DawgPosition
Definition: dawg.h:354

tesseract::Dawg::next_node
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0

tesseract::Dict::WildcardID
UNICHAR_ID WildcardID() const
Definition: dict.h:413

tesseract::Dict::SetWordsegRatingAdjustFactor
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:495

tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:361

tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:75

tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

WERD_CHOICE::print
void print() const
Definition: ratngs.h:580

tesseract::DawgArgs::valid_end
bool valid_end
Definition: dict.h:85

tesseract::Dict::GetUnambigDawg
const Dawg * GetUnambigDawg() const
Returns the pointer to the unambiguous words dawg.
Definition: dict.h:421

WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

CHAR_FRAGMENT
Definition: unicharset.h:49

tesseract::DawgArgs::DawgArgs
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:78

tesseract::kHyphenSymbol
static const char kHyphenSymbol[]
Definition: dict.h:60

tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:81

tesseract::Trie
Definition: trie.h:61

tesseract::DawgArgs
Definition: dict.h:77

tesseract::DawgCache
Definition: dawg_cache.h:30

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:327

tesseract::Dict::reject_offset_
float reject_offset_
Definition: dict.h:514

tesseract::XH_INCONSISTENT
Definition: dict.h:75

tesseract::Dict::pending_words_
Trie * pending_words_
Definition: dict.h:536

CHAR_FRAGMENT_INFO::unichar_id
UNICHAR_ID unichar_id
Definition: dict.h:41

tesseract::Dict::wordseg_rating_adjust_factor_
float wordseg_rating_adjust_factor_
Definition: dict.h:553

BLOB_CHOICE
Definition: ratngs.h:49

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:152