#include <tesseractclass.h>
Public Member Functions | ||||||||||
Tesseract () | ||||||||||
~Tesseract () | ||||||||||
Dict & | getDict () override | |||||||||
void | Clear () | |||||||||
void | ResetAdaptiveClassifier () | |||||||||
void | ResetDocumentDictionary () | |||||||||
void | SetEquationDetect (EquationDetect *detector) | |||||||||
const FCOORD & | reskew () const | |||||||||
Pix ** | mutable_pix_binary () | |||||||||
Pix * | pix_binary () const | |||||||||
Pix * | pix_grey () const | |||||||||
void | set_pix_grey (Pix *grey_pix) | |||||||||
Pix * | pix_original () const | |||||||||
void | set_pix_original (Pix *original_pix) | |||||||||
Pix * | BestPix () const | |||||||||
void | set_pix_thresholds (Pix *thresholds) | |||||||||
int | source_resolution () const | |||||||||
void | set_source_resolution (int ppi) | |||||||||
int | ImageWidth () const | |||||||||
int | ImageHeight () const | |||||||||
Pix * | scaled_color () const | |||||||||
int | scaled_factor () const | |||||||||
void | SetScaledColor (int factor, Pix *color) | |||||||||
const Textord & | textord () const | |||||||||
Textord * | mutable_textord () | |||||||||
bool | right_to_left () const | |||||||||
int | num_sub_langs () const | |||||||||
Tesseract * | get_sub_lang (int index) const | |||||||||
bool | AnyTessLang () const | |||||||||
bool | AnyLSTMLang () const | |||||||||
void | SetBlackAndWhitelist () | |||||||||
void | PrepareForPageseg () | |||||||||
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) | |||||||||
int | SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
void | SetupWordScripts (BLOCK_LIST *blocks) | |||||||||
int | AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) | |||||||||
ColumnFinder * | SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix) | |||||||||
void | PrerecAllWordsPar (const GenericVector< WordData > &words) | |||||||||
void | TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list) | |||||||||
void | TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data) | |||||||||
ImageData * | GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block) | |||||||||
ImageData * | GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const | |||||||||
void | LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words) | |||||||||
void | SearchWords (PointerVector< WERD_RES > *words) | |||||||||
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) | |||||||||
void | SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words) | |||||||||
void | SetupWordPassN (int pass_n, WordData *word) | |||||||||
bool | RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words) | |||||||||
bool | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) | |||||||||
void | rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config) | |||||||||
void | bigram_correction_pass (PAGE_RES *page_res) | |||||||||
void | blamer_pass (PAGE_RES *page_res) | |||||||||
void | script_pos_pass (PAGE_RES *page_res) | |||||||||
int | RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words) | |||||||||
bool | ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) | |||||||||
void | AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs) | |||||||||
void | AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs) | |||||||||
bool | SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines) | |||||||||
float | ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str) | |||||||||
float | ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2) | |||||||||
void | classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) | |||||||||
void | classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) | |||||||||
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) | |||||||||
void | fix_rep_char (PAGE_RES_IT *page_res_it) | |||||||||
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths) | |||||||||
void | match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block) | |||||||||
void | classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) | |||||||||
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) | |||||||||
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | recog_interactive (PAGE_RES_IT *pr_it) | |||||||||
void | set_word_fonts (WERD_RES *word) | |||||||||
void | font_recognition_pass (PAGE_RES *page_res) | |||||||||
void | dictionary_correction_pass (PAGE_RES *page_res) | |||||||||
bool | check_debug_pt (WERD_RES *word, int location) | |||||||||
bool | SubAndSuperscriptFix (WERD_RES *word_res) | |||||||||
void | GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold) | |||||||||
WERD_RES * | TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing) | |||||||||
bool | BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const | |||||||||
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) | |||||||||
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol) | |||||||||
void | set_unlv_suspects (WERD_RES *word) | |||||||||
UNICHAR_ID | get_rep_char (WERD_RES *word) | |||||||||
bool | acceptable_number_string (const char *s, const char *lengths) | |||||||||
int16_t | count_alphanums (const WERD_CHOICE &word) | |||||||||
int16_t | count_alphas (const WERD_CHOICE &word) | |||||||||
void | read_config_file (const char *filename, SetParamConstraint constraint) | |||||||||
int | init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) | |||||||||
int | init_tesseract (const char *datapath, const char *language, OcrEngineMode oem) | |||||||||
int | init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) | |||||||||
void | SetupUniversalFontIds () | |||||||||
int | init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr) | |||||||||
void | recognize_page (STRING &image_name) | |||||||||
void | end_tesseract () | |||||||||
bool | init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) | |||||||||
void | ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load) | |||||||||
SVMenuNode * | build_menu_new () | |||||||||
void | pgeditor_main (int width, int height, PAGE_RES *page_res) | |||||||||
void | process_image_event (const SVEvent &event) | |||||||||
bool | process_cmd_win_event (int32_t cmd_event, char *new_value) | |||||||||
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) | |||||||||
bool | word_display (PAGE_RES_IT *pr_it) | |||||||||
bool | word_bln_display (PAGE_RES_IT *pr_it) | |||||||||
bool | word_blank_and_set_display (PAGE_RES_IT *pr_its) | |||||||||
bool | word_set_display (PAGE_RES_IT *pr_it) | |||||||||
bool | word_dumper (PAGE_RES_IT *pr_it) | |||||||||
void | blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | make_reject_map (WERD_RES *word, ROW *row, int16_t pass) | |||||||||
bool | one_ell_conflict (WERD_RES *word_res, bool update_map) | |||||||||
int16_t | first_alphanum_index (const char *word, const char *word_lengths) | |||||||||
int16_t | first_alphanum_offset (const char *word, const char *word_lengths) | |||||||||
int16_t | alpha_count (const char *word, const char *word_lengths) | |||||||||
bool | word_contains_non_1_digit (const char *word, const char *word_lengths) | |||||||||
void | dont_allow_1Il (WERD_RES *word) | |||||||||
int16_t | count_alphanums (WERD_RES *word) | |||||||||
void | flip_0O (WERD_RES *word) | |||||||||
bool | non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
bool | non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
bool | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) | |||||||||
void | nn_match_word (WERD_RES *word, ROW *row) | |||||||||
void | nn_recover_rejects (WERD_RES *word, ROW *row) | |||||||||
void | set_done (WERD_RES *word, int16_t pass) | |||||||||
int16_t | safe_dict_word (const WERD_RES *werd_res) | |||||||||
void | flip_hyphens (WERD_RES *word) | |||||||||
void | reject_I_1_L (WERD_RES *word) | |||||||||
void | reject_edge_blobs (WERD_RES *word) | |||||||||
void | reject_mostly_rejects (WERD_RES *word) | |||||||||
bool | word_adaptable (WERD_RES *word, uint16_t mode) | |||||||||
void | recog_word_recursive (WERD_RES *word) | |||||||||
void | recog_word (WERD_RES *word) | |||||||||
void | split_and_recog_word (WERD_RES *word) | |||||||||
void | split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const | |||||||||
void | join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const | |||||||||
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) | |||||||||
int16_t | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
void | dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) | |||||||||
bool | fixspace_thinks_word_done (WERD_RES *word) | |||||||||
GARBAGE_LEVEL | garbage_word (WERD_RES *word, BOOL8 ok_dict_word) | |||||||||
bool | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word) | |||||||||
void | tilde_crunch (PAGE_RES_IT &page_res_it) | |||||||||
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) | |||||||||
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) | |||||||||
void | quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) | |||||||||
void | convert_bad_unlv_chs (WERD_RES *word_res) | |||||||||
void | tilde_delete (PAGE_RES_IT &page_res_it) | |||||||||
int16_t | word_blob_quality (WERD_RES *word, ROW *row) | |||||||||
void | word_char_quality (WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count) | |||||||||
void | unrej_good_chs (WERD_RES *word, ROW *row) | |||||||||
int16_t | count_outline_errs (char c, int16_t outline_count) | |||||||||
int16_t | word_outline_errs (WERD_RES *word) | |||||||||
bool | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) | |||||||||
CRUNCH_MODE | word_deletable (WERD_RES *word, int16_t &delete_mode) | |||||||||
int16_t | failure_count (WERD_RES *word) | |||||||||
bool | noise_outlines (TWERD *word) | |||||||||
void | tess_segment_pass_n (int pass_n, WERD_RES *word) | |||||||||
PAGE_RES * | ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list) | |||||||||
void | PreenXHeights (BLOCK_LIST *block_list) | |||||||||
PAGE_RES * | SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list) | |||||||||
void | MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text) | |||||||||
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text) | |||||||||
void | ReSegmentByClassification (PAGE_RES *page_res) | |||||||||
bool | ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids) | |||||||||
bool | FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res) | |||||||||
void | SearchForText (const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation) | |||||||||
void | TidyUp (PAGE_RES *page_res) | |||||||||
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) | |||||||||
void | CorrectClassifyWords (PAGE_RES *page_res) | |||||||||
void | ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res) | |||||||||
int | CountMisfitTops (WERD_RES *word_res) | |||||||||
float | ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift) | |||||||||
BOOL_VAR_H (tessedit_resegment_from_boxes, false, "Take segmentation and labeling from box file") | ||||||||||
BOOL_VAR_H (tessedit_resegment_from_line_boxes, false, "Conversion of word/line box file to char box file") | ||||||||||
BOOL_VAR_H (tessedit_train_from_boxes, false, "Generate training data from boxed chars") | ||||||||||
BOOL_VAR_H (tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars") | ||||||||||
BOOL_VAR_H (tessedit_train_line_recognizer, false, "Break input into lines and remap boxes if present") | ||||||||||
BOOL_VAR_H (tessedit_dump_pageseg_images, false, "Dump intermediate images made during page segmentation") | ||||||||||
INT_VAR_H (tessedit_pageseg_mode, PSM_SINGLE_BLOCK, "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)") | ||||||||||
INT_VAR_H (tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT, "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available.") | ||||||||||
STRING_VAR_H (tessedit_char_blacklist, "", "Blacklist of chars not to recognize") | ||||||||||
STRING_VAR_H (tessedit_char_whitelist, "", "Whitelist of chars to recognize") | ||||||||||
STRING_VAR_H (tessedit_char_unblacklist, "", "List of chars to override tessedit_char_blacklist") | ||||||||||
BOOL_VAR_H (tessedit_ambigs_training, false, "Perform training for ambiguities") | ||||||||||
INT_VAR_H (pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, "Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation.") | ||||||||||
INT_VAR_H (ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, "Whether to use the top-line splitting process for Devanagari " "documents while performing ocr.") | ||||||||||
STRING_VAR_H (tessedit_write_params_to_file, "", "Write all parameters to the given file.") | ||||||||||
BOOL_VAR_H (tessedit_adaption_debug, false, "Generate and print debug information for adaption") | ||||||||||
INT_VAR_H (bidi_debug, 0, "Debug level for BiDi") | ||||||||||
INT_VAR_H (applybox_debug, 1, "Debug level") | ||||||||||
INT_VAR_H (applybox_page, 0, "Page number to apply boxes from") | ||||||||||
STRING_VAR_H (applybox_exposure_pattern, ".exp", "Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif") | ||||||||||
BOOL_VAR_H (applybox_learn_chars_and_char_frags_mode, false, "Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters.") | ||||||||||
BOOL_VAR_H (applybox_learn_ngrams_mode, false, "Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally.") | ||||||||||
BOOL_VAR_H (tessedit_display_outwords, false, "Draw output words") | ||||||||||
BOOL_VAR_H (tessedit_dump_choices, false, "Dump char choices") | ||||||||||
BOOL_VAR_H (tessedit_timing_debug, false, "Print timing stats") | ||||||||||
BOOL_VAR_H (tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces") | ||||||||||
BOOL_VAR_H (tessedit_unrej_any_wd, false, "Don't bother with word plausibility") | ||||||||||
BOOL_VAR_H (tessedit_fix_hyphens, true, "Crunch double hyphens?") | ||||||||||
BOOL_VAR_H (tessedit_redo_xheight, true, "Check/Correct x-height") | ||||||||||
BOOL_VAR_H (tessedit_enable_doc_dict, true, "Add words to the document dictionary") | ||||||||||
BOOL_VAR_H (tessedit_debug_fonts, false, "Output font info per char") | ||||||||||
BOOL_VAR_H (tessedit_debug_block_rejection, false, "Block and Row stats") | ||||||||||
BOOL_VAR_H (tessedit_enable_bigram_correction, true, "Enable correction based on the word bigram dictionary.") | ||||||||||
BOOL_VAR_H (tessedit_enable_dict_correction, false, "Enable single word correction based on the dictionary.") | ||||||||||
INT_VAR_H (tessedit_bigram_debug, 0, "Amount of debug output for bigram " "correction.") | ||||||||||
BOOL_VAR_H (enable_noise_removal, true, "Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise") | ||||||||||
INT_VAR_H (debug_noise_removal, 0, "Debug reassignment of small outlines") | ||||||||||
double_VAR_H (noise_cert_basechar, -8.0, "Hingepoint for base char certainty") | ||||||||||
double_VAR_H (noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty") | ||||||||||
double_VAR_H (noise_cert_punc, -2.5, "Threshold for new punc char certainty") | ||||||||||
double_VAR_H (noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint") | ||||||||||
INT_VAR_H (noise_maxperblob, 8, "Max diacritics to apply to a blob") | ||||||||||
INT_VAR_H (noise_maxperword, 16, "Max diacritics to apply to a word") | ||||||||||
INT_VAR_H (debug_x_ht_level, 0, "Reestimate debug") | ||||||||||
BOOL_VAR_H (debug_acceptable_wds, false, "Dump word pass/fail chk") | ||||||||||
STRING_VAR_H (chs_leading_punct, "('`\"", "Leading punctuation") | ||||||||||
STRING_VAR_H (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation") | ||||||||||
STRING_VAR_H (chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation") | ||||||||||
double_VAR_H (quality_rej_pc, 0.08, "good_quality_doc lte rejection limit") | ||||||||||
double_VAR_H (quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit") | ||||||||||
double_VAR_H (quality_outline_pc, 1.0, "good_quality_doc lte outline error limit") | ||||||||||
double_VAR_H (quality_char_pc, 0.95, "good_quality_doc gte good char limit") | ||||||||||
INT_VAR_H (quality_min_initial_alphas_reqd, 2, "alphas in a good word") | ||||||||||
INT_VAR_H (tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess") | ||||||||||
BOOL_VAR_H (tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output") | ||||||||||
BOOL_VAR_H (tessedit_test_adaption, false, "Test adaption criteria") | ||||||||||
BOOL_VAR_H (tessedit_matcher_log, false, "Log matcher activity") | ||||||||||
INT_VAR_H (tessedit_test_adaption_mode, 3, "Adaptation decision algorithm for tess") | ||||||||||
BOOL_VAR_H (test_pt, false, "Test for point") | ||||||||||
double_VAR_H (test_pt_x, 99999.99, "xcoord") | ||||||||||
double_VAR_H (test_pt_y, 99999.99, "ycoord") | ||||||||||
INT_VAR_H (multilang_debug_level, 0, "Print multilang debug info.") | ||||||||||
INT_VAR_H (paragraph_debug_level, 0, "Print paragraph debug info.") | ||||||||||
BOOL_VAR_H (paragraph_text_based, true, "Run paragraph detection on the post-text-recognition " "(more accurate)") | ||||||||||
BOOL_VAR_H (lstm_use_matrix, 1, "Use ratings matrix/beam searct with lstm") | ||||||||||
STRING_VAR_H (outlines_odd, "%| ", "Non standard number of outlines") | ||||||||||
STRING_VAR_H (outlines_2, "ij!?%\":;", "Non standard number of outlines") | ||||||||||
BOOL_VAR_H (docqual_excuse_outline_errs, false, "Allow outline errs in unrejection?") | ||||||||||
BOOL_VAR_H (tessedit_good_quality_unrej, true, "Reduce rejection on good docs") | ||||||||||
BOOL_VAR_H (tessedit_use_reject_spaces, true, "Reject spaces?") | ||||||||||
double_VAR_H (tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc") | ||||||||||
double_VAR_H (tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block") | ||||||||||
double_VAR_H (tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row") | ||||||||||
double_VAR_H (tessedit_whole_wd_rej_row_percent, 70.00, "Number of row rejects in whole word rejects" "which prevents whole row rejection") | ||||||||||
BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds, true, "Only rej partially rejected words in block rejection") | ||||||||||
BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds, true, "Only rej partially rejected words in row rejection") | ||||||||||
BOOL_VAR_H (tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric") | ||||||||||
BOOL_VAR_H (tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric") | ||||||||||
INT_VAR_H (tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this") | ||||||||||
BOOL_VAR_H (tessedit_row_rej_good_docs, true, "Apply row rejection to good docs") | ||||||||||
double_VAR_H (tessedit_good_doc_still_rowrej_wd, 1.1, "rej good doc wd if more than this fraction rejected") | ||||||||||
BOOL_VAR_H (tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds") | ||||||||||
BOOL_VAR_H (tessedit_debug_doc_rejection, false, "Page stats") | ||||||||||
BOOL_VAR_H (tessedit_debug_quality_metrics, false, "Output data to debug file") | ||||||||||
BOOL_VAR_H (bland_unrej, false, "unrej potential with no checks") | ||||||||||
double_VAR_H (quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit") | ||||||||||
BOOL_VAR_H (unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch") | ||||||||||
BOOL_VAR_H (hocr_font_info, false, "Add font info to hocr output") | ||||||||||
BOOL_VAR_H (crunch_early_merge_tess_fails, true, "Before word crunch?") | ||||||||||
BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?") | ||||||||||
double_VAR_H (crunch_terrible_rating, 80.0, "crunch rating lt this") | ||||||||||
BOOL_VAR_H (crunch_terrible_garbage, true, "As it says") | ||||||||||
double_VAR_H (crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this") | ||||||||||
double_VAR_H (crunch_poor_garbage_rate, 60, "crunch garbage rating lt this") | ||||||||||
double_VAR_H (crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this") | ||||||||||
double_VAR_H (crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this") | ||||||||||
BOOL_VAR_H (crunch_pot_garbage, true, "POTENTIAL crunch garbage") | ||||||||||
double_VAR_H (crunch_del_rating, 60, "POTENTIAL crunch rating lt this") | ||||||||||
double_VAR_H (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this") | ||||||||||
double_VAR_H (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this") | ||||||||||
double_VAR_H (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this") | ||||||||||
double_VAR_H (crunch_del_min_width, 3.0, "Del if word width lt xht x this") | ||||||||||
double_VAR_H (crunch_del_high_word, 1.5, "Del if word gt xht x this above bl") | ||||||||||
double_VAR_H (crunch_del_low_word, 0.5, "Del if word gt xht x this below bl") | ||||||||||
double_VAR_H (crunch_small_outlines_size, 0.6, "Small if lt xht x this") | ||||||||||
INT_VAR_H (crunch_rating_max, 10, "For adj length in rating per ch") | ||||||||||
INT_VAR_H (crunch_pot_indicators, 1, "How many potential indicators needed") | ||||||||||
BOOL_VAR_H (crunch_leave_ok_strings, true, "Don't touch sensible strings") | ||||||||||
BOOL_VAR_H (crunch_accept_ok, true, "Use acceptability in okstring") | ||||||||||
BOOL_VAR_H (crunch_leave_accept_strings, false, "Don't pot crunch sensible strings") | ||||||||||
BOOL_VAR_H (crunch_include_numerals, false, "Fiddle alpha figures") | ||||||||||
INT_VAR_H (crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings") | ||||||||||
INT_VAR_H (crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings") | ||||||||||
INT_VAR_H (crunch_long_repetitions, 3, "Crunch words with long repetitions") | ||||||||||
INT_VAR_H (crunch_debug, 0, "As it says") | ||||||||||
INT_VAR_H (fixsp_non_noise_limit, 1, "How many non-noise blbs either side?") | ||||||||||
double_VAR_H (fixsp_small_outlines_size, 0.28, "Small if lt xht x this") | ||||||||||
BOOL_VAR_H (tessedit_prefer_joined_punct, false, "Reward punctuation joins") | ||||||||||
INT_VAR_H (fixsp_done_mode, 1, "What constitues done for spacing") | ||||||||||
INT_VAR_H (debug_fix_space_level, 0, "Contextual fixspace debug") | ||||||||||
STRING_VAR_H (numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers") | ||||||||||
INT_VAR_H (x_ht_acceptance_tolerance, 8, "Max allowed deviation of blob top outside of font data") | ||||||||||
INT_VAR_H (x_ht_min_change, 8, "Min change in xht before actually trying it") | ||||||||||
INT_VAR_H (superscript_debug, 0, "Debug level for sub & superscript fixer") | ||||||||||
double_VAR_H (superscript_worse_certainty, 2.0, "How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?") | ||||||||||
double_VAR_H (superscript_bettered_certainty, 0.97, "What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%") | ||||||||||
double_VAR_H (superscript_scaledown_ratio, 0.4, "A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size.") | ||||||||||
double_VAR_H (subscript_max_y_top, 0.5, "Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript.") | ||||||||||
double_VAR_H (superscript_min_y_bottom, 0.3, "Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript.") | ||||||||||
BOOL_VAR_H (tessedit_write_block_separators, false, "Write block separators in output") | ||||||||||
BOOL_VAR_H (tessedit_write_rep_codes, false, "Write repetition char code") | ||||||||||
BOOL_VAR_H (tessedit_write_unlv, false, "Write .unlv output file") | ||||||||||
BOOL_VAR_H (tessedit_create_txt, false, "Write .txt output file") | ||||||||||
BOOL_VAR_H (tessedit_create_hocr, false, "Write .html hOCR output file") | ||||||||||
BOOL_VAR_H (tessedit_create_tsv, false, "Write .tsv output file") | ||||||||||
BOOL_VAR_H (tessedit_create_pdf, false, "Write .pdf output file") | ||||||||||
BOOL_VAR_H (textonly_pdf, false, "Create PDF with only one invisible text layer") | ||||||||||
INT_VAR_H (jpg_quality, 85, "Set JPEG quality level") | ||||||||||
INT_VAR_H (user_defined_dpi, 0, "Specify DPI for input image") | ||||||||||
INT_VAR_H (min_characters_to_try, 50, "Specify minimum characters to try during OSD") | ||||||||||
STRING_VAR_H (unrecognised_char, "|", "Output char for unidentified blobs") | ||||||||||
INT_VAR_H (suspect_level, 99, "Suspect marker level") | ||||||||||
INT_VAR_H (suspect_space_level, 100, "Min suspect level for rejecting spaces") | ||||||||||
INT_VAR_H (suspect_short_words, 2, "Don't Suspect dict wds longer than this") | ||||||||||
BOOL_VAR_H (suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected") | ||||||||||
double_VAR_H (suspect_rating_per_ch, 999.9, "Don't touch bad rating limit") | ||||||||||
double_VAR_H (suspect_accept_rating, -999.9, "Accept good rating limit") | ||||||||||
BOOL_VAR_H (tessedit_minimal_rejection, false, "Only reject tess failures") | ||||||||||
BOOL_VAR_H (tessedit_zero_rejection, false, "Don't reject ANYTHING") | ||||||||||
BOOL_VAR_H (tessedit_word_for_word, false, "Make output have exactly one word per WERD") | ||||||||||
BOOL_VAR_H (tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL") | ||||||||||
BOOL_VAR_H (tessedit_consistent_reps, true, "Force all rep chars the same") | ||||||||||
INT_VAR_H (tessedit_reject_mode, 0, "Rejection algorithm") | ||||||||||
BOOL_VAR_H (tessedit_rejection_debug, false, "Adaption debug") | ||||||||||
BOOL_VAR_H (tessedit_flip_0O, true, "Contextual 0O O0 flips") | ||||||||||
double_VAR_H (tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test") | ||||||||||
double_VAR_H (tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test") | ||||||||||
BOOL_VAR_H (rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector") | ||||||||||
BOOL_VAR_H (rej_1Il_use_dict_word, false, "Use dictword test") | ||||||||||
BOOL_VAR_H (rej_1Il_trust_permuter_type, true, "Don't double check") | ||||||||||
BOOL_VAR_H (rej_use_tess_accepted, true, "Individual rejection control") | ||||||||||
BOOL_VAR_H (rej_use_tess_blanks, true, "Individual rejection control") | ||||||||||
BOOL_VAR_H (rej_use_good_perm, true, "Individual rejection control") | ||||||||||
BOOL_VAR_H (rej_use_sensible_wd, false, "Extend permuter check") | ||||||||||
BOOL_VAR_H (rej_alphas_in_number_perm, false, "Extend permuter check") | ||||||||||
double_VAR_H (rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract") | ||||||||||
INT_VAR_H (tessedit_image_border, 2, "Rej blbs near image edge limit") | ||||||||||
STRING_VAR_H (ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej") | ||||||||||
STRING_VAR_H (conflict_set_I_l_1, "Il1[]", "Il1 conflict set") | ||||||||||
INT_VAR_H (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this") | ||||||||||
BOOL_VAR_H (tessedit_create_boxfile, false, "Output text with boxes") | ||||||||||
INT_VAR_H (tessedit_page_number, -1, "-1 -> All pages, else specific page to process") | ||||||||||
BOOL_VAR_H (tessedit_write_images, false, "Capture the image from the IPE") | ||||||||||
BOOL_VAR_H (interactive_display_mode, false, "Run interactively?") | ||||||||||
STRING_VAR_H (file_type, ".tif", "Filename extension") | ||||||||||
BOOL_VAR_H (tessedit_override_permuter, true, "According to dict_word") | ||||||||||
STRING_VAR_H (tessedit_load_sublangs, "", "List of languages to load with this one") | ||||||||||
BOOL_VAR_H (tessedit_use_primary_params_model, false, "In multilingual mode use params model of the primary language") | ||||||||||
double_VAR_H (min_orientation_margin, 7.0, "Min acceptable orientation margin") | ||||||||||
BOOL_VAR_H (textord_tabfind_show_vlines, false, "Debug line finding") | ||||||||||
BOOL_VAR_H (textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model") | ||||||||||
BOOL_VAR_H (poly_allow_detailed_fx, false, "Allow feature extractors to see the original outline") | ||||||||||
BOOL_VAR_H (tessedit_init_config_only, false, "Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis.") | ||||||||||
BOOL_VAR_H (textord_equation_detect, false, "Turn on equation detector") | ||||||||||
BOOL_VAR_H (textord_tabfind_vertical_text, true, "Enable vertical detection") | ||||||||||
BOOL_VAR_H (textord_tabfind_force_vertical_text, false, "Force using vertical text page mode") | ||||||||||
double_VAR_H (textord_tabfind_vertical_text_ratio, 0.5, "Fraction of textlines deemed vertical to use vertical page " "mode") | ||||||||||
double_VAR_H (textord_tabfind_aligned_gap_fraction, 0.75, "Fraction of height used as a minimum gap for aligned blobs.") | ||||||||||
INT_VAR_H (tessedit_parallelize, 0, "Run in parallel where possible") | ||||||||||
BOOL_VAR_H (preserve_interword_spaces, false, "Preserve multiple interword spaces") | ||||||||||
STRING_VAR_H (page_separator, "\f", "Page separator(default is form feed control character)") | ||||||||||
INT_VAR_H (lstm_choice_mode, 0, "Allows to include alternative symbols choices in the hOCR output. " "Valid input values are 0, 1 and 2. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " "With 2 the alternative symbol choices are accumulated per character.") | ||||||||||
FILE * | init_recog_training (const STRING &fname) | |||||||||
void | recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) | |||||||||
void | ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file) | |||||||||
eval_word_spacing() | ||||||||||
The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed the evaluation is deemed perfect. Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred. The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space. Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined. The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2. | ||||||||||
bool | digit_or_numeric_punct (WERD_RES *word, int char_position) | |||||||||
int16_t | eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
fix_sp_fp_word() | ||||||||||
Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words. | ||||||||||
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) | |||||||||
int16_t | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) | |||||||||
float | blob_noise_score (TBLOB *blob) | |||||||||
void | break_noisiest_blob_word (WERD_RES_LIST &words) | |||||||||
fix_fuzzy_spaces() | ||||||||||
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
| ||||||||||
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) | |||||||||
process_selected_words() | ||||||||||
Walk the current block list applying the specified word processor function to each word that overlaps the selection_box. | ||||||||||
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) | |||||||||
tess_add_doc_word | ||||||||||
Add the given word to the document dictionary | ||||||||||
void | tess_add_doc_word (WERD_CHOICE *word_choice) | |||||||||
tess_acceptable_word | ||||||||||
| ||||||||||
bool | tess_acceptable_word (WERD_RES *word) | |||||||||
Public Member Functions inherited from tesseract::Wordrec | ||||||||||
BOOL_VAR_H (merge_fragments_in_matrix, TRUE, "Merge the fragments in the ratings matrix and delete them " "after merging") | ||||||||||
BOOL_VAR_H (wordrec_no_block, FALSE, "Don't output block information") | ||||||||||
BOOL_VAR_H (wordrec_enable_assoc, TRUE, "Associator Enable") | ||||||||||
BOOL_VAR_H (force_word_assoc, FALSE, "force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary.") | ||||||||||
double_VAR_H (wordrec_worst_state, 1, "Worst segmentation state") | ||||||||||
BOOL_VAR_H (fragments_guide_chopper, FALSE, "Use information from fragments to guide chopping process") | ||||||||||
INT_VAR_H (repair_unchopped_blobs, 1, "Fix blobs that aren't chopped") | ||||||||||
double_VAR_H (tessedit_certainty_threshold, -2.25, "Good blob limit") | ||||||||||
INT_VAR_H (chop_debug, 0, "Chop debug") | ||||||||||
BOOL_VAR_H (chop_enable, 1, "Chop enable") | ||||||||||
BOOL_VAR_H (chop_vertical_creep, 0, "Vertical creep") | ||||||||||
INT_VAR_H (chop_split_length, 10000, "Split Length") | ||||||||||
INT_VAR_H (chop_same_distance, 2, "Same distance") | ||||||||||
INT_VAR_H (chop_min_outline_points, 6, "Min Number of Points on Outline") | ||||||||||
INT_VAR_H (chop_seam_pile_size, 150, "Max number of seams in seam_pile") | ||||||||||
BOOL_VAR_H (chop_new_seam_pile, 1, "Use new seam_pile") | ||||||||||
INT_VAR_H (chop_inside_angle, -50, "Min Inside Angle Bend") | ||||||||||
INT_VAR_H (chop_min_outline_area, 2000, "Min Outline Area") | ||||||||||
double_VAR_H (chop_split_dist_knob, 0.5, "Split length adjustment") | ||||||||||
double_VAR_H (chop_overlap_knob, 0.9, "Split overlap adjustment") | ||||||||||
double_VAR_H (chop_center_knob, 0.15, "Split center adjustment") | ||||||||||
INT_VAR_H (chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs " "above which we don't care that a chop is not near the center.") | ||||||||||
double_VAR_H (chop_sharpness_knob, 0.06, "Split sharpness adjustment") | ||||||||||
double_VAR_H (chop_width_change_knob, 5.0, "Width change adjustment") | ||||||||||
double_VAR_H (chop_ok_split, 100.0, "OK split limit") | ||||||||||
double_VAR_H (chop_good_split, 50.0, "Good split limit") | ||||||||||
INT_VAR_H (chop_x_y_weight, 3, "X / Y length weight") | ||||||||||
INT_VAR_H (segment_adjust_debug, 0, "Segmentation adjustment debug") | ||||||||||
BOOL_VAR_H (assume_fixed_pitch_char_segment, FALSE, "include fixed-pitch heuristics in char segmentation") | ||||||||||
INT_VAR_H (wordrec_debug_level, 0, "Debug level for wordrec") | ||||||||||
INT_VAR_H (wordrec_max_join_chunks, 4, "Max number of broken pieces to associate") | ||||||||||
BOOL_VAR_H (wordrec_skip_no_truth_words, false, "Only run OCR for words that had truth recorded in BlamerBundle") | ||||||||||
BOOL_VAR_H (wordrec_debug_blamer, false, "Print blamer debug messages") | ||||||||||
BOOL_VAR_H (wordrec_run_blamer, false, "Try to set the blame for errors") | ||||||||||
INT_VAR_H (segsearch_debug_level, 0, "SegSearch debug level") | ||||||||||
INT_VAR_H (segsearch_max_pain_points, 2000, "Maximum number of pain points stored in the queue") | ||||||||||
INT_VAR_H (segsearch_max_futile_classifications, 10, "Maximum number of pain point classifications per word.") | ||||||||||
double_VAR_H (segsearch_max_char_wh_ratio, 2.0, "Maximum character width-to-height ratio") | ||||||||||
BOOL_VAR_H (save_alt_choices, true, "Save alternative paths found during chopping " "and segmentation search") | ||||||||||
Wordrec () | ||||||||||
virtual | ~Wordrec ()=default | |||||||||
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) | |||||||||
void | FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | |||||||||
void | CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | |||||||||
void | SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | |||||||||
void | InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | |||||||||
void | DoSegSearch (WERD_RES *word_res) | |||||||||
void | add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams) | |||||||||
void | choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) | |||||||||
void | combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue) | |||||||||
SEAM * | pick_good_seam (TBLOB *blob) | |||||||||
void | try_point_pairs (EDGEPT *points[50], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) | |||||||||
void | try_vertical_splits (EDGEPT *points[50], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) | |||||||||
PRIORITY | grade_split_length (SPLIT *split) | |||||||||
PRIORITY | grade_sharpness (SPLIT *split) | |||||||||
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) | |||||||||
virtual BLOB_CHOICE_LIST * | classify_piece (const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle) | |||||||||
void | merge_fragments (MATRIX *ratings, int16_t num_blobs) | |||||||||
void | get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) | |||||||||
void | merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) | |||||||||
void | fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) | |||||||||
void | program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict) | |||||||||
void | cc_recog (WERD_RES *word) | |||||||||
void | program_editdown (int32_t elasped_time) | |||||||||
void | set_pass1 () | |||||||||
void | set_pass2 () | |||||||||
int | end_recog () | |||||||||
BLOB_CHOICE_LIST * | call_matcher (TBLOB *blob) | |||||||||
int | dict_word (const WERD_CHOICE &word) | |||||||||
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle) | |||||||||
PRIORITY | point_priority (EDGEPT *point) | |||||||||
void | add_point_to_list (PointHeap *point_heap, EDGEPT *point) | |||||||||
bool | is_inside_angle (EDGEPT *pt) | |||||||||
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) | |||||||||
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) | |||||||||
void | prioritize_points (TESSLINE *outline, PointHeap *points) | |||||||||
void | new_min_point (EDGEPT *local_min, PointHeap *points) | |||||||||
void | new_max_point (EDGEPT *local_max, PointHeap *points) | |||||||||
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) | |||||||||
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams) | |||||||||
SEAM * | chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams) | |||||||||
SEAM * | chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number) | |||||||||
SEAM * | improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number) | |||||||||
SEAM * | chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number) | |||||||||
void | chop_word_main (WERD_RES *word) | |||||||||
void | improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending) | |||||||||
int | select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment) | |||||||||
int | select_blob_to_split_from_fixpt (DANGERR *fixpt) | |||||||||
Public Member Functions inherited from tesseract::Classify | ||||||||||
Classify () | ||||||||||
virtual | ~Classify () | |||||||||
const ShapeTable * | shape_table () const | |||||||||
void | SetStaticClassifier (ShapeClassifier *static_classifier) | |||||||||
void | AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices) | |||||||||
bool | LargeSpeckle (const TBLOB &blob) | |||||||||
ADAPT_TEMPLATES | NewAdaptedTemplates (bool InitFromUnicharset) | |||||||||
int | GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId) | |||||||||
int | PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results) | |||||||||
void | ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs) | |||||||||
void | PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) | |||||||||
void | WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) | |||||||||
ADAPT_TEMPLATES | ReadAdaptedTemplates (TFile *File) | |||||||||
float | ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) | |||||||||
void | FreeNormProtos () | |||||||||
NORM_PROTOS * | ReadNormProtos (TFile *fp) | |||||||||
void | ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class) | |||||||||
INT_TEMPLATES | CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset) | |||||||||
void | LearnWord (const char *fontname, WERD_RES *word) | |||||||||
void | LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word) | |||||||||
void | InitAdaptiveClassifier (TessdataManager *mgr) | |||||||||
void | InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates) | |||||||||
void | AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) | |||||||||
void | MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results) | |||||||||
void | ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results) | |||||||||
double | ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors) | |||||||||
void | ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) | |||||||||
void | AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results) | |||||||||
int | GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures) | |||||||||
void | DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results) | |||||||||
PROTO_ID | MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask) | |||||||||
int | MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures) | |||||||||
void | MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob) | |||||||||
void | PrintAdaptiveMatchResults (const ADAPT_RESULTS &results) | |||||||||
void | RemoveExtraPuncs (ADAPT_RESULTS *Results) | |||||||||
void | RemoveBadMatches (ADAPT_RESULTS *Results) | |||||||||
void | SetAdaptiveThreshold (float Threshold) | |||||||||
void | ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features) | |||||||||
STRING | ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const | |||||||||
int | ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const | |||||||||
int | ShapeIDToClassID (int shape_id) const | |||||||||
UNICHAR_ID * | BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) | |||||||||
int | CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results) | |||||||||
int | CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results) | |||||||||
UNICHAR_ID * | GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass) | |||||||||
void | DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results) | |||||||||
void | AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates) | |||||||||
void | DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class) | |||||||||
bool | AdaptableWord (WERD_RES *word) | |||||||||
void | EndAdaptiveClassifier () | |||||||||
void | SettupPass1 () | |||||||||
void | SettupPass2 () | |||||||||
void | AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices) | |||||||||
void | ClassifyAsNoise (ADAPT_RESULTS *Results) | |||||||||
void | ResetAdaptiveClassifierInternal () | |||||||||
void | SwitchAdaptiveClassifier () | |||||||||
void | StartBackupAdaptiveClassifier () | |||||||||
int | GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array) | |||||||||
void | ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array) | |||||||||
bool | TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config) | |||||||||
void | UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob) | |||||||||
bool | AdaptiveClassifierIsFull () const | |||||||||
bool | AdaptiveClassifierIsEmpty () const | |||||||||
bool | LooksLikeGarbage (TBLOB *blob) | |||||||||
void | RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox) | |||||||||
void | ClearCharNormArray (uint8_t *char_norm_array) | |||||||||
void | ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array) | |||||||||
void | ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures) | |||||||||
INT_TEMPLATES | ReadIntTemplates (TFile *fp) | |||||||||
void | WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset) | |||||||||
CLASS_ID | GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id) | |||||||||
void | ShowMatchDisplay () | |||||||||
UnicityTable< FontInfo > & | get_fontinfo_table () | |||||||||
const UnicityTable< FontInfo > & | get_fontinfo_table () const | |||||||||
UnicityTable< FontSet > & | get_fontset_table () | |||||||||
void | NormalizeOutlines (LIST Outlines, float *XScale, float *YScale) | |||||||||
FEATURE_SET | ExtractOutlineFeatures (TBLOB *Blob) | |||||||||
FEATURE_SET | ExtractPicoFeatures (TBLOB *Blob) | |||||||||
FEATURE_SET | ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) | |||||||||
FEATURE_SET | ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) | |||||||||
void | LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) | |||||||||
bool | WriteTRFile (const STRING &filename) | |||||||||
BOOL_VAR_H (allow_blob_division, true, "Use divisible blobs chopping") | ||||||||||
BOOL_VAR_H (prioritize_division, FALSE, "Prioritize blob division over chopping") | ||||||||||
INT_VAR_H (tessedit_single_match, FALSE, "Top choice only from CP") | ||||||||||
BOOL_VAR_H (classify_enable_learning, true, "Enable adaptive classifier") | ||||||||||
INT_VAR_H (classify_debug_level, 0, "Classify debug level") | ||||||||||
INT_VAR_H (classify_norm_method, character, "Normalization Method ...") | ||||||||||
double_VAR_H (classify_char_norm_range, 0.2, "Character Normalization Range ...") | ||||||||||
double_VAR_H (classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...") | ||||||||||
double_VAR_H (classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...") | ||||||||||
double_VAR_H (classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...") | ||||||||||
double_VAR_H (classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...") | ||||||||||
double_VAR_H (classify_max_rating_ratio, 1.5, "Veto ratio between classifier ratings") | ||||||||||
double_VAR_H (classify_max_certainty_margin, 5.5, "Veto difference between classifier certainties") | ||||||||||
BOOL_VAR_H (tess_cn_matching, 0, "Character Normalized Matching") | ||||||||||
BOOL_VAR_H (tess_bn_matching, 0, "Baseline Normalized Matching") | ||||||||||
BOOL_VAR_H (classify_enable_adaptive_matcher, 1, "Enable adaptive classifier") | ||||||||||
BOOL_VAR_H (classify_use_pre_adapted_templates, 0, "Use pre-adapted classifier templates") | ||||||||||
BOOL_VAR_H (classify_save_adapted_templates, 0, "Save adapted templates to a file") | ||||||||||
BOOL_VAR_H (classify_enable_adaptive_debugger, 0, "Enable match debugger") | ||||||||||
BOOL_VAR_H (classify_nonlinear_norm, 0, "Non-linear stroke-density normalization") | ||||||||||
INT_VAR_H (matcher_debug_level, 0, "Matcher Debug Level") | ||||||||||
INT_VAR_H (matcher_debug_flags, 0, "Matcher Debug Flags") | ||||||||||
INT_VAR_H (classify_learning_debug_level, 0, "Learning Debug Level: ") | ||||||||||
double_VAR_H (matcher_good_threshold, 0.125, "Good Match (0-1)") | ||||||||||
double_VAR_H (matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)") | ||||||||||
double_VAR_H (matcher_perfect_threshold, 0.02, "Perfect Match (0-1)") | ||||||||||
double_VAR_H (matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)") | ||||||||||
double_VAR_H (matcher_rating_margin, 0.1, "New template margin (0-1)") | ||||||||||
double_VAR_H (matcher_avg_noise_size, 12.0, "Avg. noise blob length: ") | ||||||||||
INT_VAR_H (matcher_permanent_classes_min, 1, "Min # of permanent classes") | ||||||||||
INT_VAR_H (matcher_min_examples_for_prototyping, 3, "Reliable Config Threshold") | ||||||||||
INT_VAR_H (matcher_sufficient_examples_for_prototyping, 5, "Enable adaption even if the ambiguities have not been seen") | ||||||||||
double_VAR_H (matcher_clustering_max_angle_delta, 0.015, "Maximum angle delta for prototype clustering") | ||||||||||
double_VAR_H (classify_misfit_junk_penalty, 0.0, "Penalty to apply when a non-alnum is vertically out of " "its expected textline position") | ||||||||||
double_VAR_H (rating_scale, 1.5, "Rating scaling factor") | ||||||||||
double_VAR_H (certainty_scale, 20.0, "Certainty scaling factor") | ||||||||||
double_VAR_H (tessedit_class_miss_scale, 0.00390625, "Scale factor for features not used") | ||||||||||
double_VAR_H (classify_adapted_pruning_factor, 2.5, "Prune poor adapted results this much worse than best result") | ||||||||||
double_VAR_H (classify_adapted_pruning_threshold, -1.0, "Threshold at which classify_adapted_pruning_factor starts") | ||||||||||
INT_VAR_H (classify_adapt_proto_threshold, 230, "Threshold for good protos during adaptive 0-255") | ||||||||||
INT_VAR_H (classify_adapt_feature_threshold, 230, "Threshold for good features during adaptive 0-255") | ||||||||||
BOOL_VAR_H (disable_character_fragments, TRUE, "Do not include character fragments in the" " results of the classifier") | ||||||||||
double_VAR_H (classify_character_fragments_garbage_certainty_threshold, -3.0, "Exclude fragments that do not match any whole character" " with at least this certainty") | ||||||||||
BOOL_VAR_H (classify_debug_character_fragments, FALSE, "Bring up graphical debugging windows for fragments training") | ||||||||||
BOOL_VAR_H (matcher_debug_separate_windows, FALSE, "Use two different windows for debugging the matching: " "One for the protos and one for the features.") | ||||||||||
STRING_VAR_H (classify_learn_debug_str, "", "Class str to debug learning") | ||||||||||
INT_VAR_H (classify_class_pruner_threshold, 229, "Class Pruner Threshold 0-255") | ||||||||||
INT_VAR_H (classify_class_pruner_multiplier, 15, "Class Pruner Multiplier 0-255: ") | ||||||||||
INT_VAR_H (classify_cp_cutoff_strength, 7, "Class Pruner CutoffStrength: ") | ||||||||||
INT_VAR_H (classify_integer_matcher_multiplier, 10, "Integer Matcher Multiplier 0-255: ") | ||||||||||
INT_VAR_H (il1_adaption_test, 0, "Don't adapt to i/I at beginning of word") | ||||||||||
BOOL_VAR_H (classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].") | ||||||||||
double_VAR_H (speckle_large_max_size, 0.30, "Max large speckle size") | ||||||||||
double_VAR_H (speckle_rating_penalty, 10.0, "Penalty to add to worst rating for noise") | ||||||||||
Public Member Functions inherited from tesseract::CCStruct | ||||||||||
CCStruct ()=default | ||||||||||
virtual | ~CCStruct () | |||||||||
Public Member Functions inherited from tesseract::CUtil | ||||||||||
CUtil ()=default | ||||||||||
virtual | ~CUtil () | |||||||||
void | read_variables (const char *filename, bool global_only) | |||||||||
Public Member Functions inherited from tesseract::CCUtil | ||||||||||
CCUtil () | ||||||||||
virtual | ~CCUtil () | |||||||||
void | main_setup (const char *argv0, const char *basename) | |||||||||
CCUtil::main_setup - set location of tessdata and name of image. More... | ||||||||||
ParamsVectors * | params () | |||||||||
Private Attributes | |
const char * | backup_config_file_ |
STRING | word_config_ |
Pix * | pix_binary_ |
Pix * | pix_grey_ |
Pix * | pix_original_ |
Pix * | pix_thresholds_ |
DebugPixa | pixa_debug_ |
int | source_resolution_ |
ShiroRekhaSplitter | splitter_ |
Textord | textord_ |
bool | right_to_left_ |
Pix * | scaled_color_ |
int | scaled_factor_ |
FCOORD | deskew_ |
FCOORD | reskew_ |
TesseractStats | stats_ |
GenericVector< Tesseract * > | sub_langs_ |
Tesseract * | most_recently_used_ |
int | font_table_size_ |
EquationDetect * | equ_detect_ |
LSTMRecognizer * | lstm_recognizer_ |
int | train_line_page_num_ |
tesseract::Tesseract::Tesseract | ( | ) |
tesseract::Tesseract::~Tesseract | ( | ) |
bool tesseract::Tesseract::acceptable_number_string | ( | const char * | s, |
const char * | lengths | ||
) |
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string | ( | const UNICHARSET & | char_set, |
const char * | s, | ||
const char * | lengths | ||
) |
int16_t tesseract::Tesseract::alpha_count | ( | const char * | word, |
const char * | word_lengths | ||
) |
void tesseract::Tesseract::ambigs_classify_and_output | ( | const char * | label, |
PAGE_RES_IT * | pr_it, | ||
FILE * | output_file | ||
) |
|
inline |
|
inline |
PAGE_RES * tesseract::Tesseract::ApplyBoxes | ( | const STRING & | fname, |
bool | find_segmentation, | ||
BLOCK_LIST * | block_list | ||
) |
Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.
void tesseract::Tesseract::AssignDiacriticsToNewBlobs | ( | const GenericVector< C_OUTLINE *> & | outlines, |
int | pass, | ||
WERD * | real_word, | ||
PAGE_RES_IT * | pr_it, | ||
GenericVector< bool > * | word_wanted, | ||
GenericVector< C_BLOB *> * | target_blobs | ||
) |
void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs | ( | const GenericVector< C_OUTLINE *> & | outlines, |
int | pass, | ||
WERD * | real_word, | ||
PAGE_RES_IT * | pr_it, | ||
GenericVector< bool > * | word_wanted, | ||
GenericVector< bool > * | overlapped_any_blob, | ||
GenericVector< C_BLOB *> * | target_blobs | ||
) |
int tesseract::Tesseract::AutoPageSeg | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks, | ||
BLOBNBOX_LIST * | diacritic_blobs, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.
If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).
bool tesseract::Tesseract::BelievableSuperscript | ( | bool | debug, |
const WERD_RES & | word, | ||
float | certainty_threshold, | ||
int * | left_ok, | ||
int * | right_ok | ||
) | const |
Return whether this is believable superscript or subscript text.
We insist that:
[in] | debug | If true, spew debug output |
[in] | word | The word whose best_choice we're evaluating |
[in] | certainty_threshold | If any of the characters have less certainty than this, reject. |
[out] | left_ok | How many left-side characters were ok? |
[out] | right_ok | How many right-side characters were ok? |
|
inline |
void tesseract::Tesseract::bigram_correction_pass | ( | PAGE_RES * | page_res | ) |
void tesseract::Tesseract::blamer_pass | ( | PAGE_RES * | page_res | ) |
float tesseract::Tesseract::blob_noise_score | ( | TBLOB * | blob | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_resegment_from_boxes | , |
false | , | ||
"Take segmentation and labeling from box file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_resegment_from_line_boxes | , |
false | , | ||
"Conversion of word/line box file to char box file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_train_from_boxes | , |
false | , | ||
"Generate training data from boxed chars" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_make_boxes_from_boxes | , |
false | , | ||
"Generate more boxes from boxed chars" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_train_line_recognizer | , |
false | , | ||
"Break input into lines and remap boxes if present" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dump_pageseg_images | , |
false | , | ||
"Dump intermediate images made during page segmentation" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_ambigs_training | , |
false | , | ||
"Perform training for ambiguities" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_adaption_debug | , |
false | , | ||
"Generate and print debug information for adaption" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | applybox_learn_chars_and_char_frags_mode | , |
false | , | ||
"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters." | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | applybox_learn_ngrams_mode | , |
false | , | ||
"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally." | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_display_outwords | , |
false | , | ||
"Draw output words" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dump_choices | , |
false | , | ||
"Dump char choices" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_timing_debug | , |
false | , | ||
"Print timing stats" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_fix_fuzzy_spaces | , |
true | , | ||
"Try to improve fuzzy spaces" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_unrej_any_wd | , |
false | , | ||
"Don't bother with word plausibility" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_fix_hyphens | , |
true | , | ||
"Crunch double hyphens?" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_redo_xheight | , |
true | , | ||
"Check/Correct x-height" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_enable_doc_dict | , |
true | , | ||
"Add words to the document dictionary" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_fonts | , |
false | , | ||
"Output font info per char" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_block_rejection | , |
false | , | ||
"Block and Row stats" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_enable_bigram_correction | , |
true | , | ||
"Enable correction based on the word bigram dictionary." | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_enable_dict_correction | , |
false | , | ||
"Enable single word correction based on the dictionary." | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | enable_noise_removal | , |
true | , | ||
"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | debug_acceptable_wds | , |
false | , | ||
"Dump word pass/fail chk" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_minimal_rej_pass1 | , |
false | , | ||
"Do minimal rejection on pass 1 output" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_test_adaption | , |
false | , | ||
"Test adaption criteria" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_matcher_log | , |
false | , | ||
"Log matcher activity" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | test_pt | , |
false | , | ||
"Test for point" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | paragraph_text_based | , |
true | , | ||
"Run paragraph detection on the post-text-recognition " "(more accurate)" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | lstm_use_matrix | , |
1 | , | ||
"Use ratings matrix/beam search with lstm" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | docqual_excuse_outline_errs | , |
false | , | ||
"Allow outline errs in unrejection?" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_good_quality_unrej | , |
true | , | ||
"Reduce rejection on good docs" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_use_reject_spaces | , |
true | , | ||
"Reject spaces?" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_preserve_blk_rej_perfect_wds | , |
true | , | ||
"Only rej partially rejected words in block rejection" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_preserve_row_rej_perfect_wds | , |
true | , | ||
"Only rej partially rejected words in row rejection" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dont_blkrej_good_wds | , |
false | , | ||
"Use word segmentation quality metric" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dont_rowrej_good_wds | , |
false | , | ||
"Use word segmentation quality metric" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_row_rej_good_docs | , |
true | , | ||
"Apply row rejection to good docs" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_reject_bad_qual_wds | , |
true | , | ||
"Reject all bad quality wds" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_doc_rejection | , |
false | , | ||
"Page stats" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_quality_metrics | , |
false | , | ||
"Output data to debug file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | bland_unrej | , |
false | , | ||
"unrej potential with no checks" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | unlv_tilde_crunching | , |
false | , | ||
"Mark v.bad words for tilde crunch" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | hocr_font_info | , |
false | , | ||
"Add font info to hocr output" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_early_merge_tess_fails | , |
true | , | ||
"Before word crunch?" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_early_convert_bad_unlv_chs | , |
false | , | ||
"Take out ~^ early?" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_terrible_garbage | , |
true | , | ||
"As it says" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_pot_garbage | , |
true | , | ||
"POTENTIAL crunch garbage" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_leave_ok_strings | , |
true | , | ||
"Don't touch sensible strings" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_accept_ok | , |
true | , | ||
"Use acceptability in okstring" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_leave_accept_strings | , |
false | , | ||
"Don't pot crunch sensible strings" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_include_numerals | , |
false | , | ||
"Fiddle alpha figures" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_prefer_joined_punct | , |
false | , | ||
"Reward punctuation joins" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_block_separators | , |
false | , | ||
"Write block separators in output" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_rep_codes | , |
false | , | ||
"Write repetition char code" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_unlv | , |
false | , | ||
"Write .unlv output file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_txt | , |
false | , | ||
"Write .txt output file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_hocr | , |
false | , | ||
"Write .html hOCR output file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_tsv | , |
false | , | ||
"Write .tsv output file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_pdf | , |
false | , | ||
"Write .pdf output file" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | textonly_pdf | , |
false | , | ||
"Create PDF with only one invisible text layer" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | suspect_constrain_1Il | , |
false | , | ||
"UNLV keep 1Il chars rejected" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_minimal_rejection | , |
false | , | ||
"Only reject tess failures" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_zero_rejection | , |
false | , | ||
"Don't reject ANYTHING" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_word_for_word | , |
false | , | ||
"Make output have exactly one word per WERD" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_zero_kelvin_rejection | , |
false | , | ||
"Don't reject ANYTHING AT ALL" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_consistent_reps | , |
true | , | ||
"Force all rep chars the same" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_rejection_debug | , |
false | , | ||
"Adaption debug" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_flip_0O | , |
true | , | ||
"Contextual 0O O0 flips" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_trust_doc_dawg | , |
false | , | ||
"Use DOC dawg in 11l conf. detector" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_1Il_use_dict_word | , |
false | , | ||
"Use dictword test" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_1Il_trust_permuter_type | , |
true | , | ||
"Don't double check" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_tess_accepted | , |
true | , | ||
"Individual rejection control" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_tess_blanks | , |
true | , | ||
"Individual rejection control" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_good_perm | , |
true | , | ||
"Individual rejection control" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_sensible_wd | , |
false | , | ||
"Extend permuter check" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_alphas_in_number_perm | , |
false | , | ||
"Extend permuter check" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_boxfile | , |
false | , | ||
"Output text with boxes" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_images | , |
false | , | ||
"Capture the image from the IPE" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | interactive_display_mode | , |
false | , | ||
"Run interactively?" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_override_permuter | , |
true | , | ||
"According to dict_word" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_use_primary_params_model | , |
false | , | ||
"In multilingual mode use params model of the primary language" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_tabfind_show_vlines | , |
false | , | ||
"Debug line finding" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_use_cjk_fp_model | , |
FALSE | , | ||
"Use CJK fixed pitch model" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | poly_allow_detailed_fx | , |
false | , | ||
"Allow feature extractors to see the original outline" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_init_config_only | , |
false | , | ||
"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis." | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_equation_detect | , |
false | , | ||
"Turn on equation detector" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_tabfind_vertical_text | , |
true | , | ||
"Enable vertical detection" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_tabfind_force_vertical_text | , |
false | , | ||
"Force using vertical text page mode" | |||
) |
tesseract::Tesseract::BOOL_VAR_H | ( | preserve_interword_spaces | , |
false | , | ||
"Preserve multiple interword spaces" | |||
) |
void tesseract::Tesseract::break_noisiest_blob_word | ( | WERD_RES_LIST & | words | ) |
break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.
SVMenuNode * tesseract::Tesseract::build_menu_new | ( | ) |
bool tesseract::Tesseract::check_debug_pt | ( | WERD_RES * | word, |
int | location | ||
) |
void tesseract::Tesseract::classify_word_and_language | ( | int | pass_n, |
PAGE_RES_IT * | pr_it, | ||
WordData * | word_data | ||
) |
void tesseract::Tesseract::classify_word_pass1 | ( | const WordData & | word_data, |
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | out_words | ||
) |
classify_word_pass1
Baseline normalize the word and pass it to Tess.
void tesseract::Tesseract::classify_word_pass2 | ( | const WordData & | word_data, |
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | out_words | ||
) |
classify_word_pass2
Control what to do with the word in pass 2
float tesseract::Tesseract::ClassifyBlobAsWord | ( | int | pass_n, |
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
STRING * | best_str, | ||
float * | c2 | ||
) |
float tesseract::Tesseract::ClassifyBlobPlusOutlines | ( | const GenericVector< bool > & | ok_outlines, |
const GenericVector< C_OUTLINE *> & | outlines, | ||
int | pass_n, | ||
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
STRING * | best_str | ||
) |
void tesseract::Tesseract::Clear | ( | ) |
float tesseract::Tesseract::ComputeCompatibleXheight | ( | WERD_RES * | word_res, |
float * | baseline_shift | ||
) |
void tesseract::Tesseract::convert_bad_unlv_chs | ( | WERD_RES * | word_res | ) |
bool tesseract::Tesseract::ConvertStringToUnichars | ( | const char * | utf8, |
GenericVector< UNICHAR_ID > * | class_ids | ||
) |
Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
void tesseract::Tesseract::CorrectClassifyWords | ( | PAGE_RES * | page_res | ) |
Creates a fake best_choice entry in each WERD_RES with the correct text.
int16_t tesseract::Tesseract::count_alphanums | ( | const WERD_CHOICE & | word | ) |
int16_t tesseract::Tesseract::count_alphanums | ( | WERD_RES * | word | ) |
int16_t tesseract::Tesseract::count_alphas | ( | const WERD_CHOICE & | word | ) |
int16_t tesseract::Tesseract::count_outline_errs | ( | char | c, |
int16_t | outline_count | ||
) |
int tesseract::Tesseract::CountMisfitTops | ( | WERD_RES * | word_res | ) |
debug_word
Process the whole image, but load word_config_ for the selected word(s).
void tesseract::Tesseract::dictionary_correction_pass | ( | PAGE_RES * | page_res | ) |
bool tesseract::Tesseract::digit_or_numeric_punct | ( | WERD_RES * | word, |
int | char_position | ||
) |
void tesseract::Tesseract::do_re_display | ( | bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) | word_painter | ) |
Redisplay page
void tesseract::Tesseract::doc_and_block_rejection | ( | PAGE_RES_IT & | page_res_it, |
bool | good_quality_doc | ||
) |
void tesseract::Tesseract::dont_allow_1Il | ( | WERD_RES * | word | ) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_basechar | , |
-8.0 | , | ||
"Hingepoint for base char certainty" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_disjoint | , |
-2.5 | , | ||
"Hingepoint for disjoint certainty" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_punc | , |
-2.5 | , | ||
"Threshold for new punc char certainty" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_factor | , |
0.375 | , | ||
"Scaling on certainty diff from Hingepoint" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | quality_rej_pc | , |
0.08 | , | ||
"good_quality_doc lte rejection limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | quality_blob_pc | , |
0.0 | , | ||
"good_quality_doc gte good blobs limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | quality_outline_pc | , |
1.0 | , | ||
"good_quality_doc lte outline error limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | quality_char_pc | , |
0.95 | , | ||
"good_quality_doc gte good char limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | test_pt_x | , |
99999.99 | , | ||
"xcoord" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | test_pt_y | , |
99999.99 | , | ||
"ycoord" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_reject_doc_percent | , |
65.00 | , | ||
"%rej allowed before rej whole doc" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_reject_block_percent | , |
45.00 | , | ||
"%rej allowed before rej whole block" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_reject_row_percent | , |
40.00 | , | ||
"%rej allowed before rej whole row" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_whole_wd_rej_row_percent | , |
70.00 | , | ||
"Number of row rejects in whole word rejects" "which prevents whole row rejection" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_good_doc_still_rowrej_wd | , |
1.1 | , | ||
"rej good doc wd if more than this fraction rejected" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | quality_rowrej_pc | , |
1.1 | , | ||
"good_quality_doc gte good char limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_terrible_rating | , |
80.0 | , | ||
"crunch rating lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_poor_garbage_cert | , |
-9.0 | , | ||
"crunch garbage cert lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_poor_garbage_rate | , |
60 | , | ||
"crunch garbage rating lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_pot_poor_rate | , |
40 | , | ||
"POTENTIAL crunch rating lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_pot_poor_cert | , |
-8.0 | , | ||
"POTENTIAL crunch cert lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_rating | , |
60 | , | ||
"POTENTIAL crunch rating lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_cert | , |
-10.0 | , | ||
"POTENTIAL crunch cert lt this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_min_ht | , |
0.7 | , | ||
"Del if word ht lt xht x this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_max_ht | , |
3.0 | , | ||
"Del if word ht gt xht x this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_min_width | , |
3.0 | , | ||
"Del if word width lt xht x this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_high_word | , |
1.5 | , | ||
"Del if word gt xht x this above bl" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_low_word | , |
0.5 | , | ||
"Del if word gt xht x this below bl" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | crunch_small_outlines_size | , |
0.6 | , | ||
"Small if lt xht x this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | fixsp_small_outlines_size | , |
0.28 | , | ||
"Small if lt xht x this" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | superscript_worse_certainty | , |
2.0 | , | ||
"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | superscript_bettered_certainty | , |
0.97 | , | ||
"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | superscript_scaledown_ratio | , |
0.4 | , | ||
"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size." | |||
) |
tesseract::Tesseract::double_VAR_H | ( | subscript_max_y_top | , |
0.5 | , | ||
"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript." | |||
) |
tesseract::Tesseract::double_VAR_H | ( | superscript_min_y_bottom | , |
0.3 | , | ||
"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript." | |||
) |
tesseract::Tesseract::double_VAR_H | ( | suspect_rating_per_ch | , |
999.9 | , | ||
"Don't touch bad rating limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | suspect_accept_rating | , |
-999.9 | , | ||
"Accept good rating limit" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_lower_flip_hyphen | , |
1.5 | , | ||
"Aspect ratio dot/hyphen test" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_upper_flip_hyphen | , |
1.8 | , | ||
"Aspect ratio dot/hyphen test" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | rej_whole_of_mostly_reject_word_fract | , |
0.85 | , | ||
"if >this fract" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | min_orientation_margin | , |
7.0 | , | ||
"Min acceptable orientation margin" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | textord_tabfind_vertical_text_ratio | , |
0.5 | , | ||
"Fraction of textlines deemed vertical to use vertical page " "mode" | |||
) |
tesseract::Tesseract::double_VAR_H | ( | textord_tabfind_aligned_gap_fraction | , |
0.75 | , | ||
"Fraction of height used as a minimum gap for aligned blobs." | |||
) |
void tesseract::Tesseract::dump_words | ( | WERD_RES_LIST & | perm, |
int16_t | score, | ||
int16_t | mode, | ||
bool | improved | ||
) |
void tesseract::Tesseract::end_tesseract | ( | ) |
int16_t tesseract::Tesseract::eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
int16_t tesseract::Tesseract::failure_count | ( | WERD_RES * | word | ) |
bool tesseract::Tesseract::FindSegmentation | ( | const GenericVector< UNICHAR_ID > & | target_text, |
WERD_RES * | word_res | ||
) |
Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.
int16_t tesseract::Tesseract::first_alphanum_index | ( | const char * | word, |
const char * | word_lengths | ||
) |
int16_t tesseract::Tesseract::first_alphanum_offset | ( | const char * | word, |
const char * | word_lengths | ||
) |
void tesseract::Tesseract::fix_fuzzy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
void tesseract::Tesseract::fix_fuzzy_spaces | ( | ETEXT_DESC * | monitor, |
int32_t | word_count, | ||
PAGE_RES * | page_res | ||
) |
void tesseract::Tesseract::fix_noisy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
void tesseract::Tesseract::fix_rep_char | ( | PAGE_RES_IT * | page_res_it | ) |
fix_rep_char() The word is a repeated char. (Leader.) Find the repeated char character. Create the appropriate single-word or multi-word sequence according to the size of spaces in between blobs, and correct the classifications where some of the characters disagree with the majority.
bool tesseract::Tesseract::fixspace_thinks_word_done | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::flip_0O | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::flip_hyphens | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::font_recognition_pass | ( | PAGE_RES * | page_res | ) |
font_recognition_pass
Smooth the fonts for the document.
int16_t tesseract::Tesseract::fp_eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
GARBAGE_LEVEL tesseract::Tesseract::garbage_word | ( | WERD_RES * | word, |
BOOL8 | ok_dict_word | ||
) |
UNICHAR_ID tesseract::Tesseract::get_rep_char | ( | WERD_RES * | word | ) |
|
inline |
|
overridevirtual |
Reimplemented from tesseract::Classify.
ImageData * tesseract::Tesseract::GetLineData | ( | const TBOX & | line_box, |
const GenericVector< TBOX > & | boxes, | ||
const GenericVector< STRING > & | texts, | ||
int | start_box, | ||
int | end_box, | ||
const BLOCK & | block | ||
) |
ImageData * tesseract::Tesseract::GetRectImage | ( | const TBOX & | box, |
const BLOCK & | block, | ||
int | padding, | ||
TBOX * | revised_box | ||
) | const |
void tesseract::Tesseract::GetSubAndSuperscriptCandidates | ( | const WERD_RES * | word, |
int * | num_rebuilt_leading, | ||
ScriptPos * | leading_pos, | ||
float * | leading_certainty, | ||
int * | num_rebuilt_trailing, | ||
ScriptPos * | trailing_pos, | ||
float * | trailing_certainty, | ||
float * | avg_certainty, | ||
float * | unlikely_threshold | ||
) |
Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.
[in] | word | The word to examine. |
[out] | num_rebuilt_leading | the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified. |
[out] | leading_pos | "super" or "sub" (for debugging) |
[out] | leading_certainty | the worst certainty in the leading blobs. |
[out] | num_rebuilt_trailing | the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified. |
[out] | trailing_pos | "super" or "sub" (for debugging) |
[out] | trailing_certainty | the worst certainty in the trailing blobs. |
[out] | avg_certainty | the average certainty of "normal" blobs in the word. |
[out] | unlikely_threshold | the threshold (on certainty) we used to select "bad enough" outlier characters. |
|
inline |
|
inline |
FILE * tesseract::Tesseract::init_recog_training | ( | const STRING & | fname | ) |
int tesseract::Tesseract::init_tesseract | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params, | ||
TessdataManager * | mgr | ||
) |
|
inline |
int tesseract::Tesseract::init_tesseract_internal | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params, | ||
TessdataManager * | mgr | ||
) |
bool tesseract::Tesseract::init_tesseract_lang_data | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params, | ||
TessdataManager * | mgr | ||
) |
int tesseract::Tesseract::init_tesseract_lm | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
TessdataManager * | mgr | ||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_pageseg_mode | , |
PSM_SINGLE_BLOCK | , | ||
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_ocr_engine_mode | , |
tesseract::OEM_DEFAULT | , | ||
"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available." | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | pageseg_devanagari_split_strategy | , |
tesseract::ShiroRekhaSplitter::NO_SPLIT | , | ||
"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation." | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | ocr_devanagari_split_strategy | , |
tesseract::ShiroRekhaSplitter::NO_SPLIT | , | ||
"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr." | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | bidi_debug | , |
0 | , | ||
"Debug level for BiDi" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | applybox_debug | , |
1 | , | ||
"Debug level" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | applybox_page | , |
0 | , | ||
"Page number to apply boxes from" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_bigram_debug | , |
0 | , | ||
"Amount of debug output for bigram " "correction." | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | debug_noise_removal | , |
0 | , | ||
"Debug reassignment of small outlines" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | noise_maxperblob | , |
8 | , | ||
"Max diacritics to apply to a blob" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | noise_maxperword | , |
16 | , | ||
"Max diacritics to apply to a word" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | debug_x_ht_level | , |
0 | , | ||
"Reestimate debug" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | quality_min_initial_alphas_reqd | , |
2 | , | ||
"alphas in a good word" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_tess_adaption_mode | , |
0x27 | , | ||
"Adaptation decision algorithm for tess" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_test_adaption_mode | , |
3 | , | ||
"Adaptation decision algorithm for tess" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | multilang_debug_level | , |
0 | , | ||
"Print multilang debug info." | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | paragraph_debug_level | , |
0 | , | ||
"Print paragraph debug info." | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_preserve_min_wd_len | , |
2 | , | ||
"Only preserve wds longer than this" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_rating_max | , |
10 | , | ||
"For adj length in rating per ch" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_pot_indicators | , |
1 | , | ||
"How many potential indicators needed" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_leave_lc_strings | , |
4 | , | ||
"Don't crunch words with long lower case strings" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_leave_uc_strings | , |
4 | , | ||
"Don't crunch words with long upper case strings" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_long_repetitions | , |
3 | , | ||
"Crunch words with long repetitions" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_debug | , |
0 | , | ||
"As it says" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | fixsp_non_noise_limit | , |
1 | , | ||
"How many non-noise blbs either side?" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | fixsp_done_mode | , |
1 | , | ||
"What constitutes done for spacing" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | debug_fix_space_level | , |
0 | , | ||
"Contextual fixspace debug" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | x_ht_acceptance_tolerance | , |
8 | , | ||
"Max allowed deviation of blob top outside of font data" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | x_ht_min_change | , |
8 | , | ||
"Min change in xht before actually trying it" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | superscript_debug | , |
0 | , | ||
"Debug level for sub & superscript fixer" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | jpg_quality | , |
85 | , | ||
"Set JPEG quality level" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | user_defined_dpi | , |
0 | , | ||
"Specify DPI for input image" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | min_characters_to_try | , |
50 | , | ||
"Specify minimum characters to try during OSD" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | suspect_level | , |
99 | , | ||
"Suspect marker level" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | suspect_space_level | , |
100 | , | ||
"Min suspect level for rejecting spaces" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | suspect_short_words | , |
2 | , | ||
"Don't Suspect dict wds longer than this" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_reject_mode | , |
0 | , | ||
"Rejection algorithm" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_image_border | , |
2 | , | ||
"Rej blbs near image edge limit" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | min_sane_x_ht_pixels | , |
8 | , | ||
"Reject any x-ht lt or eq than this" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_page_number | , |
-1 | , | ||
"-1 -> All pages, else specific page to process" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_parallelize | , |
0 | , | ||
"Run in parallel where possible" | |||
) |
tesseract::Tesseract::INT_VAR_H | ( | lstm_choice_mode | , |
0 | , | ||
"Allows to include alternative symbols choices in the hOCR output. " "Valid input values are 0, 1 and 2. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " "With 2 the alternative symbol choices are accumulated per character." | |||
) |
void tesseract::Tesseract::join_words | ( | WERD_RES * | word, |
WERD_RES * | word2, | ||
BlamerBundle * | orig_bb | ||
) | const |
void tesseract::Tesseract::LSTMRecognizeWord | ( | const BLOCK & | block, |
ROW * | row, | ||
WERD_RES * | word, | ||
PointerVector< WERD_RES > * | words | ||
) |
void tesseract::Tesseract::match_word_pass_n | ( | int | pass_n, |
WERD_RES * | word, | ||
ROW * | row, | ||
BLOCK * | block | ||
) |
match_word_pass_n
Baseline normalize the word and pass it to Tess.
void tesseract::Tesseract::MaximallyChopWord | ( | const GenericVector< TBOX > & | boxes, |
BLOCK * | block, | ||
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.
|
inline |
|
inline |
bool tesseract::Tesseract::noise_outlines | ( | TWERD * | word | ) |
bool tesseract::Tesseract::non_0_digit | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
bool tesseract::Tesseract::non_O_upper | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
|
inline |
bool tesseract::Tesseract::one_ell_conflict | ( | WERD_RES * | word_res, |
bool | update_map | ||
) |
void tesseract::Tesseract::output_pass | ( | PAGE_RES_IT & | page_res_it, |
const TBOX * | target_word_box | ||
) |
void tesseract::Tesseract::ParseLanguageString | ( | const char * | lang_str, |
GenericVector< STRING > * | to_load, | ||
GenericVector< STRING > * | not_to_load | ||
) |
void tesseract::Tesseract::pgeditor_main | ( | int | width, |
int | height, | ||
PAGE_RES * | page_res | ||
) |
Top-level editor operation: set up a new window and a corresponding event handler.
|
inline |
|
inline |
|
inline |
bool tesseract::Tesseract::potential_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level, | ||
bool | ok_dict_word | ||
) |
void tesseract::Tesseract::PreenXHeights | ( | BLOCK_LIST * | block_list | ) |
Any row xheight that is significantly different from the median is set to the median.
void tesseract::Tesseract::PrepareForPageseg | ( | ) |
void tesseract::Tesseract::PrepareForTessOCR | ( | BLOCK_LIST * | block_list, |
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
void tesseract::Tesseract::PrerecAllWordsPar | ( | const GenericVector< WordData > & | words | ) |
bool tesseract::Tesseract::process_cmd_win_event | ( | int32_t | cmd_event, |
char * | new_value | ||
) |
void tesseract::Tesseract::process_image_event | ( | const SVEvent & | event | ) |
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
void tesseract::Tesseract::process_selected_words | ( | PAGE_RES * | page_res, |
TBOX & | selection_box, | ||
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) | word_processor | ||
) |
bool tesseract::Tesseract::ProcessTargetWord | ( | const TBOX & | word_box, |
const TBOX & | target_word_box, | ||
const char * | word_config, | ||
int | pass | ||
) |
void tesseract::Tesseract::quality_based_rejection | ( | PAGE_RES_IT & | page_res_it, |
bool | good_quality_doc | ||
) |
void tesseract::Tesseract::read_config_file | ( | const char * | filename, |
SetParamConstraint | constraint | ||
) |
bool tesseract::Tesseract::ReassignDiacritics | ( | int | pass, |
PAGE_RES_IT * | pr_it, | ||
bool * | make_next_word_fuzzy | ||
) |
bool tesseract::Tesseract::recog_all_words | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
int | dopasses | ||
) |
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s), otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification.) Returns false if we cancelled prematurely.
page_res | page structure |
monitor | progress monitor |
word_config | word_config file |
target_word_box | specifies just to extract a rectangle |
dopasses | 0 - all, 1 just pass 1, 2 passes 2 and higher |
bool tesseract::Tesseract::recog_interactive | ( | PAGE_RES_IT * | pr_it | ) |
Recognize a single word in interactive mode.
pr_it | the page results iterator |
void tesseract::Tesseract::recog_training_segmented | ( | const STRING & | fname, |
PAGE_RES * | page_res, | ||
volatile ETEXT_DESC * | monitor, | ||
FILE * | output_file | ||
) |
void tesseract::Tesseract::recog_word | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::recog_word_recursive | ( | WERD_RES * | word | ) |
bool tesseract::Tesseract::RecogAllWordsPassN | ( | int | pass_n, |
ETEXT_DESC * | monitor, | ||
PAGE_RES_IT * | pr_it, | ||
GenericVector< WordData > * | words | ||
) |
void tesseract::Tesseract::recognize_page | ( | STRING & | image_name | ) |
void tesseract::Tesseract::reject_edge_blobs | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::reject_I_1_L | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::reject_mostly_rejects | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::rejection_passes | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config | ||
) |
void tesseract::Tesseract::ReportFailedBox | ( | int | boxfile_lineno, |
TBOX | box, | ||
const char * | box_ch, | ||
const char * | err_msg | ||
) |
Logs a bad box by line in the box file and box coords.
void tesseract::Tesseract::ReportXhtFixResult | ( | bool | accept_new_word, |
float | new_x_ht, | ||
WERD_RES * | word, | ||
WERD_RES * | new_word | ||
) |
void tesseract::Tesseract::ReSegmentByClassification | ( | PAGE_RES * | page_res | ) |
Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.
bool tesseract::Tesseract::ResegmentCharBox | ( | PAGE_RES * | page_res, |
const TBOX * | prev_box, | ||
const TBOX & | box, | ||
const TBOX * | next_box, | ||
const char * | correct_text | ||
) |
Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.
Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.
This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.
bool tesseract::Tesseract::ResegmentWordBox | ( | BLOCK_LIST * | block_list, |
const TBOX & | box, | ||
const TBOX * | next_box, | ||
const char * | correct_text | ||
) |
Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.
void tesseract::Tesseract::ResetAdaptiveClassifier | ( | ) |
void tesseract::Tesseract::ResetDocumentDictionary | ( | ) |
|
inline |
int tesseract::Tesseract::RetryWithLanguage | ( | const WordData & | word_data, |
WordRecognizer | recognizer, | ||
bool | debug, | ||
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | best_words | ||
) |
|
inline |
int16_t tesseract::Tesseract::safe_dict_word | ( | const WERD_RES * | werd_res | ) |
|
inline |
|
inline |
void tesseract::Tesseract::script_pos_pass | ( | PAGE_RES * | page_res | ) |
void tesseract::Tesseract::SearchForText | ( | const GenericVector< BLOB_CHOICE_LIST *> * | choices, |
int | choices_pos, | ||
int | choices_length, | ||
const GenericVector< UNICHAR_ID > & | target_text, | ||
int | text_index, | ||
float | rating, | ||
GenericVector< int > * | segmentation, | ||
float * | best_rating, | ||
GenericVector< int > * | best_segmentation | ||
) |
Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).
choices | is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc. |
choices_pos | |
choices_length | |
target_text | |
text_index | |
rating | |
segmentation | |
best_rating | |
best_segmentation |
void tesseract::Tesseract::SearchWords | ( | PointerVector< WERD_RES > * | words | ) |
int tesseract::Tesseract::SegmentPage | ( | const STRING * | input_file, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.
bool tesseract::Tesseract::SelectGoodDiacriticOutlines | ( | int | pass, |
float | certainty_threshold, | ||
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
const GenericVector< C_OUTLINE *> & | outlines, | ||
int | num_outlines, | ||
GenericVector< bool > * | ok_outlines | ||
) |
void tesseract::Tesseract::set_done | ( | WERD_RES * | word, |
int16_t | pass | ||
) |
|
inline |
|
inline |
|
inline |
|
inline |
void tesseract::Tesseract::set_unlv_suspects | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::set_word_fonts | ( | WERD_RES * | word | ) |
set_word_fonts
Get the fonts for the word.
void tesseract::Tesseract::SetBlackAndWhitelist | ( | ) |
void tesseract::Tesseract::SetEquationDetect | ( | EquationDetect * | detector | ) |
|
inline |
void tesseract::Tesseract::SetupAllWordsPassN | ( | int | pass_n, |
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
PAGE_RES * | page_res, | ||
GenericVector< WordData > * | words | ||
) |
If tesseract is to be run, sets the words up ready for it.
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes | ( | const GenericVector< TBOX > & | boxes, |
BLOCK_LIST * | block_list | ||
) |
Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr, | ||
TO_BLOCK_LIST * | to_blocks, | ||
Pix ** | photo_mask_pix, | ||
Pix ** | music_mask_pix | ||
) |
Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.
void tesseract::Tesseract::SetupUniversalFontIds | ( | ) |
void tesseract::Tesseract::SetupWordPassN | ( | int | pass_n, |
WordData * | word | ||
) |
void tesseract::Tesseract::SetupWordScripts | ( | BLOCK_LIST * | blocks | ) |
|
inline |
void tesseract::Tesseract::split_and_recog_word | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::split_word | ( | WERD_RES * | word, |
int | split_pt, | ||
WERD_RES ** | right_piece, | ||
BlamerBundle ** | orig_blamer_bundle | ||
) | const |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_char_blacklist | , |
"" | , | ||
"Blacklist of chars not to recognize" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_char_whitelist | , |
"" | , | ||
"Whitelist of chars to recognize" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_char_unblacklist | , |
"" | , | ||
"List of chars to override tessedit_char_blacklist" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_write_params_to_file | , |
"" | , | ||
"Write all parameters to the given file." | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | applybox_exposure_pattern | , |
".exp" | , | ||
"Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif" | |||
) |
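tesseract::Tesseract::STRING_VAR_H | ( | chs_leading_punct | , |
"('`\"" | , | ||
"Leading punctuation" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | chs_trailing_punct1 | , |
").,;:?!" | , | ||
"1st Trailing punctuation" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | chs_trailing_punct2 | , |
")'`\"" | , | ||
"2nd Trailing punctuation" | |||
) |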
tesseract::Tesseract::STRING_VAR_H | ( | outlines_odd | , |
"%| " | , | ||
"Non standard number of outlines" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | outlines_2 | , |
"ij!?%\":;" | , | ||
"Non standard number of outlines" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | numeric_punctuation | , |
".," | , | ||
"Punct. chs expected WITHIN numbers" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | unrecognised_char | , |
"|" | , | ||
"Output char for unidentified blobs" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | ok_repeated_ch_non_alphanum_wds | , |
"-?*\075" | , | ||
"Allow NN to unrej" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | conflict_set_I_l_1 | , |
"Il1[]" | , | ||
"Il1 conflict set" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | file_type | , |
".tif" | , | ||
"Filename extension" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_load_sublangs | , |
"" | , | ||
"List of languages to load with this one" | |||
) |
tesseract::Tesseract::STRING_VAR_H | ( | page_separator | , |
"\f" | , | ||
"Page separator(default is form feed control character)" | |||
) |
bool tesseract::Tesseract::SubAndSuperscriptFix | ( | WERD_RES * | word | ) |
Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.
This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.
bool tesseract::Tesseract::terrible_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level | ||
) |
bool tesseract::Tesseract::tess_acceptable_word | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::tess_add_doc_word | ( | WERD_CHOICE * | word_choice | ) |
void tesseract::Tesseract::tess_segment_pass_n | ( | int | pass_n, |
WERD_RES * | word | ||
) |
bool tesseract::Tesseract::TestNewNormalization | ( | int | original_misfits, |
float | baseline_shift, | ||
float | new_x_ht, | ||
WERD_RES * | word, | ||
BLOCK * | block, | ||
ROW * | row | ||
) |
|
inline |
void tesseract::Tesseract::TidyUp | ( | PAGE_RES * | page_res | ) |
void tesseract::Tesseract::tilde_crunch | ( | PAGE_RES_IT & | page_res_it | ) |
void tesseract::Tesseract::tilde_delete | ( | PAGE_RES_IT & | page_res_it | ) |
void tesseract::Tesseract::TrainFromBoxes | ( | const GenericVector< TBOX > & | boxes, |
const GenericVector< STRING > & | texts, | ||
BLOCK_LIST * | block_list, | ||
DocumentData * | training_data | ||
) |
void tesseract::Tesseract::TrainLineRecognizer | ( | const STRING & | input_imagename, |
const STRING & | output_basename, | ||
BLOCK_LIST * | block_list | ||
) |
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits | ( | int | num_chopped_leading, |
float | leading_certainty, | ||
ScriptPos | leading_pos, | ||
int | num_chopped_trailing, | ||
float | trailing_certainty, | ||
ScriptPos | trailing_pos, | ||
WERD_RES * | word, | ||
bool * | is_good, | ||
int * | retry_rebuild_leading, | ||
int * | retry_rebuild_trailing | ||
) |
Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.
[in] | num_chopped_leading | how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | leading_certainty | the (minimum) certainty had by the characters in the original leading section. |
[in] | leading_pos | "super" or "sub" (for debugging) |
[in] | num_chopped_trailing | how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | trailing_certainty | the (minimum) certainty had by the characters in the original trailing section. |
[in] | trailing_pos | "super" or "sub" (for debugging) |
[in] | word | the word to try to chop up. |
[out] | is_good | do we believe our result? |
[out] | retry_rebuild_leading,retry_rebuild_trailing | If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars. |
void tesseract::Tesseract::unrej_good_quality_words | ( | PAGE_RES_IT & | page_res_it | ) |
bool tesseract::Tesseract::word_adaptable | ( | WERD_RES * | word, |
uint16_t | mode | ||
) |
bool tesseract::Tesseract::word_blank_and_set_display | ( | PAGE_RES_IT * | pr_its | ) |
bool tesseract::Tesseract::word_bln_display | ( | PAGE_RES_IT * | pr_it | ) |
Normalize word and display in word window
void tesseract::Tesseract::word_char_quality | ( | WERD_RES * | word, |
ROW * | row, | ||
int16_t * | match_count, | ||
int16_t * | accepted_match_count | ||
) |
bool tesseract::Tesseract::word_contains_non_1_digit | ( | const char * | word, |
const char * | word_lengths | ||
) |
CRUNCH_MODE tesseract::Tesseract::word_deletable | ( | WERD_RES * | word, |
int16_t & | delete_mode | ||
) |
bool tesseract::Tesseract::word_display | ( | PAGE_RES_IT * | pr_it | ) |
word_display() Word Processor
Display a word according to its display modes
bool tesseract::Tesseract::word_dumper | ( | PAGE_RES_IT * | pr_it | ) |
Dump members to the debug window
int16_t tesseract::Tesseract::word_outline_errs | ( | WERD_RES * | word | ) |
bool tesseract::Tesseract::word_set_display | ( | PAGE_RES_IT * | pr_it | ) |
word_set_display() Word processor
Display word according to current display mode settings
int16_t tesseract::Tesseract::worst_noise_blob | ( | WERD_RES * | word_res, |
float * | worst_noise_score | ||
) |
void tesseract::Tesseract::write_results | ( | PAGE_RES_IT & | page_res_it, |
char | newline_type, | ||
bool | force_eol | ||
) |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |