21 #ifndef TESSERACT_TEXTORD_TEXTORD_H_ 22 #define TESSERACT_TEXTORD_TEXTORD_H_ 28 #include "publictypes.h" 70 explicit Textord(
CCStruct* ccstruct);
83 int height, Pix *binary_pix, Pix *thresholds_pix,
84 Pix *grey_pix,
bool use_box_bottoms,
85 BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
86 TO_BLOCK_LIST *to_blocks);
93 return use_cjk_fp_model_;
96 use_cjk_fp_model_ = flag;
102 TO_BLOCK_LIST *blocks
// Declaration only (the body is not in this listing): find_components takes a
// Pix pointer, a BLOCK_LIST pointer and a TO_BLOCK_LIST pointer, and returns
// nothing.
// NOTE(review): which of the two lists is read and which is filled is not
// visible here - confirm against the definition.
111 void find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
// Declaration only: filter_blobs takes an ICOORD named page_tr (by value), a
// TO_BLOCK_LIST pointer and a boolean testing_on flag, and returns nothing.
// NOTE(review): page_tr is presumably the page's top-right corner and
// testing_on presumably switches on debug output - confirm against the
// definition.
112 void filter_blobs(
ICOORD page_tr, TO_BLOCK_LIST* blocks,
bool testing_on);
126 int width,
int height, TO_BLOCK_LIST* to_blocks);
128 void MakeBlockRows(
int min_spacing,
int max_spacing,
// Declaration only: compute_block_xheight takes a TO_BLOCK pointer and a float
// gradient, and returns nothing, so any result must be delivered through the
// block (or other state) rather than a return value.
// NOTE(review): gradient is presumably the text-line skew of the block -
// confirm against the definition.
133 void compute_block_xheight(
TO_BLOCK *block,
float gradient);
134 void compute_row_xheight(
TO_ROW *row,
137 int block_line_size);
138 void make_spline_rows(
TO_BLOCK* block,
143 void make_old_baselines(
TO_BLOCK* block,
// Declaration only: correlate_lines takes a TO_BLOCK pointer and a float
// gradient and returns nothing.
// NOTE(review): presumably cross-checks the rows of the block against each
// other using the skew gradient - confirm against the definition.
146 void correlate_lines(
TO_BLOCK *block,
float gradient);
147 void correlate_neighbours(
TO_BLOCK *block,
150 int correlate_with_stats(
TO_ROW **rows,
153 void find_textlines(
TO_BLOCK *block,
159 void block_spacing_stats(
TO_BLOCK* block,
161 bool& old_text_ord_proportional,
163 int16_t& block_space_gap_width,
165 int16_t& block_non_space_gap_width
167 void row_spacing_stats(
TO_ROW *row,
172 int16_t block_space_gap_width,
174 int16_t block_non_space_gap_width
176 void old_to_method(
TO_ROW *row,
177 STATS *all_gap_stats,
178 STATS *space_gap_stats,
179 STATS *small_gap_stats,
180 int16_t block_space_gap_width,
182 int16_t block_non_space_gap_width
184 bool isolated_row_stats(
TO_ROW* row,
186 STATS* all_gap_stats,
187 bool suspected_table,
// Declaration only: stats_count_under takes a STATS pointer and an int16_t
// threshold and returns an int16_t.
// NOTE(review): by its name this presumably returns how many samples in stats
// lie below threshold; whether the bound is inclusive is not visible here -
// confirm against the definition.
190 int16_t stats_count_under(
STATS *stats, int16_t threshold);
// Declaration only: improve_row_threshold takes a TO_ROW pointer and a STATS
// pointer (all_gap_stats) and returns nothing, so any adjusted threshold must
// be stored through one of the pointers or in member state.
// NOTE(review): presumably refines the row's kern/space threshold from the gap
// histogram - confirm against the definition.
191 void improve_row_threshold(
TO_ROW *row,
STATS *all_gap_stats);
192 bool make_a_word_break(
TO_ROW* row,
196 int16_t real_current_gap,
197 int16_t within_xht_current_gap,
203 bool& prev_gap_was_a_space,
204 bool& break_at_next_gap);
// Declaration only: suspected_punct_blob takes a TO_ROW pointer and a TBOX
// (passed by value) and returns a bool.
// NOTE(review): presumably true when the box looks like punctuation relative
// to the row's metrics - confirm the exact criteria against the definition.
207 bool suspected_punct_blob(
TO_ROW* row,
TBOX box);
208 void peek_at_next_gap(
TO_ROW *row,
212 int16_t &next_within_xht_gap);
213 void mark_gap(
TBOX blob,
216 int16_t prev_blob_width,
218 int16_t next_blob_width,
// Declaration only: find_mean_blob_spacing takes a WERD pointer and returns a
// float.
// NOTE(review): by its name the result is presumably the average gap between
// adjacent blobs of the word; units and the value for words with fewer than
// two blobs are not visible here - confirm against the definition.
220 float find_mean_blob_spacing(
WERD *
word);
221 bool ignore_big_gap(
TO_ROW* row,
// Declaration only: filter_noise_blobs takes four BLOBNBOX_LIST pointers
// (src_list, noise_list, small_list, large_list) and returns a float.
// NOTE(review): presumably moves blobs out of src_list into the noise/small/
// large lists by size; the meaning of the returned float is not visible here -
// confirm against the definition.
232 float filter_noise_blobs(BLOBNBOX_LIST *src_list,
233 BLOBNBOX_LIST *noise_list,
234 BLOBNBOX_LIST *small_list,
235 BLOBNBOX_LIST *large_list);
// Declaration only: cleanup_nontext_block takes a single BLOCK pointer and
// returns nothing.
// NOTE(review): presumably normalises a block that holds no text so later
// stages can handle it - confirm against the definition.
240 void cleanup_nontext_block(
BLOCK* block);
// Declaration only: cleanup_blocks takes a boolean clean_noise flag and a
// BLOCK_LIST pointer and returns nothing.
// NOTE(review): clean_noise presumably controls whether the noise-removal
// helpers declared next to it are applied - confirm against the definition.
241 void cleanup_blocks(
bool clean_noise, BLOCK_LIST *blocks);
// Declaration only: clean_noise_from_row takes a ROW pointer and returns a
// bool.
// NOTE(review): what the returned bool signals (e.g. "row should be dropped")
// is not visible here - confirm against the definition.
242 bool clean_noise_from_row(
ROW* row);
// Declaration only: clean_noise_from_words takes a ROW pointer and returns
// nothing, so any cleaning must happen in place on the row.
// NOTE(review): presumably removes noise-like words/blobs from the row -
// confirm against the definition.
243 void clean_noise_from_words(
ROW *row);
// Declaration only: clean_small_noise_from_words takes a ROW pointer and
// returns nothing.
// NOTE(review): presumably a variant of clean_noise_from_words restricted to
// small noise; the size criterion is not visible here - confirm against the
// definition.
246 void clean_small_noise_from_words(
ROW *row);
250 void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
255 void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
261 "Script has no xheight, so use a single mode for horizontal text");
// Boolean word-spacing (tosp_*) parameter declaration. The macro arguments are
// the parameter name, a boolean value (presumably the default - confirm
// against the macro definition) and a description string.
// NOTE(review): the identical description "Space stats use prechopping?" is
// attached to a second parameter a few declarations further down; by name it
// fits a pre-chopping switch better than this one - confirm which text
// belongs here.
263 BOOL_VAR_H(tosp_old_to_method,
false,
"Space stats use prechopping?");
264 BOOL_VAR_H(tosp_old_to_constrain_sp_kn,
false,
265 "Constrain relative values of inter and intra-word gaps for " 268 "Block stats to use fixed pitch rows?");
269 BOOL_VAR_H(tosp_force_wordbreak_on_punct,
false,
270 "Force word breaks on punct to break long lines in non-space " 273 "Space stats use prechopping?");
275 "Fix suspected bug in old code");
277 "Only stat OBVIOUS spaces");
279 "Only stat OBVIOUS spaces");
281 "Only stat OBVIOUS spaces");
283 "Only stat OBVIOUS spaces");
// Three boolean word-spacing (tosp_*) parameter declarations. Each macro call
// lists the parameter name, a boolean value (presumably the default - confirm
// against the macro definition) and a description string.

// Listed value true; per its description, use the row on its own when there
// are too few certain spaces.
284 BOOL_VAR_H(tosp_recovery_isolated_row_stats,
true,
285 "Use row alone when inadequate cert spaces");
// Listed value false. NOTE(review): the description "Better guess" says
// nothing about what changes; by name this restricts kern estimation to small
// gaps - confirm and consider a clearer description.
286 BOOL_VAR_H(tosp_only_small_gaps_for_kern,
false,
"Better guess");
// Listed value false; per its description, pass every flip to context.
287 BOOL_VAR_H(tosp_all_flips_fuzzy,
false,
"Pass ANY flip to context?");
289 "Don't restrict kn->sp fuzzy limit to tables");
291 "Use within xht gap for wd breaks");
293 "Use within xht gap for wd breaks");
295 "Only use within xht gap for wd breaks");
297 "Don't chng kn to space next to punct");
// Two boolean parameters, both listed as true, that by name control the two
// directions of a fuzzy flip: kn -> sp and sp -> kn ("kn"/"sp" presumably
// abbreviate kern and space, as in the neighbouring "flip sp->kern"
// description strings).
// NOTE(review): both carry the same description "Default flip", so the help
// text does not tell the two directions apart.
298 BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp,
true,
"Default flip");
299 BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn,
true,
"Default flip");
301 "Enable improvement heuristic");
// Six integer word-spacing (tosp_*) parameter declarations. Each macro call
// lists the parameter name, an integer value (presumably the default - confirm
// against the macro definition) and a description string.

// Debug verbosity for the spacing code; listed value 0.
302 INT_VAR_H(tosp_debug_level, 0,
"Debug data");
// Listed value 3. The description is only the tail of a sentence; by name this
// is the sample count from which a median is used instead of a mean.
303 INT_VAR_H(tosp_enough_space_samples_for_median, 3,
304 "or should we use mean");
// Number of samples required to re-estimate for a row; listed value 10.
305 INT_VAR_H(tosp_redo_kern_limit, 10,
306 "No.samples reqd to reestimate for row");
// Gap count used in the "treat as a table" decision; listed value 40.
307 INT_VAR_H(tosp_few_samples, 40,
308 "No.gaps reqd with 1 large gap to treat as a table");
// Gap count used in the "use certs" decision for short rows; listed value 20.
309 INT_VAR_H(tosp_short_row, 20,
310 "No.gaps reqd with few cert spaces to use certs");
// Selects a sanity-check strategy; listed value 1.
// NOTE(review): the meaning of each method number is not visible here.
311 INT_VAR_H(tosp_sanity_method, 1,
"How to avoid being silly");
313 "Factor for defining space threshold in terms of space and " 316 "how far between kern and space?");
318 "how far between kern and space?");
320 "Fract of xheight for narrow");
322 "narrow if w/h less than this");
// Floating-point word-spacing parameter: per its description, the fraction of
// xheight used for "wide"; listed value 0.52 (presumably the default - confirm
// against the macro definition).
323 double_VAR_H(tosp_wide_fraction, 0.52,
"Fract of xheight for wide");
325 "wide if w/h less than this");
327 "Fract of xheight for fuzz sp");
329 "Fract of xheight for fuzz sp");
331 "Fract of xheight for fuzz sp");
// Floating-point word-spacing parameter: per its description, the gap ratio at
// which a space is flipped to a kern; listed value 0.83 (presumably the
// default - confirm against the macro definition).
332 double_VAR_H(tosp_gap_factor, 0.83,
"gap ratio to flip sp->kern");
334 "gap ratio to flip kern->sp");
336 "gap ratio to flip kern->sp");
338 "gap ratio to flip kern->sp");
// Three floating-point word-spacing parameters expressed as multipliers. Each
// macro call lists the parameter name, a numeric value (presumably the
// default - confirm against the macro definition) and a description string.

// xheight multiplier; listed value -1.
// NOTE(review): a negative multiplier presumably means "disabled" - confirm.
339 double_VAR_H(tosp_ignore_big_gaps, -1,
"xht multiplier");
// xheight multiplier; listed value 3.5. NOTE(review): shares the description
// "xht multiplier" with tosp_ignore_big_gaps, so the help text does not
// distinguish the two.
340 double_VAR_H(tosp_ignore_very_big_gaps, 3.5,
"xht multiplier");
// Multiplier applied to the "rep" gap when deciding a space; listed value 1.6.
341 double_VAR_H(tosp_rep_space, 1.6,
"rep gap multiplier for space");
343 "Fract of kerns reqd for isolated row stats");
345 "Min difference of kn & sp in table");
347 "Expect spaces bigger than this");
349 "Fuzzy if less than this");
// Two floating-point parameters for the "new fuzzy" kn and sp algorithms named
// in their description strings; both are listed with value 0.5 (presumably the
// default - confirm against the macro definition).
// NOTE(review): what the fraction is a fraction of is not stated here.
350 double_VAR_H(tosp_fuzzy_kn_fraction, 0.5,
"New fuzzy kn alg");
351 double_VAR_H(tosp_fuzzy_sp_fraction, 0.5,
"New fuzzy sp alg");
353 "Don't trust spaces less than this time kn");
355 "Thresh guess - mult kn by this");
357 "Thresh guess - mult xht by this");
359 "Multiplier on kn to limit thresh");
361 "Don't autoflip kn to sp when large separation");
363 "Limit use of xht gap with large kns");
365 "Limit use of xht gap with odd small kns");
367 "Don't reduce box if the top left is non blank");
369 "Don't let sp minus kn get too small");
371 "How wide fuzzies need context");
// General textord_* parameter declarations: three boolean switches, two
// integers and one floating-point value. Each macro call lists the parameter
// name, a value (presumably the default - confirm against the macro
// definition) and a description string.

// When set, noise blobs are not removed; listed value false.
373 BOOL_VAR_H(textord_no_rejects,
false,
"Don't remove noise blobs");
// Display switch for unsorted blobs; listed value false.
374 BOOL_VAR_H(textord_show_blobs,
false,
"Display unsorted blobs");
// Display switch for boxes; listed value false.
375 BOOL_VAR_H(textord_show_boxes,
false,
"Display boxes");
// Size of noise in pixels; listed value 7.
376 INT_VAR_H(textord_max_noise_size, 7,
"Pixel size of noise");
// Debug level for baseline fitting; listed value 0.
377 INT_VAR_H(textord_baseline_debug, 0,
"Baseline debug level");
// Percentile used for large blobs; listed value 95.
378 double_VAR_H(textord_blob_size_bigile, 95,
"Percentile for large blobs");
380 "Fraction of bounding box for noise");
// Blob-size and noise-rejection textord_* parameter declarations. Each macro
// call lists the parameter name, a value (presumably the default - confirm
// against the macro definition) and a description string.

// Percentile used for small blobs; listed value 20.
381 double_VAR_H(textord_blob_size_smallile, 20,
"Percentile for small blobs");
// "Ile" of sizes for the xheight guess; listed value 0.75.
// NOTE(review): unlike the percentile parameters (20, 95) this looks like a
// 0..1 fraction - confirm the expected scale.
382 double_VAR_H(textord_initialx_ile, 0.75,
"Ile of sizes for xheight guess");
// Listed value 0.90. NOTE(review): the description is identical to
// textord_initialx_ile, while the name suggests an ascender-height guess -
// possibly a copy/paste of the help text; confirm.
383 double_VAR_H(textord_initialasc_ile, 0.90,
"Ile of sizes for xheight guess");
// Fraction of size for maxima; listed value 10.
384 INT_VAR_H(textord_noise_sizefraction, 10,
"Fraction of size for maxima");
// Fraction of x for "big t count"; listed value 0.5.
385 double_VAR_H(textord_noise_sizelimit, 0.5,
"Fraction of x for big t count");
// Transition count for a normal blob; listed value 16.
386 INT_VAR_H(textord_noise_translimit, 16,
"Transitions for normal blob");
// Dot-to-normal ratio used for deletion; listed value 2.0.
387 double_VAR_H(textord_noise_normratio, 2.0,
"Dot to norm ratio for deletion");
// Reject noise-like words; listed value true.
388 BOOL_VAR_H(textord_noise_rejwords,
true,
"Reject noise-like words");
// Reject noise-like rows; listed value true.
389 BOOL_VAR_H(textord_noise_rejrows,
true,
"Reject noise-like rows");
// xheight-fraction error allowed for normal blobs; listed value 0.2.
390 double_VAR_H(textord_noise_syfract, 0.2,
"xh fract error for norm blobs");
392 "xh fract width error for norm blobs");
394 "Height fraction to discard outlines as speckle noise");
// Two row-level noise parameters. Each macro call lists the parameter name, a
// value (presumably the default - confirm against the macro definition) and a
// description string.

// Number of "super norm" blobs that saves a row; listed value 1.
395 INT_VAR_H(textord_noise_sncount, 1,
"super norm blobs to save row");
// Listed value 6.0. NOTE(review): the description "Dot to norm ratio for
// deletion" is identical to the one on textord_noise_normratio; by name this
// is the row-level ratio - confirm and consider distinguishing the help text.
396 double_VAR_H(textord_noise_rowratio, 6.0,
"Dot to norm ratio for deletion");
// Debug switch for the row garbage (noise) detector, per its description
// string. The value is written with the C++ literal `false`, like every other
// BOOL_VAR_H entry in this header, instead of the legacy FALSE macro, so the
// declaration no longer depends on that macro being defined.
397 BOOL_VAR_H(textord_noise_debug, false,
"Debug row garbage detector");
// Two baseline-shift parameters. Each macro call lists the parameter name, a
// numeric value (presumably the default - confirm against the macro
// definition) and a description string.

// Maximum baseline shift; listed value 0.00.
// NOTE(review): a maximum of zero presumably disables baseline shifting -
// confirm.
398 double_VAR_H(textord_blshift_maxshift, 0.00,
"Max baseline shift");
// Minimum size of a baseline shift; listed value 9.99. By name the value is
// presumably a fraction/multiple of xheight - confirm.
399 double_VAR_H(textord_blshift_xfraction, 9.99,
"Min size of baseline shift");
403 #endif // TESSERACT_TEXTORD_TEXTORD_H_ bool use_cjk_fp_model() const
Definition: textord.h:92
const WERD * word() const
Definition: textord.h:54
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
ICOORD page_tr_
Definition: textord.h:119
Definition: quspline.h:32
Definition: baseapi.cpp:94
WordWithBox(WERD *word)
Definition: textord.h:44
TBOX true_bounding_box() const
Definition: textord.h:52
Definition: ccstruct.h:25
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
WERD * word_
Definition: textord.h:58
double_VAR_H(textord_tabvector_vertical_gap_fraction, 0.5, "Max fraction of mean blob width allowed for vertical gaps in vertical text")
Definition: blobbox.h:144
Definition: ocrblock.h:30
const TBOX & bounding_box() const
Definition: textord.h:50
BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection")
Definition: blobbox.h:556
CCStruct * ccstruct_
Definition: textord.h:116
Definition: statistc.h:33
WordWithBox()
Definition: textord.h:43
integer coordinate
Definition: points.h:32
void pad(int xpad, int ypad)
Definition: rect.h:131
TBOX bounding_box_
Definition: textord.h:60
int16_t height() const
Definition: rect.h:108
C_BLOB_LIST * RejBlobs() const
Definition: textord.h:53
Definition: blobbox.h:705
bool use_cjk_fp_model_
Definition: textord.h:121
PageSegMode
Definition: publictypes.h:163
TBOX true_bounding_box() const
Definition: werd.cpp:180