#include <resultiterator.h>
Public Member Functions | |
virtual | ~ResultIterator ()=default |
virtual void | Begin () |
virtual bool | Next (PageIteratorLevel level) |
virtual bool | IsAtBeginningOf (PageIteratorLevel level) const |
virtual bool | IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const |
int | BlanksBeforeWord () const |
virtual char * | GetUTF8Text (PageIteratorLevel level) const |
virtual std::vector< std::vector< std::pair< const char *, float > > > * | GetBestLSTMSymbolChoices () const |
bool | ParagraphIsLtr () const |
Public Member Functions inherited from tesseract::LTRResultIterator | |
LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height) | |
virtual | ~LTRResultIterator () |
char * | GetUTF8Text (PageIteratorLevel level) const |
void | SetLineSeparator (const char *new_line) |
void | SetParagraphSeparator (const char *new_para) |
float | Confidence (PageIteratorLevel level) const |
void | RowAttributes (float *row_height, float *descenders, float *ascenders) const |
const char * | WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const |
const char * | WordRecognitionLanguage () const |
StrongScriptDirection | WordDirection () const |
bool | WordIsFromDictionary () const |
int | BlanksBeforeWord () const |
bool | WordIsNumeric () const |
bool | HasBlamerInfo () const |
const void * | GetParamsTrainingBundle () const |
const char * | GetBlamerDebug () const |
const char * | GetBlamerMisadaptionDebug () const |
bool | HasTruthString () const |
bool | EquivalentToTruth (const char *str) const |
char * | WordTruthUTF8Text () const |
char * | WordNormedUTF8Text () const |
const char * | WordLattice (int *lattice_size) const |
bool | SymbolIsSuperscript () const |
bool | SymbolIsSubscript () const |
bool | SymbolIsDropcap () const |
Public Member Functions inherited from tesseract::PageIterator | |
PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height) | |
virtual | ~PageIterator () |
PageIterator (const PageIterator &src) | |
const PageIterator & | operator= (const PageIterator &src) |
bool | PositionedAtSameWord (const PAGE_RES_IT *other) const |
virtual void | RestartParagraph () |
bool | IsWithinFirstTextlineOfParagraph () const |
virtual void | RestartRow () |
int | Cmp (const PageIterator &other) const |
void | SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots) |
bool | BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const |
bool | BoundingBox (PageIteratorLevel level, const int padding, int *left, int *top, int *right, int *bottom) const |
bool | BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const |
bool | Empty (PageIteratorLevel level) const |
PolyBlockType | BlockType () const |
Pta * | BlockPolygon () const |
Pix * | GetBinaryImage (PageIteratorLevel level) const |
Pix * | GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const |
bool | Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const |
void | Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const |
void | ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const |
bool | SetWordBlamerBundle (BlamerBundle *blamer_bundle) |
Static Public Member Functions | |
static ResultIterator * | StartOfParagraph (const LTRResultIterator &resit) |
static void | CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order) |
Static Public Attributes | |
static const int | kMinorRunStart = -1 |
static const int | kMinorRunEnd = -2 |
static const int | kComplexWord = -3 |
Protected Member Functions | |
TESS_LOCAL | ResultIterator (const LTRResultIterator &resit) |
Protected Member Functions inherited from tesseract::PageIterator | |
TESS_LOCAL void | BeginWord (int offset) |
Private Member Functions | |
bool | CurrentParagraphIsLtr () const |
void | CalculateTextlineOrder (bool paragraph_is_ltr, const LTRResultIterator &resit, GenericVectorEqEq< int > *indices) const |
void | CalculateTextlineOrder (bool paragraph_is_ltr, const LTRResultIterator &resit, GenericVector< StrongScriptDirection > *ssd, GenericVectorEqEq< int > *indices) const |
int | LTRWordIndex () const |
void | CalculateBlobOrder (GenericVector< int > *blob_indices) const |
void | MoveToLogicalStartOfTextline () |
void | MoveToLogicalStartOfWord () |
bool | IsAtFinalSymbolOfWord () const |
bool | IsAtFirstSymbolOfWord () const |
void | AppendSuffixMarks (STRING *text) const |
void | AppendUTF8WordText (STRING *text) const |
void | IterateAndAppendUTF8TextlineText (STRING *text) |
void | AppendUTF8ParagraphText (STRING *text) const |
bool | BidiDebug (int min_level) const |
Private Attributes | |
bool | current_paragraph_is_ltr_ |
bool | at_beginning_of_minor_run_ |
bool | in_minor_direction_ |
bool | preserve_interword_spaces_ |
Additional Inherited Members | |
Protected Attributes inherited from tesseract::LTRResultIterator | |
const char * | line_separator_ |
const char * | paragraph_separator_ |
Protected Attributes inherited from tesseract::PageIterator | |
PAGE_RES * | page_res_ |
Tesseract * | tesseract_ |
PAGE_RES_IT * | it_ |
WERD * | word_ |
int | word_length_ |
int | blob_index_ |
C_BLOB_IT * | cblob_it_ |
bool | include_upper_dots_ |
bool | include_lower_dots_ |
int | scale_ |
int | scaled_yres_ |
int | rect_left_ |
int | rect_top_ |
int | rect_width_ |
int | rect_height_ |
|
virtual default |
ResultIterator is copy constructible! The default copy constructor works just fine for us.
|
explicit protected |
We presume the data associated with the given iterator will outlive us. NB: This is protected because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.
|
private |
Append any extra marks that should be appended to this word when printed. Mostly, these are Unicode BiDi control characters.
|
private |
Appends the text of the current paragraph in reading order to the given buffer. Each textline is terminated in a single newline character, and the paragraph gets an extra newline at the end.
|
private |
Appends the current word in reading order to the given buffer.
|
virtual |
Moves the iterator to point to the start of the page to begin an iteration.
Reimplemented from tesseract::PageIterator.
|
private |
Returns whether the bidi_debug flag is set to at least min_level.
int tesseract::ResultIterator::BlanksBeforeWord | ( | ) | const |
|
private |
Given an iterator pointing at a word, returns the logical reading order of blob indices for the word.
|
static |
Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values: kMinorRunStart (start of minor direction text), kMinorRunEnd (end of minor direction text), and kComplexWord (the next indexed word contains both left-to-right and right-to-left characters and was treated as neutral).
For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:
Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
|
private |
Returns word indices as measured from resit->RestartRow() = index 0 for the reading order of words within a textline given an iterator into the middle of the text line. In addition to non-negative word indices, the following negative values may be inserted: kMinorRunStart (start of minor direction text), kMinorRunEnd (end of minor direction text), and kComplexWord (the previous word contains both left-to-right and right-to-left characters and was treated as neutral).
|
private |
Same as above, but the caller's ssd gets filled in if ssd != nullptr.
|
private |
Calculates the current paragraph's dominant writing direction. Typically, members should use current_paragraph_is_ltr_ instead.
|
virtual |
|
virtual |
Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.
|
virtual |
IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to ResultIterator's left-to-right top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.
Reimplemented from tesseract::PageIterator.
|
virtual |
Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.
NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.
Reimplemented from tesseract::PageIterator.
|
private |
Are we pointing at the final (reading order) symbol of the word?
|
private |
Are we pointing at the first (reading order) symbol of the word?
|
private |
Appends the text of the current text line, assuming this iterator is positioned at the beginning of the text line. This function updates the iterator to point to the first position past the text line. Each textline is terminated in a single newline character. If the textline ends a paragraph, it gets a second terminal newline.
|
private |
What is the index of the current word in a strict left-to-right reading of the row?
|
private |
Precondition: current_paragraph_is_ltr_ is set.
|
private |
Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set.
|
virtual |
Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.
Reimplemented from tesseract::PageIterator.
bool tesseract::ResultIterator::ParagraphIsLtr | ( | ) | const |
Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).
|
static |
|
private |
Is the currently pointed-at character at the beginning of a minor-direction run?
|
private |
|
private |
Is the currently pointed-at character in a minor-direction sequence?
|
static |
|
static |
|
static |
|
private |
Should detected inter-word spaces be preserved, or "compressed" to a single space character (default behavior).