tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
tesseract::Validator Class Reference (abstract)

#include <validator.h>

Inheritance diagram for tesseract::Validator:
Collaboration diagram for tesseract::Validator:

Public Member Functions

virtual ~Validator ()
 

Static Public Member Functions

static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
static bool IsZeroWidthMark (char32 ch)
 

Static Public Attributes

static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 

Protected Types

enum  CharClass {
  CharClass::kConsonant = 'C', CharClass::kVowel = 'V', CharClass::kVirama = 'H', CharClass::kMatra = 'M',
  CharClass::kMatraPiece = 'P', CharClass::kVowelModifier = 'D', CharClass::kZeroWidthNonJoiner = 'z', CharClass::kZeroWidthJoiner = 'Z',
  CharClass::kVedicMark = 'v', CharClass::kNukta = 'N', CharClass::kRobat = 'R', CharClass::kOther = 'O',
  CharClass::kWhitespace = ' ', CharClass::kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 

Protected Member Functions

 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (int length)
 
bool UseMultiCode (int length)
 
virtual bool ConsumeGraphemeIfValid ()=0
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
virtual CharClass UnicodeToCharClass (char32 ch) const =0
 
void Clear ()
 

Static Protected Member Functions

static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 

Protected Attributes

ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
int codes_used_
 
int output_used_
 
bool report_errors_
 

Static Protected Attributes

static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Member Typedef Documentation

◆ IndicPair

using tesseract::Validator::IndicPair = std::pair<CharClass, char32>
protected

Member Enumeration Documentation

◆ CharClass

enum tesseract::Validator::CharClass
strong protected
Enumerator
kConsonant 
kVowel 
kVirama 
kMatra 
kMatraPiece 
kVowelModifier 
kZeroWidthNonJoiner 
kZeroWidthJoiner 
kVedicMark 
kNukta 
kRobat 
kOther 
kWhitespace 
kCombiner 

Constructor & Destructor Documentation

◆ ~Validator()

tesseract::Validator::~Validator ( )
virtual default

◆ Validator()

tesseract::Validator::Validator ( ViramaScript  script,
bool  report_errors 
)
inline protected

Member Function Documentation

◆ Clear()

void tesseract::Validator::Clear ( )
protected

◆ CodeOnlyToOutput()

bool tesseract::Validator::CodeOnlyToOutput ( )
inline protected

◆ ComputeClassCodes()

void tesseract::Validator::ComputeClassCodes ( const std::vector< char32 > &  text)
protected

◆ ConsumeGraphemeIfValid()

virtual bool tesseract::Validator::ConsumeGraphemeIfValid ( )
protected pure virtual

◆ IsSubscriptScript()

bool tesseract::Validator::IsSubscriptScript ( ) const
protected

◆ IsVedicAccent()

bool tesseract::Validator::IsVedicAccent ( char32  unicode)
static protected

◆ IsVirama()

bool tesseract::Validator::IsVirama ( char32  unicode)
static protected

◆ IsZeroWidthMark()

static bool tesseract::Validator::IsZeroWidthMark ( char32  ch)
inline static

◆ MostFrequentViramaScript()

ViramaScript tesseract::Validator::MostFrequentViramaScript ( const std::vector< char32 > &  utf32)
static protected

◆ MoveResultsToDest()

void tesseract::Validator::MoveResultsToDest ( GraphemeNormMode  g_mode,
std::vector< std::vector< char32 >> *  dest 
)
protected

◆ MultiCodePart()

void tesseract::Validator::MultiCodePart ( int  length)
inline protected

◆ ScriptValidator()

std::unique_ptr< Validator > tesseract::Validator::ScriptValidator ( ViramaScript  script,
bool  report_errors 
)
static protected

◆ UnicodeToCharClass()

virtual CharClass tesseract::Validator::UnicodeToCharClass ( char32  ch) const
protected pure virtual

◆ UseMultiCode()

bool tesseract::Validator::UseMultiCode ( int  length)
inline protected

◆ ValidateCleanAndSegment()

bool tesseract::Validator::ValidateCleanAndSegment ( GraphemeNormMode  g_mode,
bool  report_errors,
const std::vector< char32 > &  src,
std::vector< std::vector< char32 >> *  dest 
)
static

◆ ValidateCleanAndSegmentInternal()

bool tesseract::Validator::ValidateCleanAndSegmentInternal ( GraphemeNormMode  g_mode,
const std::vector< char32 > &  src,
std::vector< std::vector< char32 >> *  dest 
)
protected

Member Data Documentation

◆ codes_

std::vector<IndicPair> tesseract::Validator::codes_
protected

◆ codes_used_

int tesseract::Validator::codes_used_
protected

◆ kIndicCodePageSize

const int tesseract::Validator::kIndicCodePageSize = 128
static protected

◆ kInvalid

const char32 tesseract::Validator::kInvalid = 0xfffd
static

◆ kJavaneseVirama

const char32 tesseract::Validator::kJavaneseVirama = 0xa9c0
static protected

◆ kKhmerVirama

const char32 tesseract::Validator::kKhmerVirama = 0x17d2
static protected

◆ kLeftToRightMark

const char32 tesseract::Validator::kLeftToRightMark = 0x200E
static

◆ kMaxJavaneseUnicode

const char32 tesseract::Validator::kMaxJavaneseUnicode = 0xa9df
static protected

◆ kMaxSinhalaUnicode

const char32 tesseract::Validator::kMaxSinhalaUnicode = 0xdff
static protected

◆ kMaxViramaScriptUnicode

const char32 tesseract::Validator::kMaxViramaScriptUnicode = 0x17ff
static protected

◆ kMinIndicUnicode

const char32 tesseract::Validator::kMinIndicUnicode = 0x900
static protected

◆ kMyanmarVirama

const char32 tesseract::Validator::kMyanmarVirama = 0x1039
static protected

◆ kRightToLeftMark

const char32 tesseract::Validator::kRightToLeftMark = 0x200F
static

◆ kSinhalaVirama

const char32 tesseract::Validator::kSinhalaVirama = 0xdca
static protected

◆ kZeroWidthJoiner

const char32 tesseract::Validator::kZeroWidthJoiner = 0x200D
static

◆ kZeroWidthNonJoiner

const char32 tesseract::Validator::kZeroWidthNonJoiner = 0x200C
static

◆ kZeroWidthSpace

const char32 tesseract::Validator::kZeroWidthSpace = 0x200B
static

◆ output_

std::vector<char32> tesseract::Validator::output_
protected

◆ output_used_

int tesseract::Validator::output_used_
protected

◆ parts_

std::vector<std::vector<char32> > tesseract::Validator::parts_
protected

◆ report_errors_

bool tesseract::Validator::report_errors_
protected

◆ script_

ViramaScript tesseract::Validator::script_
protected

The documentation for this class was generated from the following files: