21 #ifndef TESSERACT_TRAINING_VALIDATOR_H_ 22 #define TESSERACT_TRAINING_VALIDATOR_H_ 83 const std::vector<char32>& src,
84 std::vector<std::vector<char32>>* dest);
89 return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
90 ch == kRightToLeftMark || ch == kInvalid;
123 kVowelModifier =
'D',
124 kZeroWidthNonJoiner =
'z',
125 kZeroWidthJoiner =
'Z',
140 report_errors_(report_errors) {}
143 static std::unique_ptr<Validator> ScriptValidator(
ViramaScript script,
152 const std::vector<char32>& src,
153 std::vector<std::vector<char32>>* dest);
156 std::vector<std::vector<char32>>* dest);
161 const std::vector<char32>& utf32);
163 static bool IsVirama(
char32 unicode);
165 static bool IsVedicAccent(
char32 unicode);
167 bool IsSubscriptScript()
const;
173 output_.push_back(codes_[codes_used_].second);
174 return ++codes_used_ == codes_.size();
183 while (output_used_ + length < output_.size()) {
185 std::initializer_list<char32>{output_[output_used_++]});
187 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
188 while (++output_used_ < output_.size()) {
189 parts_.back().push_back(output_[output_used_]);
197 output_.push_back(codes_[codes_used_].second);
198 MultiCodePart(length);
199 return ++codes_used_ == codes_.size();
205 virtual bool ConsumeGraphemeIfValid() = 0;
207 void ComputeClassCodes(
const std::vector<char32>& text);
214 static const int kIndicCodePageSize = 128;
216 static const char32 kMinIndicUnicode = 0x900;
218 static const char32 kMaxSinhalaUnicode = 0xdff;
220 static const char32 kMaxViramaScriptUnicode = 0x17ff;
222 static const char32 kSinhalaVirama = 0xdca;
223 static const char32 kMyanmarVirama = 0x1039;
224 static const char32 kKhmerVirama = 0x17d2;
226 static const char32 kJavaneseVirama = 0xa9c0;
227 static const char32 kMaxJavaneseUnicode = 0xa9df;
247 #endif // TESSERACT_TRAINING_VALIDATOR_H_
bool UseMultiCode(int length)
Definition: validator.h:196
signed int char32
Definition: unichar.h:52
int output_used_
Definition: validator.h:240
std::vector< IndicPair > codes_
Definition: validator.h:232
void MultiCodePart(int length)
Definition: validator.h:182
static const char32 kLeftToRightMark
Definition: validator.h:98
GraphemeNormMode
Definition: validator.h:34
Definition: baseapi.cpp:94
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:134
std::vector< std::vector< char32 > > parts_
Definition: validator.h:234
ViramaScript
Definition: validator.h:53
std::vector< char32 > output_
Definition: validator.h:236
CharClass
Definition: validator.h:112
static const char32 kRightToLeftMark
Definition: validator.h:99
static const char32 kInvalid
Definition: validator.h:100
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:88
static const char32 kZeroWidthJoiner
Definition: validator.h:97
bool CodeOnlyToOutput()
Definition: validator.h:172
bool report_errors_
Definition: validator.h:242
int codes_used_
Definition: validator.h:238
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:136
ViramaScript script_
Definition: validator.h:230
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
static const char32 kZeroWidthSpace
Definition: validator.h:95
Definition: validator.h:72