20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_ 21 #define TESSERACT_CCUTIL_UNICHARSET_H_ 24 #include "genericvector.h" 28 #include "tesscallback.h" 30 #include "unicharmap.h" 34 enum SpecialUnicharCodes {
39 SPECIAL_UNICHAR_CODES_COUNT
44 enum class OldUncleanUnichars {
52 static const int kMinLen = 6;
54 static const int kMaxLen = 3 + UNICHAR_LEN + 2;
56 static const int kMaxChunks = 5;
59 inline void set_all(
const char *unichar,
int pos,
int total,
bool natural) {
66 strncpy(this->unichar, uch, UNICHAR_LEN);
67 this->unichar[UNICHAR_LEN] =
'\0';
// Setter for the fragment's `pos` member.
// @param p  value stored into this->pos; it is assigned as given, with no
//           validation against `total` in this function.
69 inline void set_pos(
int p) { this->pos = p; }
// Read-only accessor for the `unichar` character buffer.
// @return pointer to this object's own `unichar` array (not a copy), so the
//         pointer is only usable while this object is alive and is affected
//         by any later write to that member.
71 inline const char*
get_unichar()
const {
return this->unichar; }
// Read-only accessor for the `pos` member.
// @return the current value of this->pos, returned as int.
72 inline int get_pos()
const {
return this->pos; }
// Read-only accessor for the `total` member.
// @return the current value of this->total, returned as int.
73 inline int get_total()
const {
return this->total; }
77 static STRING to_string(
const char *unichar,
int pos,
int total,
81 return to_string(unichar, pos, total, natural);
86 inline bool equals(
const char *other_unichar,
87 int other_pos,
int other_total)
const {
88 return (strcmp(this->unichar, other_unichar) == 0 &&
89 this->pos == other_pos && this->total == other_total);
100 return (strcmp(this->unichar, fragment->
get_unichar()) == 0 &&
102 this->pos == fragment->
get_pos() + 1);
// Tests whether this fragment sits at the last position.
// @return true exactly when pos == total - 1.
// NOTE(review): this reads as "last of `total` pieces" only if positions are
// zero-based (0 .. total-1) - confirm against the code that assigns `pos`.
109 inline bool is_ending()
const {
return this->pos == this->total-1; }
134 char unichar[UNICHAR_LEN + 1];
151 static TESS_API
const char* kCustomLigatures[][2];
154 static TESS_API
const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
160 U_EUROPEAN_NUMBER = 2,
161 U_EUROPEAN_NUMBER_SEPARATOR = 3,
162 U_EUROPEAN_NUMBER_TERMINATOR = 4,
164 U_COMMON_NUMBER_SEPARATOR = 6,
165 U_BLOCK_SEPARATOR = 7,
166 U_SEGMENT_SEPARATOR = 8,
167 U_WHITE_SPACE_NEUTRAL = 9,
168 U_OTHER_NEUTRAL = 10,
169 U_LEFT_TO_RIGHT_EMBEDDING = 11,
170 U_LEFT_TO_RIGHT_OVERRIDE = 12,
171 U_RIGHT_TO_LEFT_ARABIC = 13,
172 U_RIGHT_TO_LEFT_EMBEDDING = 14,
173 U_RIGHT_TO_LEFT_OVERRIDE = 15,
174 U_POP_DIRECTIONAL_FORMAT = 16,
175 U_DIR_NON_SPACING_MARK = 17,
176 U_BOUNDARY_NEUTRAL = 18,
177 U_CHAR_DIRECTION_COUNT
187 UNICHAR_ID unichar_to_id(
const char*
const unichar_repr)
const;
191 UNICHAR_ID unichar_to_id(
const char*
const unichar_repr,
int length)
const;
198 int step(
const char* str)
const;
203 bool encodable_string(
const char *str,
int *first_bad_position)
const;
220 bool encode_string(
const char* str,
bool give_up_on_failure,
223 int* encoded_length)
const;
227 const char* id_to_unichar(UNICHAR_ID
id)
const;
233 const char* id_to_unichar_ext(UNICHAR_ID
id)
const;
237 static STRING debug_utf8_str(
const char* str);
242 return CleanupString(utf8_str, strlen(utf8_str));
244 static std::string CleanupString(
const char* utf8_str,
size_t length);
248 STRING debug_str(UNICHAR_ID
id)
const;
250 return debug_str(unichar_to_id(unichar_repr));
257 void unichar_insert(
const char*
const unichar_repr,
258 OldUncleanUnichars old_style);
260 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
265 std::string cleaned = CleanupString(unichar_repr);
266 if (cleaned != unichar_repr) {
267 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
269 int old_size = size();
270 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
271 if (size() == old_size) {
272 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
280 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
285 bool contains_unichar(
const char*
const unichar_repr)
const;
286 bool contains_unichar(
const char*
const unichar_repr,
int length)
const;
290 bool eq(UNICHAR_ID unichar_id,
const char*
const unichar_repr)
const;
294 for (
int i = 0; i < size_used; ++i) {
295 delete unichars[i].properties.fragment;
296 unichars[i].properties.fragment =
nullptr;
302 if (script_table !=
nullptr) {
303 for (
int i = 0; i < script_table_size_used; ++i)
304 delete[] script_table[i];
305 delete[] script_table;
306 script_table =
nullptr;
307 script_table_size_used = 0;
309 if (unichars !=
nullptr) {
310 delete_pointers_in_unichars();
314 script_table_size_reserved = 0;
318 top_bottom_set_ =
false;
319 script_has_upper_lower_ =
false;
320 script_has_xheight_ =
false;
321 old_style_included_ =
false;
341 void reserve(
int unichars_number);
346 FILE* file = fopen(filename,
"w+b");
347 if (file ==
nullptr)
return false;
348 bool result = save_to_file(file);
357 return save_to_string(&str) &&
368 bool save_to_string(
STRING *str)
const;
373 bool load_from_inmemory_file(
const char*
const memory,
int mem_size,
374 bool skip_fragments);
377 return load_from_inmemory_file(memory, mem_size,
false);
384 FILE* file = fopen(filename,
"rb");
385 if (file ==
nullptr)
return false;
386 bool result = load_from_file(file, skip_fragments);
392 return load_from_file(filename,
false);
397 bool load_from_file(FILE *file,
bool skip_fragments);
405 void post_load_setup();
411 bool major_right_to_left()
const;
422 void set_black_and_whitelist(
const char* blacklist,
const char* whitelist,
423 const char* unblacklist);
427 unichars[unichar_id].properties.isalpha = value;
432 unichars[unichar_id].properties.islower = value;
437 unichars[unichar_id].properties.isupper = value;
442 unichars[unichar_id].properties.isdigit = value;
447 unichars[unichar_id].properties.ispunctuation = value;
452 unichars[unichar_id].properties.isngram = value;
458 unichars[unichar_id].properties.script_id = add_script(value);
463 unichars[unichar_id].properties.other_case = other_case;
468 unichars[unichar_id].properties.direction = value;
473 unichars[unichar_id].properties.mirror = mirror;
478 unichars[unichar_id].properties.normed = normed;
479 unichars[unichar_id].properties.normed_ids.truncate(0);
483 void set_normed_ids(UNICHAR_ID unichar_id);
487 if (INVALID_UNICHAR_ID == unichar_id)
return false;
488 ASSERT_HOST(contains_unichar_id(unichar_id));
489 return unichars[unichar_id].properties.isalpha;
494 if (INVALID_UNICHAR_ID == unichar_id)
return false;
495 ASSERT_HOST(contains_unichar_id(unichar_id));
496 return unichars[unichar_id].properties.islower;
501 if (INVALID_UNICHAR_ID == unichar_id)
return false;
502 ASSERT_HOST(contains_unichar_id(unichar_id));
503 return unichars[unichar_id].properties.isupper;
508 if (INVALID_UNICHAR_ID == unichar_id)
return false;
509 ASSERT_HOST(contains_unichar_id(unichar_id));
510 return unichars[unichar_id].properties.isdigit;
515 if (INVALID_UNICHAR_ID == unichar_id)
return false;
516 ASSERT_HOST(contains_unichar_id(unichar_id));
517 return unichars[unichar_id].properties.ispunctuation;
522 if (INVALID_UNICHAR_ID == unichar_id)
return false;
523 ASSERT_HOST(contains_unichar_id(unichar_id));
524 return unichars[unichar_id].properties.isngram;
529 bool get_isprivate(UNICHAR_ID unichar_id)
const;
533 return top_bottom_set_;
536 void set_ranges_empty();
541 PartialSetPropertiesFromOther(0, src);
544 void PartialSetPropertiesFromOther(
int start_index,
const UNICHARSET& src);
548 void ExpandRangesFromOther(
const UNICHARSET& src);
555 void AppendOtherUnicharset(
const UNICHARSET& src);
558 bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2)
const;
564 int* min_bottom,
int* max_bottom,
565 int* min_top,
int* max_top)
const {
566 if (INVALID_UNICHAR_ID == unichar_id) {
567 *min_bottom = *min_top = 0;
568 *max_bottom = *max_top = 256;
571 ASSERT_HOST(contains_unichar_id(unichar_id));
572 *min_bottom = unichars[unichar_id].properties.min_bottom;
573 *max_bottom = unichars[unichar_id].properties.max_bottom;
574 *min_top = unichars[unichar_id].properties.min_top;
575 *max_top = unichars[unichar_id].properties.max_top;
578 int min_bottom,
int max_bottom,
579 int min_top,
int max_top) {
580 unichars[unichar_id].properties.min_bottom =
581 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
582 unichars[unichar_id].properties.max_bottom =
583 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
584 unichars[unichar_id].properties.min_top =
585 ClipToRange<int>(min_top, 0, UINT8_MAX);
586 unichars[unichar_id].properties.max_top =
587 ClipToRange<int>(max_top, 0, UINT8_MAX);
592 float* width,
float* width_sd)
const {
593 if (INVALID_UNICHAR_ID == unichar_id) {
598 ASSERT_HOST(contains_unichar_id(unichar_id));
599 *width = unichars[unichar_id].properties.width;
600 *width_sd = unichars[unichar_id].properties.width_sd;
603 unichars[unichar_id].properties.width = width;
604 unichars[unichar_id].properties.width_sd = width_sd;
609 float* bearing,
float* bearing_sd)
const {
610 if (INVALID_UNICHAR_ID == unichar_id) {
611 *bearing = *bearing_sd = 0.0f;
614 ASSERT_HOST(contains_unichar_id(unichar_id));
615 *bearing = unichars[unichar_id].properties.bearing;
616 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
619 float bearing,
float bearing_sd) {
620 unichars[unichar_id].properties.bearing = bearing;
621 unichars[unichar_id].properties.bearing_sd = bearing_sd;
626 float* advance,
float* advance_sd)
const {
627 if (INVALID_UNICHAR_ID == unichar_id) {
628 *advance = *advance_sd = 0;
631 ASSERT_HOST(contains_unichar_id(unichar_id));
632 *advance = unichars[unichar_id].properties.advance;
633 *advance_sd = unichars[unichar_id].properties.advance_sd;
636 float advance,
float advance_sd) {
637 unichars[unichar_id].properties.advance = advance;
638 unichars[unichar_id].properties.advance_sd = advance_sd;
642 return unichars[unichar_id].properties.AnyRangeEmpty();
648 if (INVALID_UNICHAR_ID == unichar_id)
return true;
649 int script_id = get_script(unichar_id);
650 return script_id != han_sid_ && script_id != thai_sid_ &&
651 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
652 script_id != katakana_sid_;
659 if (INVALID_UNICHAR_ID == unichar_id)
return null_sid_;
660 ASSERT_HOST(contains_unichar_id(unichar_id));
661 return unichars[unichar_id].properties.script_id;
666 unsigned int get_properties(UNICHAR_ID unichar_id)
const;
675 char get_chartype(UNICHAR_ID unichar_id)
const;
679 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
680 ASSERT_HOST(contains_unichar_id(unichar_id));
681 return unichars[unichar_id].properties.other_case;
687 ASSERT_HOST(contains_unichar_id(unichar_id));
688 return unichars[unichar_id].properties.direction;
693 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
694 ASSERT_HOST(contains_unichar_id(unichar_id));
695 return unichars[unichar_id].properties.mirror;
700 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
701 ASSERT_HOST(contains_unichar_id(unichar_id));
702 if (unichars[unichar_id].properties.islower)
return unichar_id;
703 return unichars[unichar_id].properties.other_case;
708 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
709 ASSERT_HOST(contains_unichar_id(unichar_id));
710 if (unichars[unichar_id].properties.isupper)
return unichar_id;
711 return unichars[unichar_id].properties.other_case;
718 return get_fragment(UNICHAR_BROKEN) !=
nullptr &&
719 strcmp(id_to_unichar(UNICHAR_BROKEN),
720 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
725 bool AnyRepeatedUnicodes()
const;
730 if (INVALID_UNICHAR_ID == unichar_id)
return nullptr;
731 ASSERT_HOST(contains_unichar_id(unichar_id));
732 return unichars[unichar_id].properties.fragment;
737 return get_isalpha(unichar_to_id(unichar_repr));
742 return get_islower(unichar_to_id(unichar_repr));
747 return get_isupper(unichar_to_id(unichar_repr));
752 return get_isdigit(unichar_to_id(unichar_repr));
757 return get_ispunctuation(unichar_to_id(unichar_repr));
763 return get_properties(unichar_to_id(unichar_repr));
767 return get_chartype(unichar_to_id(unichar_repr));
774 return get_script(unichar_to_id(unichar_repr));
780 if (unichar_repr ==
nullptr || unichar_repr[0] ==
'\0' ||
781 !ids.contains(unichar_repr,
false)) {
784 return get_fragment(unichar_to_id(unichar_repr));
791 return get_isalpha(unichar_to_id(unichar_repr, length));
798 return get_islower(unichar_to_id(unichar_repr, length));
805 return get_isupper(unichar_to_id(unichar_repr, length));
812 return get_isdigit(unichar_to_id(unichar_repr, length));
819 return get_ispunctuation(unichar_to_id(unichar_repr, length));
824 if (unichar_id == UNICHAR_SPACE)
return " ";
825 return unichars[unichar_id].properties.normed.string();
831 return unichars[unichar_id].properties.normed_ids;
840 return get_script(unichar_to_id(unichar_repr, length));
845 return script_table_size_used;
850 if (
id >= script_table_size_used ||
id < 0)
852 return script_table[id];
860 int get_script_id_from_name(
const char* script_name)
const;
864 return script == null_script;
870 int add_script(
const char* script);
874 return unichars[unichar_id].properties.enabled;
892 return script_has_upper_lower_;
899 return script_has_xheight_;
910 void SetRangesOpen();
912 void SetRangesEmpty();
915 bool AnyRangeEmpty()
const;
967 char representation[UNICHAR_LEN + 1];
982 void encode_string(
const char* str,
int str_index,
int str_length,
985 int* best_total_length,
994 bool GetStrProperties(
const char* utf8_str,
1001 bool skip_fragments);
1006 static const char* kCleanupMaps[][2];
1043 #endif // TESSERACT_CCUTIL_UNICHARSET_H_
Direction direction
Definition: unicharset.h:946
int size_reserved
Definition: unicharset.h:1012
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:635
UNICHAR_ID mirror
Definition: unicharset.h:953
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:618
bool top_bottom_useful() const
Definition: unicharset.h:532
void delete_pointers_in_unichars()
Definition: unicharset.h:293
bool script_has_xheight_
Definition: unicharset.h:1022
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:467
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:789
bool islower
Definition: unicharset.h:922
STRING debug_str(const char *unichar_repr) const
Definition: unicharset.h:249
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:810
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:736
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:625
float width
Definition: unicharset.h:937
int size_used
Definition: unicharset.h:1011
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:817
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:477
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:849
bool has_special_codes() const
Definition: unicharset.h:717
bool script_has_upper_lower() const
Definition: unicharset.h:891
int default_sid_
Definition: unicharset.h:1040
int script_table_size_used
Definition: unicharset.h:1014
uint8_t min_top
Definition: unicharset.h:934
bool script_has_upper_lower_
Definition: unicharset.h:1019
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:99
bool load_from_inmemory_file(const char *const memory, int mem_size)
Definition: unicharset.h:376
bool ispunctuation
Definition: unicharset.h:925
int han_sid() const
Definition: unicharset.h:883
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:699
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:86
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:803
uint8_t min_bottom
Definition: unicharset.h:932
bool old_style_included_
Definition: unicharset.h:1024
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:751
Direction
Definition: unicharset.h:157
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:361
int null_sid() const
Definition: unicharset.h:878
void set_unichar(const char *uch)
Definition: unicharset.h:65
uint8_t max_top
Definition: unicharset.h:935
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:779
int thai_sid_
Definition: unicharset.h:1037
Definition: unicharset.h:146
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
float bearing_sd
Definition: unicharset.h:941
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:259
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:773
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:602
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
int size() const
Definition: unicharset.h:336
int script_table_size_reserved
Definition: unicharset.h:1015
int32_t length() const
Definition: strngs.cpp:191
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
bool isdigit
Definition: unicharset.h:924
Definition: serialis.h:77
float advance
Definition: unicharset.h:942
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:540
void set_total(int t)
Definition: unicharset.h:70
int cyrillic_sid() const
Definition: unicharset.h:881
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:823
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
int null_sid_
Definition: unicharset.h:1029
int hiragana_sid_
Definition: unicharset.h:1035
int get_script_table_size() const
Definition: unicharset.h:844
int script_id
Definition: unicharset.h:944
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
bool load_from_file(FILE *file)
Definition: unicharset.h:398
char ** script_table
Definition: unicharset.h:1013
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
bool isupper
Definition: unicharset.h:923
float bearing
Definition: unicharset.h:940
int get_total() const
Definition: unicharset.h:73
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:766
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:591
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:608
float advance_sd
Definition: unicharset.h:943
int16_t pos
Definition: unicharset.h:139
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:462
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:641
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:678
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:741
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:746
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:685
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:647
int get_pos() const
Definition: unicharset.h:72
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:91
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:472
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:756
int cyrillic_sid_
Definition: unicharset.h:1032
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:796
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:59
Definition: unicharset.h:168
bool is_natural() const
Definition: unicharset.h:114
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
bool script_has_xheight() const
Definition: unicharset.h:898
bool enabled
Definition: unicharset.h:927
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:426
bool top_bottom_set_
Definition: unicharset.h:1017
int latin_sid() const
Definition: unicharset.h:880
Definition: unicharmap.h:27
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:762
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:457
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:692
UNICHAR_PROPERTIES properties
Definition: unicharset.h:968
uint8_t max_bottom
Definition: unicharset.h:933
int common_sid_
Definition: unicharset.h:1030
bool natural
Definition: unicharset.h:138
int common_sid() const
Definition: unicharset.h:879
STRING normed
Definition: unicharset.h:958
const char * get_unichar() const
Definition: unicharset.h:71
int default_sid() const
Definition: unicharset.h:888
void set_natural(bool value)
Definition: unicharset.h:115
int greek_sid_
Definition: unicharset.h:1033
bool is_null_script(const char *script) const
Definition: unicharset.h:863
CHAR_FRAGMENT * fragment
Definition: unicharset.h:963
bool isalpha
Definition: unicharset.h:921
UNICHAR_SLOT * unichars
Definition: unicharset.h:1009
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:707
bool is_beginning() const
Definition: unicharset.h:106
int greek_sid() const
Definition: unicharset.h:882
void set_pos(int p)
Definition: unicharset.h:69
STRING to_string() const
Definition: unicharset.h:80
int hangul_sid_
Definition: unicharset.h:1038
bool save_to_file(const char *const filename) const
Definition: unicharset.h:345
bool is_ending() const
Definition: unicharset.h:109
UNICHAR_ID other_case
Definition: unicharset.h:945
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:521
int katakana_sid() const
Definition: unicharset.h:885
Definition: unicharset.h:966
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:838
int katakana_sid_
Definition: unicharset.h:1036
int thai_sid() const
Definition: unicharset.h:886
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:658
UNICHARMAP ids
Definition: unicharset.h:1010
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:147
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
static const char * null_script
Definition: unicharset.h:1007
void clear()
Definition: unicharset.h:301
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:264
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
int16_t total
Definition: unicharset.h:140
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:383
bool isngram
Definition: unicharset.h:926
bool save_to_file(FILE *file) const
Definition: unicharset.h:355
GenericVector< UNICHAR_ID > normed_ids
Definition: unicharset.h:957
int hangul_sid() const
Definition: unicharset.h:887
Definition: unicharset.h:904
Definition: unicharset.h:49
int hiragana_sid() const
Definition: unicharset.h:884
float width_sd
Definition: unicharset.h:938
int han_sid_
Definition: unicharset.h:1034
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:577
bool load_from_file(const char *const filename)
Definition: unicharset.h:391
int latin_sid_
Definition: unicharset.h:1031