tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
fontinfo.h
// File: fontinfo.h
// Description: Font information classes abstracted from intproto.h/cpp.
// Author: rays@google.com (Ray Smith)
// Created: Tue May 17 17:08:01 PDT 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//


21 #ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
22 #define TESSERACT_CCSTRUCT_FONTINFO_H_
23 
24 #include <cstdint> // for uint16_t, uint32_t
25 #include <cstdio> // for FILE
26 #include "errcode.h"
27 #include "genericvector.h"
28 #include "unichar.h"
29 
30 template <typename T> class UnicityTable;
31 
32 namespace tesseract {
33 
// Simple struct to hold a font and a score. The scores come from the low-level
// integer matcher, so they are in the uint16_t range. Fonts are an index to
// fontinfo_table.
// These get copied around a lot, so best to keep them small.
struct ScoredFont {
  // Default: no font (id -1) and a zero score.
  ScoredFont() : fontinfo_id(-1), score(0) {}
  // Pairs the given font id with the given classifier score.
  ScoredFont(int font_id, uint16_t classifier_score)
      : fontinfo_id(font_id), score(classifier_score) {}

  // Index into fontinfo table, but inside the classifier, may be a shapetable
  // index.
  int32_t fontinfo_id;
  // Raw score from the low-level classifier.
  uint16_t score;
};
49 
50 // Struct for information about spacing between characters in a particular font.
52  int16_t x_gap_before;
53  int16_t x_gap_after;
56 };
57 
58 /*
59  * font_properties contains properties about boldness, italicness, fixed pitch,
60  * serif, fraktur
61  */
62 struct FontInfo {
63  FontInfo() : name(nullptr), properties(0), universal_id(0), spacing_vec(nullptr) {}
64  ~FontInfo() = default;
65 
66  // Writes to the given file. Returns false in case of error.
67  bool Serialize(FILE* fp) const;
68  // Reads from the given file. Returns false in case of error.
69  // If swap is true, assumes a big/little-endian swap is needed.
70  bool DeSerialize(TFile* fp);
71 
72  // Reserves unicharset_size spots in spacing_vec.
73  void init_spacing(int unicharset_size) {
74  spacing_vec = new GenericVector<FontSpacingInfo *>();
75  spacing_vec->init_to_size(unicharset_size, nullptr);
76  }
77  // Adds the given pointer to FontSpacingInfo to spacing_vec member
78  // (FontInfo class takes ownership of the pointer).
79  // Note: init_spacing should be called before calling this function.
80  void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
81  ASSERT_HOST(spacing_vec != nullptr && spacing_vec->size() > uch_id);
82  (*spacing_vec)[uch_id] = spacing_info;
83  }
84 
85  // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.
86  const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
87  return (spacing_vec == nullptr || spacing_vec->size() <= uch_id) ?
88  nullptr : (*spacing_vec)[uch_id];
89  }
90 
91  // Fills spacing with the value of the x gap expected between the two given
92  // UNICHAR_IDs. Returns true on success.
93  bool get_spacing(UNICHAR_ID prev_uch_id,
94  UNICHAR_ID uch_id,
95  int *spacing) const {
96  const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
97  const FontSpacingInfo *fsi = this->get_spacing(uch_id);
98  if (prev_fsi == nullptr || fsi == nullptr) return false;
99  int i = 0;
100  for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
101  if (prev_fsi->kerned_unichar_ids[i] == uch_id) break;
102  }
103  if (i < prev_fsi->kerned_unichar_ids.size()) {
104  *spacing = prev_fsi->kerned_x_gaps[i];
105  } else {
106  *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
107  }
108  return true;
109  }
110 
111  bool is_italic() const { return properties & 1; }
112  bool is_bold() const { return (properties & 2) != 0; }
113  bool is_fixed_pitch() const { return (properties & 4) != 0; }
114  bool is_serif() const { return (properties & 8) != 0; }
115  bool is_fraktur() const { return (properties & 16) != 0; }
116 
117  char* name;
118  uint32_t properties;
119  // The universal_id is a field reserved for the initialization process
120  // to assign a unique id number to all fonts loaded for the current
121  // combination of languages. This id will then be returned by
122  // ResultIterator::WordFontAttributes.
123  int32_t universal_id;
124  // Horizontal spacing between characters (indexed by UNICHAR_ID).
126 };
127 
// Every class (character) owns a FontSet that represents all the fonts that can
// render this character.
// Since almost all the characters from the same script share the same set of
// fonts, the sets are shared over multiple classes (see
// Classify::fontset_table_). Thus, a class only stores an id to a set.
// Because some fonts cannot render just one character of a set, there are a
// lot of FontSets that differ only by one font. Rather than storing directly
// the FontInfo in the FontSet structure, it's better to share FontInfos among
// FontSets (Classify::fontinfo_table_).
struct FontSet {
  // Number of entries in configs.
  int size;
  // FontInfo ids.
  int* configs;
};
141 
142 // Class that adds a bit of functionality on top of GenericVector to
143 // implement a table of FontInfo that replaces UniCityTable<FontInfo>.
144 // TODO(rays) change all references once all existing traineddata files
145 // are replaced.
146 class FontInfoTable : public GenericVector<FontInfo> {
147  public:
148  FontInfoTable();
149  ~FontInfoTable();
150 
151  // Writes to the given file. Returns false in case of error.
152  bool Serialize(FILE* fp) const;
153  // Reads from the given file. Returns false in case of error.
154  // If swap is true, assumes a big/little-endian swap is needed.
155  bool DeSerialize(TFile* fp);
156 
157  // Returns true if the given set of fonts includes one with the same
158  // properties as font_id.
159  bool SetContainsFontProperties(
160  int font_id, const GenericVector<ScoredFont>& font_set) const;
161  // Returns true if the given set of fonts includes multiple properties.
162  bool SetContainsMultipleFontProperties(
163  const GenericVector<ScoredFont>& font_set) const;
164 
165  // Moves any non-empty FontSpacingInfo entries from other to this.
166  void MoveSpacingInfoFrom(FontInfoTable* other);
167  // Moves this to the target unicity table.
168  void MoveTo(UnicityTable<FontInfo>* target);
169 };
170 
171 // Compare FontInfo structures.
172 bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2);
173 // Compare FontSet structures.
174 bool CompareFontSet(const FontSet& fs1, const FontSet& fs2);
175 // Deletion callbacks for GenericVector.
178 
179 // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
180 bool read_info(TFile* f, FontInfo* fi);
181 bool write_info(FILE* f, const FontInfo& fi);
182 bool read_spacing_info(TFile* f, FontInfo* fi);
183 bool write_spacing_info(FILE* f, const FontInfo& fi);
184 bool read_set(TFile* f, FontSet* fs);
185 bool write_set(FILE* f, const FontSet& fs);
186 
187 } // namespace tesseract.
188 
189 #endif /* TESSERACT_CCSTRUCT_FONTINFO_H_ */
bool write_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:162
int16_t x_gap_before
Definition: fontinfo.h:52
Definition: fontinfo.h:62
bool is_fixed_pitch() const
Definition: fontinfo.h:113
ScoredFont()
Definition: fontinfo.h:39
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
bool read_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:152
GenericVector< FontSpacingInfo * > * spacing_vec
Definition: fontinfo.h:125
bool CompareFontSet(const FontSet &fs1, const FontSet &fs2)
Definition: fontinfo.cpp:128
int * configs
Definition: fontinfo.h:139
GenericVector< UNICHAR_ID > kerned_unichar_ids
Definition: fontinfo.h:54
bool write_spacing_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:197
bool is_fraktur() const
Definition: fontinfo.h:115
Definition: serialis.h:77
uint32_t properties
Definition: fontinfo.h:118
Definition: baseapi.cpp:94
bool write_set(FILE *f, const FontSet &fs)
Definition: fontinfo.cpp:231
ScoredFont(int font_id, uint16_t classifier_score)
Definition: fontinfo.h:40
Definition: fontinfo.h:38
bool is_italic() const
Definition: fontinfo.h:111
int32_t fontinfo_id
Definition: fontinfo.h:45
char * name
Definition: fontinfo.h:117
void FontSetDeleteCallback(FontSet fs)
Definition: fontinfo.cpp:146
Definition: fontinfo.h:30
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:139
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
int size
Definition: fontinfo.h:138
int16_t x_gap_after
Definition: fontinfo.h:53
FontInfo()
Definition: fontinfo.h:63
int32_t universal_id
Definition: fontinfo.h:123
void init_spacing(int unicharset_size)
Definition: fontinfo.h:73
Definition: fontinfo.h:146
bool read_spacing_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:169
int size() const
Definition: genericvector.h:71
bool is_bold() const
Definition: fontinfo.h:112
Definition: fontinfo.h:137
void init_to_size(int size, const T &t)
Definition: genericvector.h:708
bool is_serif() const
Definition: fontinfo.h:114
GenericVector< int16_t > kerned_x_gaps
Definition: fontinfo.h:55
bool read_set(TFile *f, FontSet *fs)
Definition: fontinfo.cpp:225
uint16_t score
Definition: fontinfo.h:47
bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const
Definition: fontinfo.h:93
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:27
const FontSpacingInfo * get_spacing(UNICHAR_ID uch_id) const
Definition: fontinfo.h:86
Definition: fontinfo.h:51
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
Definition: fontinfo.h:80