38 #include "tesscallback.h" 42 #define NO_EDGE (int64_t) 0xffffffffffffffffi64 45 #define NO_EDGE (int64_t) 0xffffffffffffffffll 53 using EDGE_RECORD = uint64_t;
54 using EDGE_ARRAY = EDGE_RECORD *;
55 using EDGE_REF = int64_t;
56 using NODE_REF = int64_t;
57 using NODE_MAP = EDGE_REF *;
64 NodeChild(UNICHAR_ID
id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
65 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
85 #define FORWARD_EDGE (int32_t) 0 86 #define BACKWARD_EDGE (int32_t) 1 87 #define MAX_NODE_EDGES_DISPLAY (int64_t) 100 88 #define MARKER_FLAG (int64_t) 1 89 #define DIRECTION_FLAG (int64_t) 2 90 #define WERD_END_FLAG (int64_t) 4 91 #define LETTER_START_BIT 0 92 #define NUM_FLAG_BITS 3 93 #define REFFORMAT "%" PRId64 122 static const int16_t kDawgMagicNumber = 42;
126 static const UNICHAR_ID kPatternUnicharID = 0;
130 inline PermuterType
permuter()
const {
return perm_; }
139 bool prefix_in_dawg(
const WERD_CHOICE &prefix,
bool requires_complete)
const;
143 int check_for_words(
const char *filename,
145 bool enable_wildcard)
const;
149 void iterate_words(
const UNICHARSET &unicharset,
154 void iterate_words(
const UNICHARSET &unicharset,
160 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID
unichar_id,
161 bool word_end)
const = 0;
166 bool word_end)
const = 0;
170 virtual NODE_REF next_node(EDGE_REF
edge_ref)
const = 0;
174 virtual bool end_of_word(EDGE_REF edge_ref)
const = 0;
177 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref)
const = 0;
181 virtual void print_node(NODE_REF node,
int max_num_edges)
const = 0;
197 EDGE_REF edge_ref, UNICHAR_ID unichar_id,
bool word_end)
const {
210 debug_level_(debug_level) {}
214 return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
218 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
222 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
223 BACKWARD_EDGE : FORWARD_EDGE;
227 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
231 const EDGE_RECORD &edge_rec)
const {
232 return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
236 EDGE_RECORD *edge_rec, EDGE_REF value) {
237 *edge_rec &= (~next_node_mask_);
238 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
242 *edge_rec |= (MARKER_FLAG << flag_start_bit_);
253 UNICHAR_ID unichar_id,
254 const EDGE_RECORD &edge_rec)
const {
255 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
256 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
257 bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
258 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
259 curr_word_end, curr_unichar_id))
return 0;
260 if (unichar_id > curr_unichar_id)
return 1;
261 if (unichar_id == curr_unichar_id) {
262 if (next_node > curr_next_node)
return 1;
263 if (next_node == curr_next_node) {
264 if (word_end > curr_word_end)
return 1;
274 UNICHAR_ID unichar_id,
275 NODE_REF other_next_node,
277 UNICHAR_ID other_unichar_id)
const {
278 return ((unichar_id == other_unichar_id) &&
279 (next_node == NO_EDGE || next_node == other_next_node) &&
280 (!word_end || (word_end == other_word_end)));
285 void init(
int unicharset_size);
293 NODE_REF node, UNICHAR_ID wildcard)
const;
296 void iterate_words_rec(
const WERD_CHOICE &word_so_far,
356 : dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE),
357 back_to_punc(false) {}
359 int punc_idx, EDGE_REF puncref,
361 : dawg_index(dawg_idx), dawg_ref(dawgref),
362 punc_index(punc_idx), punc_ref(puncref),
363 back_to_punc(backtopunc) {
391 const char *debug_msg) {
392 for (
int i = 0; i < size_used_; ++i) {
393 if (data_[i] == new_pos)
return false;
397 tprintf(
"%s[%d, " REFFORMAT
"] [punc: " REFFORMAT
"%s]\n",
417 :
Dawg(type, lang, perm, debug_level) {}
419 PermuterType perm,
int debug_level)
420 :
Dawg(type, lang, perm, debug_level) {
422 ASSERT_HOST(file.
Open(filename,
nullptr));
423 ASSERT_HOST(read_squished_dawg(&file));
424 num_forward_edges_in_node0 = num_forward_edges(0);
427 const STRING &lang, PermuterType perm,
int unicharset_size,
429 :
Dawg(type, lang, perm, debug_level),
431 num_edges_(num_edges) {
432 init(unicharset_size);
433 num_forward_edges_in_node0 = num_forward_edges(0);
434 if (debug_level > 3) print_all(
"SquishedDawg:");
440 if (!read_squished_dawg(fp))
return false;
441 num_forward_edges_in_node0 = num_forward_edges(0);
448 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID
unichar_id,
449 bool word_end)
const;
454 bool word_end)
const {
455 EDGE_REF edge = node;
456 if (!edge_occupied(edge) || edge == NO_EDGE)
return;
457 assert(forward_edge(edge));
459 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
462 }
while (!last_edge(edge++));
468 return next_node_from_edge_rec((edges_[edge]));
474 return end_of_word_from_edge_rec((edges_[edge_ref]));
479 return unichar_id_from_edge_rec((edges_[edge_ref]));
484 void print_node(NODE_REF node,
int max_num_edges)
const;
487 bool write_squished_dawg(
TFile *file);
494 if (!this->write_squished_dawg(&file)) {
495 tprintf(
"Error serializing %s\n", filename);
499 tprintf(
"Error writing file %s\n", filename);
508 set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
512 (edges_[
edge_ref] = next_node_mask_);
516 for (
int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
520 (edges_[
edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
524 return (edge_occupied(edge_ref) &&
525 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
529 return (edge_occupied(edge_ref) &&
530 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
534 return (edges_[edge_ref] != next_node_mask_);
538 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
542 int32_t num_forward_edges(NODE_REF node)
const;
545 bool read_squished_dawg(
TFile *file);
548 void print_edge(EDGE_REF edge)
const;
552 tprintf(
"\n__________________________\n%s\n", msg);
553 for (
int i = 0; i < num_edges_; ++i) print_edge(i);
554 tprintf(
"__________________________\n");
557 std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes)
const;
567 #endif // DICT_DAWG_H_ DawgPosition()
Definition: dawg.h:355
DawgType
Definition: dawg.h:72
bool forward_edge(EDGE_REF edge_ref) const
Returns true if this edge is in the forward direction.
Definition: dawg.h:523
int32_t num_edges_
Definition: dawg.h:561
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:196
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:241
EDGE_REF edge_ref
Definition: dawg.h:63
void set_next_node(EDGE_REF edge_ref, EDGE_REF value)
Sets the next node link for this edge.
Definition: dawg.h:507
int unicharset_size_
Definition: dawg.h:309
SquishedDawg(const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:418
uint64_t flags_mask_
Definition: dawg.h:313
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
int push_back(T object)
Definition: genericvector.h:799
Definition: unicharset.h:146
static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT]
Definition: dawg.h:95
NodeChild(UNICHAR_ID id, EDGE_REF ref)
Definition: dawg.h:64
uint64_t next_node_mask_
Definition: dawg.h:312
void print_all(const char *msg)
Prints the contents of the SquishedDawg.
Definition: dawg.h:551
bool backward_edge(EDGE_REF edge_ref) const
Returns true if this edge is in the backward direction.
Definition: dawg.h:528
Definition: serialis.h:77
DawgType type() const
Definition: dawg.h:128
UNICHAR_ID unichar_id
Definition: dawg.h:62
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:272
bool write_squished_dawg(const char *filename)
Definition: dawg.h:491
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:251
PermuterType permuter() const
Definition: dawg.h:130
int8_t punc_index
Definition: dawg.h:375
Definition: baseapi.cpp:94
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:185
bool edge_occupied(EDGE_REF edge_ref) const
Returns true if the edge spot in this location is occupied.
Definition: dawg.h:533
const STRING & lang() const
Definition: dawg.h:129
bool end_of_word(EDGE_REF edge_ref) const
Definition: dawg.h:473
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:478
EDGE_ARRAY edges_
Definition: dawg.h:560
EDGE_REF dawg_ref
Definition: dawg.h:374
int debug_level_
Definition: dawg.h:316
bool operator==(const DawgPosition &other)
Definition: dawg.h:365
void clear()
Definition: dawg.h:385
STRING lang_
Definition: dawg.h:302
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:196
int NumEdges()
Definition: dawg.h:445
uint64_t letter_mask_
Definition: dawg.h:314
bool CloseWrite(const STRING &filename, FileWriter writer)
Definition: serialis.cpp:310
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:221
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:205
void clear_all_edges()
Goes through all the edges and clears each one out.
Definition: dawg.h:515
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const
Definition: dawg.h:453
int num_forward_edges_in_node0
Definition: dawg.h:562
NodeChild()
Definition: dawg.h:65
int8_t dawg_index
Definition: dawg.h:373
DawgType type_
Definition: dawg.h:301
int next_node_start_bit_
Definition: dawg.h:311
static const char kWildcard[]
Definition: dawg.h:102
bool back_to_punc
Definition: dawg.h:378
void set_empty_edge(EDGE_REF edge_ref)
Sets the edge to be empty.
Definition: dawg.h:511
bool last_edge(EDGE_REF edge_ref) const
Returns true if this edge is the last edge in a sequence.
Definition: dawg.h:537
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: dawg.h:426
NODE_REF next_node(EDGE_REF edge) const
Definition: dawg.h:467
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:415
EDGE_REF punc_ref
Definition: dawg.h:376
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:217
void clear_marker_flag(EDGE_REF edge_ref)
Clears the last flag of this edge.
Definition: dawg.h:519
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:304
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:235
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:389
bool Load(TFile *fp)
Definition: dawg.h:439
DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, bool backtopunc)
Definition: dawg.h:358
int flag_start_bit_
Definition: dawg.h:310
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:295