29 #include "genericvector.h" 43 using EDGE_INDEX = int64_t ;
44 using NODE_MARKER =
bool *;
70 static const int kSaneNumConcreteChars = 0;
74 static const char kAlphaPatternUnicode[];
75 static const char kDigitPatternUnicode[];
76 static const char kAlphanumPatternUnicode[];
77 static const char kPuncPatternUnicode[];
78 static const char kLowerPatternUnicode[];
79 static const char kUpperPatternUnicode[];
81 static const char *get_reverse_policy_name(
89 int unicharset_size,
int debug_level)
90 :
Dawg(type, lang, perm, debug_level) {
91 init(unicharset_size);
93 deref_node_index_mask_ = ~letter_mask_;
95 initialized_patterns_ =
false;
97 virtual ~Trie() { nodes_.delete_data_pointers(); }
104 bool word_end)
const {
105 EDGE_RECORD *edge_ptr;
106 EDGE_INDEX edge_index;
107 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
108 &edge_ptr, &edge_index))
return NO_EDGE;
109 return make_edge_ref(node_ref, edge_index);
117 bool word_end)
const {
119 nodes_[
static_cast<int>(node)]->forward_edges;
120 for (
int i = 0; i < forward_edges.
size(); ++i) {
121 if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
123 make_edge_ref(node, i)));
133 if (edge_ref == NO_EDGE || num_edges_ == 0)
return NO_EDGE;
134 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
142 if (edge_ref == NO_EDGE || num_edges_ == 0)
return false;
143 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
148 if (edge_ref == NO_EDGE || num_edges_ == 0)
return INVALID_UNICHAR_ID;
149 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
154 *edge_rec &= ~letter_mask_;
155 *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
158 return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
163 void print_node(NODE_REF node,
int max_num_edges)
const;
175 bool read_and_add_word_list(
const char *filename,
181 bool read_word_list(
const char *filename,
231 bool read_pattern_list(
const char *filename,
const UNICHARSET &unicharset);
235 void initialize_patterns(
UNICHARSET *unicharset);
239 void unichar_id_to_patterns(UNICHAR_ID unichar_id,
247 UNICHAR_ID unichar_id,
248 bool word_end)
const {
249 if (edge_ref == NO_EDGE)
return NO_EDGE;
250 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
251 return (marker_flag_from_edge_rec(*edge_rec) &&
252 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
253 word_end == end_of_word_from_edge_rec(*edge_rec)) ?
269 return add_word_to_dawg(word,
nullptr);
291 int edge_index =
static_cast<int>(
292 (edge_ref & letter_mask_) >> LETTER_START_BIT);
293 int node_index =
static_cast<int>(
294 (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
300 EDGE_INDEX edge_index)
const {
301 return ((node_index << flag_start_bit_) |
302 (edge_index << LETTER_START_BIT));
305 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt,
bool repeats,
306 int direction,
bool word_end, UNICHAR_ID unichar_id) {
307 EDGE_RECORD flags = 0;
308 if (repeats) flags |= MARKER_FLAG;
309 if (word_end) flags |= WERD_END_FLAG;
310 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
311 *edge = ((nxt << next_node_start_bit_) |
312 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
313 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
317 tprintf(
"|" REFFORMAT
"|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
318 marker_flag_from_edge_rec(edge_rec) ?
"R," :
"",
319 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ?
"F" :
"B",
320 end_of_word_from_edge_rec(edge_rec) ?
",E" :
"",
321 unichar_id_from_edge_rec(edge_rec));
326 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
327 return (node_ref != NO_EDGE &&
334 tprintf(
"\n__________________________\n%s\n", msg);
335 for (
int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
336 tprintf(
"__________________________\n");
343 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
344 int direction,
bool word_end, UNICHAR_ID unichar_id,
345 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index)
const;
349 bool add_edge_linkage(NODE_REF node1, NODE_REF node2,
bool repeats,
350 int direction,
bool word_end,
351 UNICHAR_ID unichar_id);
356 bool repeats,
bool word_end, UNICHAR_ID unichar_id) {
357 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
358 word_end, unichar_id) &&
359 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
360 word_end, unichar_id));
365 void add_word_ending(EDGE_RECORD *edge,
366 NODE_REF the_next_node,
368 UNICHAR_ID unichar_id);
371 NODE_REF new_dawg_node();
375 void remove_edge_linkage(NODE_REF node1, NODE_REF node2,
int direction,
376 bool word_end, UNICHAR_ID unichar_id);
381 bool word_end, UNICHAR_ID unichar_id) {
382 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
383 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
389 bool eliminate_redundant_edges(NODE_REF node,
const EDGE_RECORD &edge1,
390 const EDGE_RECORD &edge2);
397 bool reduce_lettered_edges(EDGE_INDEX edge_index,
398 UNICHAR_ID unichar_id,
401 NODE_MARKER reduced_nodes);
412 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
415 UNICHAR_ID character_class_to_pattern(
char ch);
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:325
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:299
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:153
DawgType
Definition: dawg.h:72
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Definition: trie.h:147
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:333
uint64_t num_edges_
Definition: trie.h:419
Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: trie.h:88
UNICHAR_ID alpha_pattern_
Definition: trie.h:427
EDGE_VECTOR backward_edges
Definition: trie.h:49
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
int push_back(T object)
Definition: genericvector.h:799
Definition: unicharset.h:146
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:355
RTLReversePolicy
Definition: trie.h:63
Definition: baseapi.cpp:94
virtual ~Trie()
Definition: trie.h:97
UNICHAR_ID punc_pattern_
Definition: trie.h:430
bool initialized_patterns_
Definition: trie.h:426
uint64_t deref_direction_mask_
Definition: trie.h:420
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:157
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:290
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:316
uint64_t deref_node_index_mask_
Definition: trie.h:421
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:305
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:246
bool end_of_word(EDGE_REF edge_ref) const
Definition: trie.h:141
UNICHAR_ID digit_pattern_
Definition: trie.h:428
UNICHAR_ID upper_pattern_
Definition: trie.h:432
UNICHAR_ID alphanum_pattern_
Definition: trie.h:429
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
TRIE_NODES nodes_
Definition: trie.h:418
int size() const
Definition: genericvector.h:71
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:423
bool add_word_to_dawg(const WERD_CHOICE &word)
Definition: trie.h:268
EDGE_VECTOR forward_edges
Definition: trie.h:48
UNICHAR_ID lower_pattern_
Definition: trie.h:431
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const
Definition: trie.h:116
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:380