tesseract  3.05.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
trie.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: trie.c (Formerly trie.c)
5  * Description: Functions to build a trie data structure.
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 #ifdef _MSC_VER
29 #pragma warning(disable:4244) // Conversion warnings
30 #pragma warning(disable:4800) // int/bool warnings
31 #endif
32 #include "trie.h"
33 
34 #include "callcpp.h"
35 #include "dawg.h"
36 #include "dict.h"
37 #include "freelist.h"
38 #include "genericvector.h"
39 #include "helpers.h"
40 #include "kdpair.h"
41 
42 namespace tesseract {
43 
44 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
45 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
46 const char kForceReverse[] = "RRP_FORCE_REVERSE";
47 
48 const char * const RTLReversePolicyNames[] = {
52 };
53 
54 const char Trie::kAlphaPatternUnicode[] = "\u2000";
55 const char Trie::kDigitPatternUnicode[] = "\u2001";
56 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
57 const char Trie::kPuncPatternUnicode[] = "\u2003";
58 const char Trie::kLowerPatternUnicode[] = "\u2004";
59 const char Trie::kUpperPatternUnicode[] = "\u2005";
60 
61 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
62  return RTLReversePolicyNames[reverse_policy];
63 }
64 
65 // Reset the Trie to empty.
66 void Trie::clear() {
68  nodes_.clear();
70  num_edges_ = 0;
71  new_dawg_node(); // Need to allocate node 0.
72 }
73 
74 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
75  int direction, bool word_end, UNICHAR_ID unichar_id,
76  EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
77  if (debug_level_ == 3) {
78  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
79  " direction %d word_end %d unichar_id %d, exploring node:\n",
80  node_ref, next_node, direction, word_end, unichar_id);
81  if (node_ref != NO_EDGE) {
82  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
83  }
84  }
85  if (node_ref == NO_EDGE) return false;
86  assert(node_ref < nodes_.size());
87  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
88  nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
89  int vec_size = vec.size();
90  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
91  EDGE_INDEX start = 0;
92  EDGE_INDEX end = vec_size - 1;
93  EDGE_INDEX k;
94  int compare;
95  while (start <= end) {
96  k = (start + end) >> 1; // (start + end) / 2
97  compare = given_greater_than_edge_rec(next_node, word_end,
98  unichar_id, vec[k]);
99  if (compare == 0) { // given == vec[k]
100  *edge_ptr = &(vec[k]);
101  *edge_index = k;
102  return true;
103  } else if (compare == 1) { // given > vec[k]
104  start = k + 1;
105  } else { // given < vec[k]
106  end = k - 1;
107  }
108  }
109  } else { // linear search
110  for (int i = 0; i < vec_size; ++i) {
111  EDGE_RECORD &edge_rec = vec[i];
112  if (edge_rec_match(next_node, word_end, unichar_id,
113  next_node_from_edge_rec(edge_rec),
114  end_of_word_from_edge_rec(edge_rec),
115  unichar_id_from_edge_rec(edge_rec))) {
116  *edge_ptr = &(edge_rec);
117  *edge_index = i;
118  return true;
119  }
120  }
121  }
122  return false; // not found
123 }
124 
125 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
126  int direction, bool word_end,
127  UNICHAR_ID unichar_id) {
128  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
129  &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
130  int search_index;
131  if (node1 == 0 && direction == FORWARD_EDGE) {
132  search_index = 0; // find the index to make the add sorted
133  while (search_index < vec->size() &&
134  given_greater_than_edge_rec(node2, word_end, unichar_id,
135  (*vec)[search_index]) == 1) {
136  search_index++;
137  }
138  } else {
139  search_index = vec->size(); // add is unsorted, so index does not matter
140  }
141  EDGE_RECORD edge_rec;
142  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
143  if (node1 == 0 && direction == BACKWARD_EDGE &&
145  EDGE_INDEX edge_index = root_back_freelist_.pop_back();
146  (*vec)[edge_index] = edge_rec;
147  } else if (search_index < vec->size()) {
148  vec->insert(edge_rec, search_index);
149  } else {
150  vec->push_back(edge_rec);
151  }
152  if (debug_level_ > 1) {
153  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
154  print_edge_rec(edge_rec);
155  tprintf("\n");
156  }
157  num_edges_++;
158  return true;
159 }
160 
162  NODE_REF the_next_node,
163  bool marker_flag,
164  UNICHAR_ID unichar_id) {
165  EDGE_RECORD *back_edge_ptr;
166  EDGE_INDEX back_edge_index;
167  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
168  unichar_id, &back_edge_ptr, &back_edge_index));
169  if (marker_flag) {
170  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
171  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
172  }
173  // Mark both directions as end of word.
174  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
175  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
176 }
177 
179  const GenericVector<bool> *repetitions) {
180  if (word.length() <= 0) return false; // can't add empty words
181  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
182  // Make sure the word does not contain invalid unchar ids.
183  for (int i = 0; i < word.length(); ++i) {
184  if (word.unichar_id(i) < 0 ||
185  word.unichar_id(i) >= unicharset_size_) return false;
186  }
187 
188  EDGE_RECORD *edge_ptr;
189  NODE_REF last_node = 0;
190  NODE_REF the_next_node;
191  bool marker_flag = false;
192  EDGE_INDEX edge_index;
193  int i;
194  inT32 still_finding_chars = true;
195  inT32 word_end = false;
196  bool add_failed = false;
197  bool found;
198 
199  if (debug_level_ > 1) word.print("\nAdding word: ");
200 
201  UNICHAR_ID unichar_id;
202  for (i = 0; i < word.length() - 1; ++i) {
203  unichar_id = word.unichar_id(i);
204  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
205  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
206  if (still_finding_chars) {
207  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
208  unichar_id, &edge_ptr, &edge_index);
209  if (found && debug_level_ > 1) {
210  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
211  edge_index, last_node);
212  }
213  if (!found) {
214  still_finding_chars = false;
215  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
216  // We hit the end of an existing word, but the new word is longer.
217  // In this case we have to disconnect the existing word from the
218  // backwards root node, mark the current position as end-of-word
219  // and add new nodes for the increased length. Disconnecting the
220  // existing word from the backwards root node requires a linear
221  // search, so it is much faster to add the longest words first,
222  // to avoid having to come here.
223  word_end = true;
224  still_finding_chars = false;
225  remove_edge(last_node, 0, word_end, unichar_id);
226  } else {
227  // We have to add a new branch here for the new word.
228  if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
229  last_node = next_node_from_edge_rec(*edge_ptr);
230  }
231  }
232  if (!still_finding_chars) {
233  the_next_node = new_dawg_node();
234  if (debug_level_ > 1)
235  tprintf("adding node " REFFORMAT "\n", the_next_node);
236  if (the_next_node == 0) {
237  add_failed = true;
238  break;
239  }
240  if (!add_new_edge(last_node, the_next_node,
241  marker_flag, word_end, unichar_id)) {
242  add_failed = true;
243  break;
244  }
245  word_end = false;
246  last_node = the_next_node;
247  }
248  }
249  the_next_node = 0;
250  unichar_id = word.unichar_id(i);
251  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
252  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
253  if (still_finding_chars &&
254  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
255  unichar_id, &edge_ptr, &edge_index)) {
256  // An extension of this word already exists in the trie, so we
257  // only have to add the ending flags in both directions.
258  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
259  marker_flag, unichar_id);
260  } else {
261  // Add a link to node 0. All leaves connect to node 0 so the back links can
262  // be used in reduction to a dawg. This root backward node has one edge
263  // entry for every word, (except prefixes of longer words) so it is huge.
264  if (!add_failed &&
265  !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
266  add_failed = true;
267  }
268  if (add_failed) {
269  tprintf("Re-initializing document dictionary...\n");
270  clear();
271  return false;
272  } else {
273  return true;
274  }
275 }
276 
278  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
279  nodes_.push_back(node);
280  return nodes_.length() - 1;
281 }
282 
283 // Sort function to sort words by decreasing order of length.
284 static int sort_strings_by_dec_length(const void* v1, const void* v2) {
285  const STRING* s1 = reinterpret_cast<const STRING*>(v1);
286  const STRING* s2 = reinterpret_cast<const STRING*>(v2);
287  return s2->length() - s1->length();
288 }
289 
291  const UNICHARSET &unicharset,
292  Trie::RTLReversePolicy reverse_policy) {
293  GenericVector<STRING> word_list;
294  if (!read_word_list(filename, unicharset, reverse_policy, &word_list))
295  return false;
296  word_list.sort(sort_strings_by_dec_length);
297  return add_word_list(word_list, unicharset);
298 }
299 
301  const UNICHARSET &unicharset,
302  Trie::RTLReversePolicy reverse_policy,
303  GenericVector<STRING>* words) {
304  FILE *word_file;
305  char string[CHARS_PER_LINE];
306  int word_count = 0;
307 
308  word_file = fopen(filename, "rb");
309  if (word_file == NULL) return false;
310 
311  while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
312  chomp_string(string); // remove newline
313  WERD_CHOICE word(string, unicharset);
314  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
315  word.has_rtl_unichar_id()) ||
316  reverse_policy == RRP_FORCE_REVERSE) {
318  }
319  ++word_count;
320  if (debug_level_ && word_count % 10000 == 0)
321  tprintf("Read %d words so far\n", word_count);
322  if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
323  words->push_back(word.unichar_string());
324  } else if (debug_level_) {
325  tprintf("Skipping invalid word %s\n", string);
326  if (debug_level_ >= 3) word.print();
327  }
328  }
329  if (debug_level_)
330  tprintf("Read %d words total.\n", word_count);
331  fclose(word_file);
332  return true;
333 }
334 
336  const UNICHARSET &unicharset) {
337  for (int i = 0; i < words.size(); ++i) {
338  WERD_CHOICE word(words[i].string(), unicharset);
339  if (!word_in_dawg(word)) {
340  add_word_to_dawg(word);
341  if (!word_in_dawg(word)) {
342  tprintf("Error: word '%s' not in DAWG after adding it\n",
343  words[i].string());
344  return false;
345  }
346  }
347  }
348  return true;
349 }
350 
358  unicharset->unichar_insert(kPuncPatternUnicode);
364  initialized_patterns_ = true;
365  unicharset_size_ = unicharset->size();
366 }
367 
369  const UNICHARSET &unicharset,
370  GenericVector<UNICHAR_ID> *vec) const {
371  bool is_alpha = unicharset.get_isalpha(unichar_id);
372  if (is_alpha) {
375  if (unicharset.get_islower(unichar_id)) {
377  } else if (unicharset.get_isupper(unichar_id)) {
379  }
380  }
381  if (unicharset.get_isdigit(unichar_id)) {
383  if (!is_alpha) vec->push_back(alphanum_pattern_);
384  }
385  if (unicharset.get_ispunctuation(unichar_id)) {
386  vec->push_back(punc_pattern_);
387  }
388 }
389 
391  if (ch == 'c') {
392  return alpha_pattern_;
393  } else if (ch == 'd') {
394  return digit_pattern_;
395  } else if (ch == 'n') {
396  return alphanum_pattern_;
397  } else if (ch == 'p') {
398  return punc_pattern_;
399  } else if (ch == 'a') {
400  return lower_pattern_;
401  } else if (ch == 'A') {
402  return upper_pattern_;
403  } else {
404  return INVALID_UNICHAR_ID;
405  }
406 }
407 
409  const UNICHARSET &unicharset) {
410  if (!initialized_patterns_) {
411  tprintf("please call initialize_patterns() before read_pattern_list()\n");
412  return false;
413  }
414 
415  FILE *pattern_file = fopen(filename, "rb");
416  if (pattern_file == NULL) {
417  tprintf("Error opening pattern file %s\n", filename);
418  return false;
419  }
420 
421  int pattern_count = 0;
422  char string[CHARS_PER_LINE];
423  while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
424  chomp_string(string); // remove newline
425  // Parse the pattern and construct a unichar id vector.
426  // Record the number of repetitions of each unichar in the parallel vector.
427  WERD_CHOICE word(&unicharset);
428  GenericVector<bool> repetitions_vec;
429  const char *str_ptr = string;
430  int step = unicharset.step(str_ptr);
431  bool failed = false;
432  while (step > 0) {
433  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
434  if (step == 1 && *str_ptr == '\\') {
435  ++str_ptr;
436  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
437  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
438  } else {
439  if (word.length() < kSaneNumConcreteChars) {
440  tprintf("Please provide at least %d concrete characters at the"
441  " beginning of the pattern\n", kSaneNumConcreteChars);
442  failed = true;
443  break;
444  }
445  // Parse character class from expression.
446  curr_unichar_id = character_class_to_pattern(*str_ptr);
447  }
448  } else {
449  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
450  }
451  if (curr_unichar_id == INVALID_UNICHAR_ID) {
452  failed = true;
453  break; // failed to parse this pattern
454  }
455  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
456  repetitions_vec.push_back(false);
457  str_ptr += step;
458  step = unicharset.step(str_ptr);
459  // Check if there is a repetition pattern specified after this unichar.
460  if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
461  repetitions_vec[repetitions_vec.size()-1] = true;
462  str_ptr += 2;
463  step = unicharset.step(str_ptr);
464  }
465  }
466  if (failed) {
467  tprintf("Invalid user pattern %s\n", string);
468  continue;
469  }
470  // Insert the pattern into the trie.
471  if (debug_level_ > 2) {
472  tprintf("Inserting expanded user pattern %s\n",
473  word.debug_string().string());
474  }
475  if (!this->word_in_dawg(word)) {
476  this->add_word_to_dawg(word, &repetitions_vec);
477  if (!this->word_in_dawg(word)) {
478  tprintf("Error: failed to insert pattern '%s'\n", string);
479  }
480  }
481  ++pattern_count;
482  }
483  if (debug_level_) {
484  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
485  }
486  fclose(pattern_file);
487  return true;
488 }
489 
491  bool word_end, UNICHAR_ID unichar_id) {
492  EDGE_RECORD *edge_ptr = NULL;
493  EDGE_INDEX edge_index = 0;
494  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
495  unichar_id, &edge_ptr, &edge_index));
496  if (debug_level_ > 1) {
497  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
498  print_edge_rec(*edge_ptr);
499  tprintf("\n");
500  }
501  if (direction == FORWARD_EDGE) {
502  nodes_[node1]->forward_edges.remove(edge_index);
503  } else if (node1 == 0) {
504  KillEdge(&nodes_[node1]->backward_edges[edge_index]);
505  root_back_freelist_.push_back(edge_index);
506  } else {
507  nodes_[node1]->backward_edges.remove(edge_index);
508  }
509  --num_edges_;
510 }
511 
512 // Some optimizations employed in add_word_to_dawg and trie_to_dawg:
513 // 1 Avoid insertion sorting or bubble sorting the tail root node
514 // (back links on node 0, a list of all the leaves.). The node is
515 // huge, and sorting it with n^2 time is terrible.
516 // 2 Avoid using GenericVector::remove on the tail root node.
517 // (a) During add of words to the trie, zero-out the unichars and
518 // keep a freelist of spaces to re-use.
519 // (b) During reduction, just zero-out the unichars of deleted back
520 // links, skipping zero entries while searching.
521 // 3 Avoid linear search of the tail root node. This has to be done when
522 // a suffix is added to an existing word. Adding words by decreasing
523 // length avoids this problem entirely. Words can still be added in
524 // any order, but it is faster to add the longest first.
526  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
527  if (debug_level_ > 2) {
528  print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
529  }
530  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
531  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
532  this->reduce_node_input(0, reduced_nodes);
533  delete[] reduced_nodes;
534 
535  if (debug_level_ > 2) {
536  print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
537  }
538  // Build a translation map from node indices in nodes_ vector to
539  // their target indices in EDGE_ARRAY.
540  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
541  int i, j;
542  node_ref_map[0] = 0;
543  for (i = 0; i < nodes_.size(); ++i) {
544  node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
545  }
546  int num_forward_edges = node_ref_map[i];
547 
548  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
549  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
550  EDGE_ARRAY edge_array =
551  (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
552  EDGE_ARRAY edge_array_ptr = edge_array;
553  for (i = 0; i < nodes_.size(); ++i) {
554  TRIE_NODE_RECORD *node_ptr = nodes_[i];
555  int end = node_ptr->forward_edges.size();
556  for (j = 0; j < end; ++j) {
557  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
558  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
559  ASSERT_HOST(node_ref < nodes_.size());
560  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
561  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
562  end_of_word_from_edge_rec(edge_rec), unichar_id);
563  if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
564  ++edge_array_ptr;
565  }
566  }
567  delete[] node_ref_map;
568 
569  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
571 }
572 
574  const EDGE_RECORD &edge1,
575  const EDGE_RECORD &edge2) {
576  if (debug_level_ > 1) {
577  tprintf("\nCollapsing node %d:\n", node);
579  tprintf("Candidate edges: ");
580  print_edge_rec(edge1);
581  tprintf(", ");
582  print_edge_rec(edge2);
583  tprintf("\n\n");
584  }
585  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
586  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
587  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
588  // Translate all edges going to/from next_node2 to go to/from next_node1.
589  EDGE_RECORD *edge_ptr = NULL;
590  EDGE_INDEX edge_index;
591  int i;
592  // The backward link in node to next_node2 will be zeroed out by the caller.
593  // Copy all the backward links in next_node2 to node next_node1
594  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
595  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
596  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
597  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
598  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
599  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
600  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
601  curr_word_end, curr_unichar_id);
602  // Relocate the corresponding forward edge in curr_next_node
603  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
604  curr_word_end, curr_unichar_id,
605  &edge_ptr, &edge_index));
606  set_next_node_in_edge_rec(edge_ptr, next_node1);
607  }
608  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
609  next_node2_ptr->backward_edges.size());
610  if (debug_level_ > 1) {
611  tprintf("removed %d edges from node " REFFORMAT "\n",
612  next_node2_num_edges, next_node2);
613  }
614  next_node2_ptr->forward_edges.clear();
615  next_node2_ptr->backward_edges.clear();
616  num_edges_ -= next_node2_num_edges;
617  return true;
618 }
619 
621  UNICHAR_ID unichar_id,
622  NODE_REF node,
623  EDGE_VECTOR* backward_edges,
624  NODE_MARKER reduced_nodes) {
625  if (debug_level_ > 1)
626  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
627  // Compare each of the edge pairs with the given unichar_id.
628  bool did_something = false;
629  for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
630  // Find the first edge that can be eliminated.
631  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
632  while (i < backward_edges->size()) {
633  if (!DeadEdge((*backward_edges)[i])) {
634  curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
635  if (curr_unichar_id != unichar_id) return did_something;
636  if (can_be_eliminated((*backward_edges)[i])) break;
637  }
638  ++i;
639  }
640  if (i == backward_edges->size()) break;
641  const EDGE_RECORD &edge_rec = (*backward_edges)[i];
642  // Compare it to the rest of the edges with the given unichar_id.
643  for (int j = i + 1; j < backward_edges->size(); ++j) {
644  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
645  if (DeadEdge(next_edge_rec)) continue;
646  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
647  if (next_id != unichar_id) break;
648  if (end_of_word_from_edge_rec(next_edge_rec) ==
649  end_of_word_from_edge_rec(edge_rec) &&
650  can_be_eliminated(next_edge_rec) &&
651  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
652  reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
653  did_something = true;
654  KillEdge(&(*backward_edges)[j]);
655  }
656  }
657  }
658  return did_something;
659 }
660 
662  int num_edges = edges->size();
663  if (num_edges <= 1) return;
665  sort_vec.reserve(num_edges);
666  for (int i = 0; i < num_edges; ++i) {
668  unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
669  }
670  sort_vec.sort();
671  for (int i = 0; i < num_edges; ++i)
672  (*edges)[i] = sort_vec[i].data;
673 }
674 
676  NODE_MARKER reduced_nodes) {
677  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
678  sort_edges(&backward_edges);
679  if (debug_level_ > 1) {
680  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
682  }
683 
684  EDGE_INDEX edge_index = 0;
685  while (edge_index < backward_edges.size()) {
686  if (DeadEdge(backward_edges[edge_index])) continue;
687  UNICHAR_ID unichar_id =
688  unichar_id_from_edge_rec(backward_edges[edge_index]);
689  while (reduce_lettered_edges(edge_index, unichar_id, node,
690  &backward_edges, reduced_nodes));
691  while (++edge_index < backward_edges.size()) {
692  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
693  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
694  }
695  }
696  reduced_nodes[node] = true; // mark as reduced
697 
698  if (debug_level_ > 1) {
699  tprintf("Node " REFFORMAT " after reduction:\n", node);
701  }
702 
703  for (int i = 0; i < backward_edges.size(); ++i) {
704  if (DeadEdge(backward_edges[i])) continue;
705  NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
706  if (next_node != 0 && !reduced_nodes[next_node]) {
707  reduce_node_input(next_node, reduced_nodes);
708  }
709  }
710 }
711 
712 void Trie::print_node(NODE_REF node, int max_num_edges) const {
713  if (node == NO_EDGE) return; // nothing to print
714  TRIE_NODE_RECORD *node_ptr = nodes_[node];
715  int num_fwd = node_ptr->forward_edges.size();
716  int num_bkw = node_ptr->backward_edges.size();
717  EDGE_VECTOR *vec;
718  for (int dir = 0; dir < 2; ++dir) {
719  if (dir == 0) {
720  vec = &(node_ptr->forward_edges);
721  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
722  } else {
723  vec = &(node_ptr->backward_edges);
724  tprintf("\t");
725  }
726  int i;
727  for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
728  i < max_num_edges; ++i) {
729  if (DeadEdge((*vec)[i])) continue;
730  print_edge_rec((*vec)[i]);
731  tprintf(" ");
732  }
733  if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
734  tprintf("\n");
735  }
736 }
737 
738 } // namespace tesseract
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:351
int step(const char *str) const
Definition: unicharset.cpp:211
const char kReverseIfHasRTL[]
Definition: trie.cpp:45
TRIE_NODES nodes_
Definition: trie.h:420
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:490
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:525
bool read_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy, GenericVector< STRING > *words)
Definition: trie.cpp:300
int UNICHAR_ID
Definition: unichar.h:33
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:409
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:266
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:462
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:290
UNICHAR_ID alphanum_pattern_
Definition: trie.h:431
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:53
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
#define WERD_END_FLAG
Definition: dawg.h:89
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:161
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:661
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:307
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
UNICHAR_ID punc_pattern_
Definition: trie.h:432
Definition: strngs.h:44
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:425
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:125
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:573
STRING lang_
Definition: dawg.h:297
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: trie.cpp:368
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:153
uinT64 EDGE_RECORD
Definition: dawg.h:50
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int inT32
Definition: host.h:102
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:408
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:48
void reserve(int size)
EDGE_VECTOR forward_edges
Definition: trie.h:49
RTLReversePolicy
Definition: trie.h:64
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:61
static const char kLowerPatternUnicode[]
Definition: trie.h:79
bool empty() const
Definition: genericvector.h:84
#define MARKER_FLAG
Definition: dawg.h:87
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
int length() const
Definition: ratngs.h:301
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:712
#define CHARS_PER_LINE
Definition: cutil.h:57
const STRING debug_string() const
Definition: ratngs.h:503
#define BACKWARD_EDGE
Definition: dawg.h:85
#define tprintf(...)
Definition: tprintf.h:31
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:469
static const char kDigitPatternUnicode[]
Definition: trie.h:76
static const int kSaneNumConcreteChars
Definition: trie.h:71
int size() const
Definition: genericvector.h:72
int flag_start_bit_
Definition: dawg.h:305
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:235
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:382
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:318
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:245
const STRING & unichar_string() const
Definition: ratngs.h:525
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:448
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:157
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:620
EDGE_VECTOR backward_edges
Definition: trie.h:50
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:299
NODE_REF new_dawg_node()
Definition: trie.cpp:277
inT32 length() const
Definition: strngs.cpp:196
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:211
const char kForceReverse[]
Definition: trie.cpp:46
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:455
static const char kPuncPatternUnicode[]
Definition: trie.h:78
void delete_data_pointers()
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:476
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:304
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
UNICHAR_ID alpha_pattern_
Definition: trie.h:429
static const char kAlphanumPatternUnicode[]
Definition: trie.h:77
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
uinT64 num_edges_
Definition: trie.h:421
void remove(int index)
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:343
static const char kUpperPatternUnicode[]
Definition: trie.h:80
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset)
Definition: trie.cpp:335
static const char kAlphaPatternUnicode[]
Definition: trie.h:75
#define FORWARD_EDGE
Definition: dawg.h:84
void chomp_string(char *str)
Definition: helpers.h:75
DawgType type_
Definition: dawg.h:296
int push_back(T object)
#define REFFORMAT
Definition: dawg.h:92
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:335
void insert(T t, int index)
int debug_level_
Definition: dawg.h:311
bool initialized_patterns_
Definition: trie.h:428
const char kDoNotReverse[]
Definition: trie.cpp:44
bool * NODE_MARKER
Definition: trie.h:45
inT64 NODE_REF
Definition: dawg.h:55
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:357
int size() const
Definition: unicharset.h:296
#define ASSERT_HOST(x)
Definition: errcode.h:84
UNICHAR_ID lower_pattern_
Definition: trie.h:433
const char * string() const
Definition: strngs.cpp:201
int length() const
Definition: genericvector.h:79
int * memalloc(int size)
Definition: freelist.cpp:22
UNICHAR_ID upper_pattern_
Definition: trie.h:434
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
void clear()
Definition: trie.cpp:66
void print() const
Definition: ratngs.h:564
UNICHAR_ID digit_pattern_
Definition: trie.h:430
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:390
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:327
int unicharset_size_
Definition: dawg.h:304
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:675
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:229
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
inT64 EDGE_INDEX
Definition: trie.h:32