tesseract  4.1.1
reject.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: reject.cpp (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #ifdef DISABLED_LEGACY_ENGINE
25 
26 #include "tesseractclass.h"
27 
28 namespace tesseract {
29 
30 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
31  const WERD_CHOICE &word = *werd_res->best_choice;
32  int dict_word_type = werd_res->tesseract->dict_word(word);
33  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
34 }
35 } // namespace tesseract
36 
37 #else
38 
39 #include "tessvars.h"
40 #include <cctype>
41 #include <cerrno>
42 #include <cstring>
43 #include "genericvector.h"
44 #include "reject.h"
45 #include "control.h"
46 #include "docqual.h"
47 #include "helpers.h"
48 
49 #include "tesseractclass.h"
50 
52 
53 /*************************************************************************
54  * set_done()
55  *
56  * Set the done flag based on the word acceptability criteria
57  *************************************************************************/
58 
59 namespace tesseract {
60 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
61  word->done = word->tess_accepted &&
62  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr);
63  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
64  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
65  word->best_choice->permuter() == FREQ_DAWG_PERM ||
67  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
68  one_ell_conflict(word, false)) {
69  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
70  word->done = false;
71  }
72  if (word->done && ((!word_from_dict &&
73  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
74  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
75  word->done = false;
76  }
78  tprintf("set_done(): done=%d\n", word->done);
79  word->best_choice->print("");
80  }
81 }
82 
83 
84 /*************************************************************************
85  * make_reject_map()
86  *
87  * Sets the done flag to indicate whether the result is acceptable.
88  *
89  * Sets a reject map for the word.
90  *************************************************************************/
91 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
92  int i;
93  int offset;
94 
95  flip_0O(word);
96  check_debug_pt(word, -1); // For trap only
97  set_done(word, pass); // Set acceptance
99  reject_blanks(word);
100  /*
101  0: Rays original heuristic - the baseline
102  */
103  if (tessedit_reject_mode == 0) {
104  if (!word->done)
105  reject_poor_matches(word);
106  } else if (tessedit_reject_mode == 5) {
107  /*
108  5: Reject I/1/l from words where there is no strong contextual confirmation;
109  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
110  and the whole of any words which are very small
111  */
112  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
114  } else {
115  one_ell_conflict(word, true);
116  /*
117  Originally the code here just used the done flag. Now I have duplicated
118  and unpacked the conditions for setting the done flag so that each
119  mechanism can be turned on or off independently. This works WITHOUT
120  affecting the done flag setting.
121  */
122  if (rej_use_tess_accepted && !word->tess_accepted)
124 
125  if (rej_use_tess_blanks &&
126  (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
128 
129  WERD_CHOICE* best_choice = word->best_choice;
130  if (rej_use_good_perm) {
131  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
132  best_choice->permuter() == FREQ_DAWG_PERM ||
133  best_choice->permuter() == USER_DAWG_PERM) &&
136  best_choice->unichar_string().string(),
137  best_choice->unichar_lengths().string()) !=
138  AC_UNACCEPTABLE)) {
139  // PASSED TEST
140  } else if (best_choice->permuter() == NUMBER_PERM) {
142  for (i = 0, offset = 0;
143  best_choice->unichar_string()[offset] != '\0';
144  offset += best_choice->unichar_lengths()[i++]) {
145  if (word->reject_map[i].accepted() &&
146  word->uch_set->get_isalpha(
147  best_choice->unichar_string().string() + offset,
148  best_choice->unichar_lengths()[i]))
149  word->reject_map[i].setrej_bad_permuter();
150  // rej alpha
151  }
152  }
153  } else {
155  }
156  }
157  /* Ambig word rejection was here once !!*/
158  }
159  } else {
160  tprintf("BAD tessedit_reject_mode\n");
161  ASSERT_HOST("Fatal error encountered!" == nullptr);
162  }
163 
164  if (tessedit_image_border > -1)
165  reject_edge_blobs(word);
166 
167  check_debug_pt (word, 10);
169  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
170  tprintf("Certainty: %f Rating: %f\n",
171  word->best_choice->certainty (), word->best_choice->rating ());
172  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
173  }
174 
175  flip_hyphens(word);
176  check_debug_pt(word, 20);
177 }
178 } // namespace tesseract
179 
180 
181 void reject_blanks(WERD_RES *word) {
182  int16_t i;
183  int16_t offset;
184 
185  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
186  offset += word->best_choice->unichar_lengths()[i], i += 1) {
187  if (word->best_choice->unichar_string()[offset] == ' ')
188  //rej unrecognised blobs
189  word->reject_map[i].setrej_tess_failure ();
190  }
191 }
192 
193 namespace tesseract {
195  int16_t i;
196  int16_t offset;
197 
198  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
199  offset += word->best_choice->unichar_lengths()[i], i += 1) {
201  contains (word->best_choice->unichar_string()[offset])) {
202  //rej 1Il conflict
203  word->reject_map[i].setrej_1Il_conflict ();
204  }
205  }
206 }
207 } // namespace tesseract
208 
209 
211  float threshold = compute_reject_threshold(word->best_choice);
212  for (int i = 0; i < word->best_choice->length(); ++i) {
213  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
214  word->reject_map[i].setrej_tess_failure();
215  else if (word->best_choice->certainty(i) < threshold)
216  word->reject_map[i].setrej_poor_match();
217  }
218 }
219 
220 
221 /**********************************************************************
222  * compute_reject_threshold
223  *
224  * Set a rejection threshold for this word.
225  * Initially this is a trivial function which looks for the largest
226  * gap in the certainty value.
227  **********************************************************************/
228 
230  float threshold; // rejection threshold
231  float bestgap = 0.0f; // biggest gap
232  float gapstart; // bottom of gap
233 
234  int blob_count = word->length();
235  GenericVector<float> ratings;
236  ratings.resize_no_init(blob_count);
237  for (int i = 0; i < blob_count; ++i) {
238  ratings[i] = word->certainty(i);
239  }
240  ratings.sort();
241  gapstart = ratings[0] - 1; // all reject if none better
242  if (blob_count >= 3) {
243  for (int index = 0; index < blob_count - 1; index++) {
244  if (ratings[index + 1] - ratings[index] > bestgap) {
245  bestgap = ratings[index + 1] - ratings[index];
246  // find biggest
247  gapstart = ratings[index];
248  }
249  }
250  }
251  threshold = gapstart + bestgap / 2;
252 
253  return threshold;
254 }
255 
256 
257 /*************************************************************************
258  * reject_edge_blobs()
259  *
260  * If the word is perilously close to the edge of the image, reject those blobs
261  * in the word which are too close to the edge as they could be clipped.
262  *************************************************************************/
263 namespace tesseract {
265  TBOX word_box = word->word->bounding_box();
266  // Use the box_word as it is already denormed back to image coordinates.
267  int blobcount = word->box_word->length();
268 
269  if (word_box.left() < tessedit_image_border ||
270  word_box.bottom() < tessedit_image_border ||
271  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
272  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
273  ASSERT_HOST(word->reject_map.length() == blobcount);
274  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
275  TBOX blob_box = word->box_word->BlobBox(blobindex);
276  if (blob_box.left() < tessedit_image_border ||
277  blob_box.bottom() < tessedit_image_border ||
278  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
279  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
280  word->reject_map[blobindex].setrej_edge_char();
281  // Close to edge
282  }
283  }
284  }
285 }
286 
287 /**********************************************************************
288  * one_ell_conflict()
289  *
290  * Identify words where there is a potential I/l/1 error.
291  * - A bundle of contextual heuristics!
292  **********************************************************************/
293 bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
294  const char *word;
295  const char *lengths;
296  int16_t word_len; //its length
297  int16_t first_alphanum_index_;
298  int16_t first_alphanum_offset_;
299  int16_t i;
300  int16_t offset;
301  bool non_conflict_set_char; //non conf set a/n?
302  bool conflict = false;
303  bool allow_1s;
304  ACCEPTABLE_WERD_TYPE word_type;
305  bool dict_perm_type;
306  bool dict_word_ok;
307  int dict_word_type;
308 
309  word = word_res->best_choice->unichar_string().string ();
310  lengths = word_res->best_choice->unichar_lengths().string();
311  word_len = strlen(lengths);
312  /*
313  If there are no occurrences of the conflict set characters then the word
314  is OK.
315  */
316  if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
317  return false;
318 
319  /*
320  There is a conflict if there are NO other (confirmed) alphanumerics apart
321  from those in the conflict set.
322  */
323 
324  for (i = 0, offset = 0, non_conflict_set_char = false;
325  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326  non_conflict_set_char =
327  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
328  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
329  !STRING (conflict_set_I_l_1).contains (word[offset]);
330  if (!non_conflict_set_char) {
331  if (update_map)
332  reject_I_1_L(word_res);
333  return true;
334  }
335 
336  /*
337  If the word is accepted by a dawg permuter, and the first alpha character
338  is "I" or "l", check to see if the alternative is also a dawg word. If it
339  is, then there is a potential error otherwise the word is ok.
340  */
341 
342  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
343  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
345  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
346  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
347  dict_word_type = dict_word(*(word_res->best_choice));
348  dict_word_ok = (dict_word_type > 0) &&
349  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
350 
351  if ((rej_1Il_use_dict_word && dict_word_ok) ||
352  (rej_1Il_trust_permuter_type && dict_perm_type) ||
353  (dict_perm_type && dict_word_ok)) {
354  first_alphanum_index_ = first_alphanum_index (word, lengths);
355  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
356  if (lengths[first_alphanum_index_] == 1 &&
357  word[first_alphanum_offset_] == 'I') {
358  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
359  if (safe_dict_word(word_res) > 0) {
360  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
361  if (update_map)
362  word_res->reject_map[first_alphanum_index_].
363  setrej_1Il_conflict();
364  return true;
365  }
366  else {
367  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
368  return false;
369  }
370  }
371 
372  if (lengths[first_alphanum_index_] == 1 &&
373  word[first_alphanum_offset_] == 'l') {
374  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
375  if (safe_dict_word(word_res) > 0) {
376  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
377  if (update_map)
378  word_res->reject_map[first_alphanum_index_].
379  setrej_1Il_conflict();
380  return true;
381  }
382  else {
383  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
384  return false;
385  }
386  }
387  return false;
388  }
389 
390  /*
391  NEW 1Il code. The old code relied on permuter types too much. In fact,
392  tess will use TOP_CHOICE permute for good things like "palette".
393  In this code the string is examined independently to see if it looks like
394  a well formed word.
395  */
396 
397  /*
398  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
399  dictionary word.
400  */
401  first_alphanum_index_ = first_alphanum_index (word, lengths);
402  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
403  if (lengths[first_alphanum_index_] == 1 &&
404  word[first_alphanum_offset_] == 'l') {
405  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
406  if (safe_dict_word(word_res) > 0)
407  return false;
408  else
409  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
410  }
411  else if (lengths[first_alphanum_index_] == 1 &&
412  word[first_alphanum_offset_] == 'I') {
413  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414  if (safe_dict_word(word_res) > 0)
415  return false;
416  else
417  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
418  }
419  /*
420  For strings containing digits:
421  If there are no alphas OR the numeric permuter liked the word,
422  reject any non 1 conflict chs
423  Else reject all conflict chs
424  */
425  if (word_contains_non_1_digit (word, lengths)) {
426  allow_1s = (alpha_count (word, lengths) == 0) ||
427  (word_res->best_choice->permuter () == NUMBER_PERM);
428 
429  int16_t offset;
430  conflict = false;
431  for (i = 0, offset = 0; word[offset] != '\0';
432  offset += word_res->best_choice->unichar_lengths()[i++]) {
433  if ((!allow_1s || (word[offset] != '1')) &&
434  STRING (conflict_set_I_l_1).contains (word[offset])) {
435  if (update_map)
436  word_res->reject_map[i].setrej_1Il_conflict ();
437  conflict = true;
438  }
439  }
440  return conflict;
441  }
442  /*
443  For anything else. See if it conforms to an acceptable word type. If so,
444  treat accordingly.
445  */
446  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
447  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
448  first_alphanum_index_ = first_alphanum_index (word, lengths);
449  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
450  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
451  if (update_map)
452  word_res->reject_map[first_alphanum_index_].
453  setrej_1Il_conflict ();
454  return true;
455  }
456  else
457  return false;
458  }
459  else if (word_type == AC_UPPER_CASE) {
460  return false;
461  }
462  else {
463  if (update_map)
464  reject_I_1_L(word_res);
465  return true;
466  }
467 }
468 
469 
470 int16_t Tesseract::first_alphanum_index(const char *word,
471  const char *word_lengths) {
472  int16_t i;
473  int16_t offset;
474 
475  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
476  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
477  unicharset.get_isdigit(word + offset, word_lengths[i]))
478  return i;
479  }
480  return -1;
481 }
482 
483 int16_t Tesseract::first_alphanum_offset(const char *word,
484  const char *word_lengths) {
485  int16_t i;
486  int16_t offset;
487 
488  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
490  unicharset.get_isdigit(word + offset, word_lengths[i]))
491  return offset;
492  }
493  return -1;
494 }
495 
496 int16_t Tesseract::alpha_count(const char *word,
497  const char *word_lengths) {
498  int16_t i;
499  int16_t offset;
500  int16_t count = 0;
501 
502  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
503  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
504  count++;
505  }
506  return count;
507 }
508 
509 
511  const char* word_lengths) {
512  int16_t i;
513  int16_t offset;
514 
515  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
516  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
517  (word_lengths[i] != 1 || word[offset] != '1'))
518  return true;
519  }
520  return false;
521 }
522 
523 /*************************************************************************
524  * dont_allow_1Il()
525  * Don't unreject LONE accepted 1Il conflict set chars
526  *************************************************************************/
528  int i = 0;
529  int offset;
530  int word_len = word->reject_map.length();
531  const char *s = word->best_choice->unichar_string().string();
532  const char *lengths = word->best_choice->unichar_lengths().string();
533  bool accepted_1Il = false;
534 
535  for (i = 0, offset = 0; i < word_len;
536  offset += word->best_choice->unichar_lengths()[i++]) {
537  if (word->reject_map[i].accepted()) {
538  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
539  accepted_1Il = true;
540  } else {
541  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
542  word->uch_set->get_isdigit(s + offset, lengths[i]))
543  return; // >=1 non 1Il ch accepted
544  }
545  }
546  }
547  if (!accepted_1Il)
548  return; //Nothing to worry about
549 
550  for (i = 0, offset = 0; i < word_len;
551  offset += word->best_choice->unichar_lengths()[i++]) {
552  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
553  word->reject_map[i].accepted())
554  word->reject_map[i].setrej_postNN_1Il();
555  }
556 }
557 
558 
560  int count = 0;
561  const WERD_CHOICE *best_choice = word_res->best_choice;
562  for (int i = 0; i < word_res->reject_map.length(); ++i) {
563  if ((word_res->reject_map[i].accepted()) &&
564  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
565  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
566  count++;
567  }
568  }
569  return count;
570 }
571 
572 
573 // reject all if most rejected.
575  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
576 
577  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
580 }
581 
582 
584  int16_t char_quality;
585  int16_t accepted_char_quality;
586 
587  if (word->best_choice->unichar_lengths().length() <= 1)
588  return false;
589 
591  contains(word->best_choice->unichar_string()[0]))
592  return false;
593 
594  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
595  for (int i = 1; i < word->best_choice->length(); ++i) {
596  if (word->best_choice->unichar_id(i) != uch_id) return false;
597  }
598 
599  word_char_quality(word, row, &char_quality, &accepted_char_quality);
600 
601  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
602  (char_quality == accepted_char_quality))
603  return true;
604  else
605  return false;
606 }
607 
608 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
609  const WERD_CHOICE &word = *werd_res->best_choice;
610  int dict_word_type = werd_res->tesseract->dict_word(word);
611  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
612 }
613 
614 // Note: After running this function word_res->ratings
615 // might not contain the right BLOB_CHOICE corresponding to each character
616 // in word_res->best_choice.
618  WERD_CHOICE *best_choice = word_res->best_choice;
619  int i;
620  int prev_right = -9999;
621  int next_left;
622  TBOX out_box;
623  float aspect_ratio;
624 
626  return;
627 
628  int num_blobs = word_res->rebuild_word->NumBlobs();
629  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
630  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
631  TBLOB* blob = word_res->rebuild_word->blobs[i];
632  out_box = blob->bounding_box();
633  if (i + 1 == num_blobs)
634  next_left = 9999;
635  else
636  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
637  // Don't touch small or touching blobs - it is too dangerous.
638  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640  aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
641  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
642  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
643  word_res->uch_set->contains_unichar_id(unichar_dash) &&
644  word_res->uch_set->get_enabled(unichar_dash)) {
645  /* Certain HYPHEN */
646  best_choice->set_unichar_id(unichar_dash, i);
647  if (word_res->reject_map[i].rejected())
648  word_res->reject_map[i].setrej_hyphen_accept();
649  }
650  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
651  word_res->reject_map[i].accepted())
652  //Suspected HYPHEN
653  word_res->reject_map[i].setrej_hyphen ();
654  }
655  else if (best_choice->unichar_id(i) == unichar_dash) {
656  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
657  (word_res->reject_map[i].rejected()))
658  word_res->reject_map[i].setrej_hyphen_accept();
659  //Certain HYPHEN
660 
661  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
662  (word_res->reject_map[i].accepted()))
663  //Suspected HYPHEN
664  word_res->reject_map[i].setrej_hyphen();
665  }
666  }
667  prev_right = out_box.right();
668  }
669 }
670 
671 // Note: After running this function word_res->ratings
672 // might not contain the right BLOB_CHOICE corresponding to each character
673 // in word_res->best_choice.
674 void Tesseract::flip_0O(WERD_RES *word_res) {
675  WERD_CHOICE *best_choice = word_res->best_choice;
676  int i;
677  TBOX out_box;
678 
679  if (!tessedit_flip_0O)
680  return;
681 
682  int num_blobs = word_res->rebuild_word->NumBlobs();
683  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
684  TBLOB* blob = word_res->rebuild_word->blobs[i];
685  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
687  out_box = blob->bounding_box();
688  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
689  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
690  return; //Beware words with sub/superscripts
691  }
692  }
693  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
694  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
695  if (unichar_0 == INVALID_UNICHAR_ID ||
696  !word_res->uch_set->get_enabled(unichar_0) ||
697  unichar_O == INVALID_UNICHAR_ID ||
698  !word_res->uch_set->get_enabled(unichar_O)) {
699  return; // 0 or O are not present/enabled in unicharset
700  }
701  for (i = 1; i < best_choice->length(); ++i) {
702  if (best_choice->unichar_id(i) == unichar_0 ||
703  best_choice->unichar_id(i) == unichar_O) {
704  /* A0A */
705  if ((i+1) < best_choice->length() &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708  best_choice->set_unichar_id(unichar_O, i);
709  }
710  /* A00A */
711  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712  (i+1) < best_choice->length() &&
713  (best_choice->unichar_id(i+1) == unichar_0 ||
714  best_choice->unichar_id(i+1) == unichar_O) &&
715  (i+2) < best_choice->length() &&
716  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717  best_choice->set_unichar_id(unichar_O, i);
718  i++;
719  }
720  /* AA0<non digit or end of word> */
721  if ((i > 1) &&
722  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724  (((i+1) < best_choice->length() &&
725  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
727  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
728  (i == best_choice->length() - 1))) {
729  best_choice->set_unichar_id(unichar_O, i);
730  }
731  /* 9O9 */
732  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733  (i+1) < best_choice->length() &&
734  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735  best_choice->set_unichar_id(unichar_0, i);
736  }
737  /* 9OOO */
738  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739  (i+2) < best_choice->length() &&
740  (best_choice->unichar_id(i+1) == unichar_0 ||
741  best_choice->unichar_id(i+1) == unichar_O) &&
742  (best_choice->unichar_id(i+2) == unichar_0 ||
743  best_choice->unichar_id(i+2) == unichar_O)) {
744  best_choice->set_unichar_id(unichar_0, i);
745  best_choice->set_unichar_id(unichar_0, i+1);
746  best_choice->set_unichar_id(unichar_0, i+2);
747  i += 2;
748  }
749  /* 9OO<non upper> */
750  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751  (i+2) < best_choice->length() &&
752  (best_choice->unichar_id(i+1) == unichar_0 ||
753  best_choice->unichar_id(i+1) == unichar_O) &&
754  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755  best_choice->set_unichar_id(unichar_0, i);
756  best_choice->set_unichar_id(unichar_0, i+1);
757  i++;
758  }
759  /* 9O<non upper> */
760  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761  (i+1) < best_choice->length() &&
762  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763  best_choice->set_unichar_id(unichar_0, i);
764  }
765  /* 9[.,]OOO.. */
766  if ((i > 1) &&
767  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
768  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
769  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770  best_choice->unichar_id(i-2) == unichar_O)) {
771  if (best_choice->unichar_id(i-2) == unichar_O) {
772  best_choice->set_unichar_id(unichar_0, i-2);
773  }
774  while (i < best_choice->length() &&
775  (best_choice->unichar_id(i) == unichar_O ||
776  best_choice->unichar_id(i) == unichar_0)) {
777  best_choice->set_unichar_id(unichar_0, i);
778  i++;
779  }
780  i--;
781  }
782  }
783  }
784 }
785 
786 bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
787  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
788 }
789 
790 bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
791  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
792 }
793 } // namespace tesseract
794 
795 #endif // def DISABLED_LEGACY_ENGINE
int UNICHAR_ID
Definition: unichar.h:34
int16_t width() const
Definition: rect.h:115
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int NumBlobs() const
Definition: blobs.h:448
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:181
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
void resize_no_init(int size)
Definition: genericvector.h:66
double rej_whole_of_mostly_reject_word_fract
int length() const
Definition: ratngs.h:293
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
float rating() const
Definition: ratngs.h:317
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:617
REJMAP reject_map
Definition: pageres.h:294
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
const int kBlnBaselineOffset
Definition: normalis.h:25
float y_scale() const
Definition: normalis.h:270
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
void rej_word_mostly_rej()
Definition: rejctmap.cpp:406
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:229
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:194
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
Definition: blobs.h:284
TWERD * rebuild_word
Definition: pageres.h:266
void flip_0O(WERD_RES *word)
Definition: reject.cpp:674
void print() const
Definition: ratngs.h:570
bool contains(char c) const
Definition: strngs.cpp:185
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:574
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:293
const int kBlnXHeight
Definition: normalis.h:24
void rej_word_contains_blanks()
Definition: rejctmap.cpp:370
const STRING & unichar_lengths() const
Definition: ratngs.h:538
const char * string() const
Definition: strngs.cpp:194
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:376
uint8_t permuter() const
Definition: ratngs.h:336
tesseract::Tesseract * tesseract
Definition: pageres.h:280
int16_t left() const
Definition: rect.h:72
bool tess_accepted
Definition: pageres.h:303
Definition: ocrrow.h:36
TBOX bounding_box() const
Definition: werd.cpp:148
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
int16_t height() const
Definition: rect.h:108
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:496
int32_t length() const
Definition: strngs.cpp:189
float x_scale() const
Definition: normalis.h:267
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:510
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:527
ALL upper case.
Definition: control.h:32
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790
void initialise(int16_t length)
Definition: rejctmap.cpp:273
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
DENORM denorm
Definition: pageres.h:201
int16_t bottom() const
Definition: rect.h:65
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:361
int count(LIST var_list)
Definition: oldlist.cpp:95
int32_t length() const
Definition: rejctmap.h:223
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const STRING & unichar_string() const
Definition: ratngs.h:531
ALL lower case.
Definition: control.h:31
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:786
Definition: strngs.h:45
Definition: rect.h:34
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:583
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
WERD * word
Definition: pageres.h:186
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:51
float certainty() const
Definition: ratngs.h:320
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:470
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:483
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:210
int length() const
Definition: boxword.h:83
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
void set_done(WERD_RES *word, int16_t pass)
UNICHARSET unicharset
Definition: ccutil.h:73
Unacceptable word.
Definition: control.h:30
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
TBOX bounding_box() const
Definition: blobs.cpp:468
const UNICHARSET * uch_set
Definition: pageres.h:203
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool done
Definition: pageres.h:305
int16_t reject_count()
Definition: rejctmap.h:229
tesseract::BoxWord * box_word
Definition: pageres.h:272
void rej_word_small_xht()
Definition: rejctmap.cpp:343
int16_t right() const
Definition: rect.h:79
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t top() const
Definition: rect.h:58
bool dangerous_ambig_found() const
Definition: ratngs.h:353
ALL but initial lc.
Definition: control.h:33
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:264
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
void rej_word_bad_permuter()
Definition: rejctmap.cpp:379
char * ok_repeated_ch_non_alphanum_wds