summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/src/dict/dict.cpp')
-rw-r--r--tesseract/src/dict/dict.cpp888
1 files changed, 888 insertions, 0 deletions
diff --git a/tesseract/src/dict/dict.cpp b/tesseract/src/dict/dict.cpp
new file mode 100644
index 00000000..eb69b569
--- /dev/null
+++ b/tesseract/src/dict/dict.cpp
@@ -0,0 +1,888 @@
+///////////////////////////////////////////////////////////////////////
+// File: dict.cpp
+// Description: dict class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "dict.h"
+
+#include "tprintf.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
+class Image;
+
+Dict::Dict(CCUtil* ccutil)  // Constructs an empty Dict bound to ccutil; no dawgs are loaded here.
+ : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),  // default per-letter dawg check
+ probability_in_context_(&tesseract::Dict::def_probability_in_context),  // default context callback
+ ccutil_(ccutil),  // stored as given; never deleted in the code visible in this file
+ wildcard_unichar_id_(INVALID_UNICHAR_ID),
+ apostrophe_unichar_id_(INVALID_UNICHAR_ID),  // this and the next three ids are resolved in SetupForLoad()
+ question_unichar_id_(INVALID_UNICHAR_ID),
+ slash_unichar_id_(INVALID_UNICHAR_ID),
+ hyphen_unichar_id_(INVALID_UNICHAR_ID),
+ STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",  // parameters follow: (name, default, description, params)
+ getCCUtil()->params()),
+ STRING_INIT_MEMBER(user_words_suffix, "",
+ "A suffix of user-provided words located in tessdata.",
+ getCCUtil()->params()),
+ STRING_MEMBER(user_patterns_file, "",
+ "A filename of user-provided patterns.",
+ getCCUtil()->params()),
+ STRING_INIT_MEMBER(user_patterns_suffix, "",
+ "A suffix of user-provided patterns located in "
+ "tessdata.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_punc_dawg, true,
+ "Load dawg with punctuation"
+ " patterns.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_number_dawg, true,
+ "Load dawg with number"
+ " patterns.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_bigram_dawg, true,
+ "Load dawg with special word "
+ "bigrams.",
+ getCCUtil()->params()),
+ double_MEMBER(xheight_penalty_subscripts, 0.125,
+ "Score penalty (0.1 = 10%) added if there are subscripts "
+ "or superscripts in a word, but it is otherwise OK.",
+ getCCUtil()->params()),
+ double_MEMBER(xheight_penalty_inconsistent, 0.25,
+ "Score penalty (0.1 = 10%) added if an xheight is "
+ "inconsistent.",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
+ "Score multiplier for word matches which have good case and"
+ " are frequent in the given language (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_case_ok, 1.1,
+ "Score multiplier for word matches that have good case "
+ "(lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
+ "Default score multiplier for word matches, which may have "
+ "case issues (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_nonword, 1.25,
+ "Score multiplier for glyph fragment segmentations which "
+ "do not match a dictionary word (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_garbage, 1.50,
+ "Score multiplier for poorly cased strings that are not in"
+ " the dictionary and generally look like garbage (lower is"
+ " better).",
+ getCCUtil()->params()),
+ STRING_MEMBER(output_ambig_words_file, "",
+ "Output file for ambiguities found in the dictionary",
+ getCCUtil()->params()),
+ INT_MEMBER(dawg_debug_level, 0,
+ "Set to 1 for general debug info"
+ ", to 2 for more details, to 3 to see all the debug messages",
+ getCCUtil()->params()),
+ INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
+ getCCUtil()->params()),
+ BOOL_MEMBER(use_only_first_uft8_step, false,
+ "Use only the first UTF8 step of the given string"
+ " when computing log probabilities.",
+ getCCUtil()->params()),
+ double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_nondict_certainty_base, -2.50,
+ "Certainty threshold for non-dict words",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
+ "Reject certainty offset", getCCUtil()->params()),
+ INT_MEMBER(stopper_smallword_size, 2,
+ "Size of dict word to be treated as non-dict word",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_certainty_per_char, -0.50,
+ "Certainty to add"
+ " for each dict char above small word size.",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_allowable_character_badness, 3.0,
+ "Max certaintly variation allowed in a word (in sigma)",
+ getCCUtil()->params()),
+ INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
+ getCCUtil()->params()),
+ BOOL_MEMBER(stopper_no_acceptable_choices, false,
+ "Make AcceptableChoice() always return false. Useful"
+ " when there is a need to explore all segmentations",
+ getCCUtil()->params()),
+ INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
+ "Max words to keep in list", getCCUtil()->params()),
+ STRING_MEMBER(word_to_debug, "",
+ "Word for which stopper debug"
+ " information should be printed to stdout",
+ getCCUtil()->params()),
+ BOOL_MEMBER(segment_nonalphabetic_script, false,
+ "Don't use any alphabetic-specific tricks."
+ " Set to true in the traineddata config file for"
+ " scripts that are cursive or inherently fixed-pitch",
+ getCCUtil()->params()),
+ BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
+ getCCUtil()->params()),
+ double_MEMBER(doc_dict_pending_threshold, 0.0,
+ "Worst certainty for using pending dictionary",
+ getCCUtil()->params()),
+ double_MEMBER(doc_dict_certainty_threshold, -2.25,
+ "Worst certainty for words that can be inserted into the"
+ " document dictionary",
+ getCCUtil()->params()),
+ INT_MEMBER(max_permuter_attempts, 10000,
+ "Maximum number of different"
+ " character choices to consider during permutation."
+ " This limit is especially useful when user patterns"
+ " are specified, since overly generic patterns can result in"
+ " dawg search exploring an overly large number of options.",
+ getCCUtil()->params()) {  // body: all dawg-related state starts out null/empty
+ reject_offset_ = 0.0;
+ go_deeper_fxn_ = nullptr;
+ hyphen_word_ = nullptr;  // deleted by the destructor
+ last_word_on_line_ = false;
+ document_words_ = nullptr;  // created in Load()
+ dawg_cache_ = nullptr;  // chosen in SetupForLoad()
+ dawg_cache_is_ours_ = false;  // becomes true when SetupForLoad() allocates its own cache
+ pending_words_ = nullptr;  // created in Load()
+ bigram_dawg_ = nullptr;
+ freq_dawg_ = nullptr;
+ punc_dawg_ = nullptr;
+ unambig_dawg_ = nullptr;
+ wordseg_rating_adjust_factor_ = -1.0f;
+ output_ambig_words_file_ = nullptr;  // closed by the destructor if it is ever non-null
+}
+
+// Tears the dictionary down: End() releases the loaded dawgs, then the
+// hyphen word is deleted and the ambiguity output file is closed if it was
+// ever opened.
+Dict::~Dict() {
+  End();
+  delete hyphen_word_;
+  if (output_ambig_words_file_ == nullptr) {
+    return;
+  }
+  fclose(output_ambig_words_file_);
+}
+
+// Returns the shared, process-wide DawgCache. The cache is a function-local
+// static, so one instance is created on first use and handed to every
+// caller; it is meant to outlive every Tesseract instance (even those that
+// someone else might declare as global statics).
+DawgCache* Dict::GlobalDawgCache() {
+  static DawgCache shared_cache;
+  DawgCache* const shared = &shared_cache;
+  return shared;
+}
+
+// Sets up ready for a Load or LoadLSTM.
+//
+// Any dawgs from a previous load are released first, the unichar ids of the
+// apostrophe, question mark, slash and hyphen symbols are looked up in the
+// current unicharset, and the dawg cache to load from is chosen.
+//
+// dawg_cache: cache to fetch dawgs from. It is used as given and is not
+//             deleted by this class. If nullptr, a private DawgCache is
+//             allocated and owned by this Dict.
+void Dict::SetupForLoad(DawgCache* dawg_cache) {
+  if (dawgs_.size() != 0) this->End();
+
+  // A private cache from an earlier SetupForLoad() can still be alive here
+  // when no dawgs were loaded in between (End() is skipped above in that
+  // case). Release it before dawg_cache_ is overwritten, otherwise it leaks.
+  // If End() already deleted it, dawg_cache_ is nullptr and this is a no-op.
+  if (dawg_cache_is_ours_) {
+    delete dawg_cache_;
+    dawg_cache_ = nullptr;
+    dawg_cache_is_ours_ = false;
+  }
+
+  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
+  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
+  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
+  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
+
+  if (dawg_cache != nullptr) {
+    dawg_cache_ = dawg_cache;
+    dawg_cache_is_ours_ = false;
+  } else {
+    dawg_cache_ = new DawgCache();
+    dawg_cache_is_ours_ = true;
+  }
+}
+
+// Loads the dawgs needed by Tesseract for language `lang`. Call FinishLoad()
+// after.
+//
+// Cached dawgs are fetched through dawg_cache_, so SetupForLoad() has to have
+// been called first. Each load_*_dawg parameter enables one cached dawg.
+// User word and pattern lists, when configured, are read into freshly
+// allocated tries. The document and pending tries are always created.
+void Dict::Load(const STRING& lang, TessdataManager* data_file) {
+  // Cached dawgs. Those that are found are appended to dawgs_ in this order.
+  if (load_punc_dawg) {
+    punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
+                                              dawg_debug_level, data_file);
+    if (punc_dawg_ != nullptr) {
+      dawgs_ += punc_dawg_;
+    }
+  }
+  if (load_system_dawg) {
+    Dawg* cached_system = dawg_cache_->GetSquishedDawg(
+        lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
+    if (cached_system != nullptr) {
+      dawgs_ += cached_system;
+    }
+  }
+  if (load_number_dawg) {
+    Dawg* cached_number = dawg_cache_->GetSquishedDawg(
+        lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
+    if (cached_number != nullptr) {
+      dawgs_ += cached_number;
+    }
+  }
+  if (load_bigram_dawg) {
+    // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
+    // dawgs_!!
+    bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
+                                                dawg_debug_level, data_file);
+  }
+  if (load_freq_dawg) {
+    freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
+                                              dawg_debug_level, data_file);
+    if (freq_dawg_ != nullptr) {
+      dawgs_ += freq_dawg_;
+    }
+  }
+  if (load_unambig_dawg) {
+    unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
+                                                 dawg_debug_level, data_file);
+    if (unambig_dawg_ != nullptr) {
+      dawgs_ += unambig_dawg_;
+    }
+  }
+
+  // User word list: an explicit file name wins over the tessdata suffix.
+  if (!user_words_file.empty() || !user_words_suffix.empty()) {
+    STRING words_path;
+    if (user_words_file.empty()) {
+      words_path = getCCUtil()->language_data_path_prefix;
+      words_path += user_words_suffix;
+    } else {
+      words_path = user_words_file;
+    }
+    Trie* user_words = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
+                                getUnicharset().size(), dawg_debug_level);
+    const bool words_loaded = user_words->read_and_add_word_list(
+        words_path.c_str(), getUnicharset(), Trie::RRP_REVERSE_IF_HAS_RTL);
+    if (words_loaded) {
+      dawgs_ += user_words;
+    } else {
+      tprintf("Error: failed to load %s\n", words_path.c_str());
+      delete user_words;
+    }
+  }
+
+  // User pattern list: same file-name-over-suffix rule as for the words.
+  if (!user_patterns_file.empty() || !user_patterns_suffix.empty()) {
+    STRING patterns_path;
+    if (user_patterns_file.empty()) {
+      patterns_path = getCCUtil()->language_data_path_prefix;
+      patterns_path += user_patterns_suffix;
+    } else {
+      patterns_path = user_patterns_file;
+    }
+    Trie* user_patterns = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
+                                   getUnicharset().size(), dawg_debug_level);
+    user_patterns->initialize_patterns(&(getUnicharset()));
+    const bool patterns_loaded =
+        user_patterns->read_pattern_list(patterns_path.c_str(), getUnicharset());
+    if (patterns_loaded) {
+      dawgs_ += user_patterns;
+    } else {
+      tprintf("Error: failed to load %s\n", patterns_path.c_str());
+      delete user_patterns;
+    }
+  }
+
+  // Trie that collects the words accepted by add_document_word().
+  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
+                             getUnicharset().size(), dawg_debug_level);
+  dawgs_ += document_words_;
+
+  // This dawg is temporary and should not be searched by letter_is_ok, so it
+  // is kept out of dawgs_.
+  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
+                            getUnicharset().size(), dawg_debug_level);
+}
+
+// Loads the dawgs needed by the LSTM model for language `lang`. Call
+// FinishLoad() after.
+//
+// Only the LSTM punctuation, system and number dawgs are fetched from
+// dawg_cache_ (each gated by its load_*_dawg parameter). User word and
+// pattern lists are handled as in Dict::Load.
+void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) {
+  // Cached dawgs. Those that are found are appended to dawgs_ in this order.
+  if (load_punc_dawg) {
+    punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
+                                              dawg_debug_level, data_file);
+    if (punc_dawg_ != nullptr) {
+      dawgs_ += punc_dawg_;
+    }
+  }
+  if (load_system_dawg) {
+    Dawg* cached_system = dawg_cache_->GetSquishedDawg(
+        lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
+    if (cached_system != nullptr) {
+      dawgs_ += cached_system;
+    }
+  }
+  if (load_number_dawg) {
+    Dawg* cached_number = dawg_cache_->GetSquishedDawg(
+        lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
+    if (cached_number != nullptr) {
+      dawgs_ += cached_number;
+    }
+  }
+
+  // stolen from Dict::Load (but needs params_ from Tesseract
+  // langdata/config/api):
+  // User word list: an explicit file name wins over the tessdata suffix.
+  if (!user_words_file.empty() || !user_words_suffix.empty()) {
+    STRING words_path;
+    if (user_words_file.empty()) {
+      words_path = getCCUtil()->language_data_path_prefix;
+      words_path += user_words_suffix;
+    } else {
+      words_path = user_words_file;
+    }
+    Trie* user_words = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
+                                getUnicharset().size(), dawg_debug_level);
+    const bool words_loaded = user_words->read_and_add_word_list(
+        words_path.c_str(), getUnicharset(), Trie::RRP_REVERSE_IF_HAS_RTL);
+    if (words_loaded) {
+      dawgs_ += user_words;
+    } else {
+      tprintf("Error: failed to load %s\n", words_path.c_str());
+      delete user_words;
+    }
+  }
+
+  // User pattern list: same file-name-over-suffix rule as for the words.
+  if (!user_patterns_file.empty() || !user_patterns_suffix.empty()) {
+    STRING patterns_path;
+    if (user_patterns_file.empty()) {
+      patterns_path = getCCUtil()->language_data_path_prefix;
+      patterns_path += user_patterns_suffix;
+    } else {
+      patterns_path = user_patterns_file;
+    }
+    Trie* user_patterns = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
+                                   getUnicharset().size(), dawg_debug_level);
+    user_patterns->initialize_patterns(&(getUnicharset()));
+    const bool patterns_loaded =
+        user_patterns->read_pattern_list(patterns_path.c_str(), getUnicharset());
+    if (patterns_loaded) {
+      dawgs_ += user_patterns;
+    } else {
+      tprintf("Error: failed to load %s\n", patterns_path.c_str());
+      delete user_patterns;
+    }
+  }
+}
+
+// Completes the loading process after Load() and/or LoadLSTM().
+// Returns false if no dictionaries were loaded.
+//
+// Builds successors_: entry i is a newly allocated SuccessorList holding the
+// indices j into dawgs_ such that dawgs_[j] has the same lang() as dawgs_[i]
+// and kDawgSuccessors[type(i)][type(j)] is set. A null entry in dawgs_ gets
+// an empty list so that successors_ stays index-aligned with dawgs_.
+//
+// Safe to call more than once: lists from an earlier call are discarded
+// first. Without that, a second call would append new lists after the stale
+// ones and successors_[i] would no longer describe dawgs_[i].
+bool Dict::FinishLoad() {
+  if (dawgs_.empty()) return false;
+
+  // Drop whatever an earlier FinishLoad() left behind (no-op on first use).
+  successors_.delete_data_pointers();
+  successors_.clear();
+
+  successors_.reserve(dawgs_.size());
+  for (int i = 0; i < dawgs_.size(); ++i) {
+    const Dawg* dawg = dawgs_[i];
+    auto* lst = new SuccessorList();
+    // The null check on dawg does not depend on j, so it is done once here.
+    if (dawg != nullptr) {
+      for (int j = 0; j < dawgs_.size(); ++j) {
+        const Dawg* other = dawgs_[j];
+        if (other != nullptr && (dawg->lang() == other->lang()) &&
+            kDawgSuccessors[dawg->type()][other->type()]) {
+          *lst += j;
+        }
+      }
+    }
+    successors_ += lst;
+  }
+  return true;
+}
+
+// Releases everything acquired by SetupForLoad(), Load()/LoadLSTM() and
+// FinishLoad(). Safe to call repeatedly, and safe when nothing was loaded.
+//
+// - Each dawg in dawgs_ is handed back to dawg_cache_; a dawg the cache
+//   reports it did not free (FreeDawg() returns false) is deleted here.
+// - bigram_dawg_ is handed back to the cache as well (it is never in dawgs_).
+// - A cache allocated by SetupForLoad() is deleted. This now also happens
+//   when dawgs_ is empty; the previous early return leaked the cache then.
+// - punc_dawg_, freq_dawg_, unambig_dawg_ and bigram_dawg_ are reset to
+//   nullptr. They pointed at dawgs released above, and the other methods
+//   in this file that use them test for nullptr first.
+void Dict::End() {
+  // A non-empty dawgs_ means a load ran, which in turn used dawg_cache_.
+  for (int i = 0; i < dawgs_.size(); i++) {
+    if (!dawg_cache_->FreeDawg(dawgs_[i])) {
+      delete dawgs_[i];
+    }
+  }
+  // Touch the cache only if there really is a bigram dawg to return, so a
+  // repeated End() never dereferences dawg_cache_.
+  if (bigram_dawg_ != nullptr) {
+    dawg_cache_->FreeDawg(bigram_dawg_);
+  }
+  if (dawg_cache_is_ours_) {
+    delete dawg_cache_;
+    dawg_cache_ = nullptr;
+    dawg_cache_is_ours_ = false;
+  }
+  successors_.delete_data_pointers();
+  dawgs_.clear();
+  successors_.clear();
+  // The dawgs behind these pointers have just been released.
+  punc_dawg_ = nullptr;
+  freq_dawg_ = nullptr;
+  unambig_dawg_ = nullptr;
+  bigram_dawg_ = nullptr;
+  // document_words_ was an element of dawgs_ and has been released above.
+  document_words_ = nullptr;
+  // pending_words_ is never in dawgs_, so it is deleted separately.
+  delete pending_words_;
+  pending_words_ = nullptr;
+}
+
+// Returns the permuter code for unichar_id given the active dawg positions in
+// *void_dawg_args (a DawgArgs), or NO_PERM when no dawg in dawgs_ accepts it.
+// See more extensive comments in dict.h where this function is declared.
+int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const {
+ auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);  // the caller passes a DawgArgs through the void*
+
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));  // precondition: the id belongs to unicharset
+
+ if (dawg_debug_level >= 3) {
+ tprintf(
+ "def_letter_is_okay: current unichar=%s word_end=%d"
+ " num active dawgs=%d\n",
+ getUnicharset().debug_str(unichar_id).c_str(), word_end,
+ dawg_args->active_dawgs->size());
+ }
+
+ // Do not accept words that contain kPatternUnicharID.
+ // (otherwise pattern dawgs would not function correctly).
+ // Do not accept words containing INVALID_UNICHAR_IDs.
+ if (unichar_id == Dawg::kPatternUnicharID ||
+ unichar_id == INVALID_UNICHAR_ID) {
+ dawg_args->permuter = NO_PERM;
+ return NO_PERM;  // updated_dawgs and valid_end are left untouched on this path
+ }
+
+ // Initialization: no match found yet, output positions emptied.
+ PermuterType curr_perm = NO_PERM;
+ dawg_args->updated_dawgs->clear();
+ dawg_args->valid_end = false;
+
+ // Go over the active_dawgs vector and insert DawgPosition records
+ // with the updated ref (an edge with the corresponding unichar id) into
+ // dawg_args->updated_dawgs.
+ for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
+ const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
+ const Dawg* punc_dawg =
+ pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;  // a negative index means "no punctuation dawg"
+ const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;  // a negative index means "no core dawg yet"
+
+ if (!dawg && !punc_dawg) {
+ // shouldn't happen.
+ tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
+ continue;
+ }
+ if (!dawg) {
+ // We're in the punctuation dawg. A core dawg has not been chosen.
+ NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
+ EDGE_REF punc_transition_edge =
+ punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
+ if (punc_transition_edge != NO_EDGE) {  // the punc dawg has a kPatternUnicharID edge here
+ // Find all successors, and see which can transition.
+ const SuccessorList& slist = *(successors_[pos.punc_index]);
+ for (int s = 0; s < slist.size(); ++s) {
+ int sdawg_index = slist[s];
+ const Dawg* sdawg = dawgs_[sdawg_index];
+ UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
+ EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);  // look for ch from node 0 of the successor dawg
+ if (dawg_edge != NO_EDGE) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in dawg %d\n", sdawg_index);
+ }
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
+ punc_transition_edge, false),
+ dawg_debug_level > 0,
+ "Append transition from punc dawg to current dawgs: ");
+ if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
+ if (sdawg->end_of_word(dawg_edge) &&
+ punc_dawg->end_of_word(punc_transition_edge))
+ dawg_args->valid_end = true;  // both the core word and the punctuation pattern may end here
+ }
+ }
+ }
+ EDGE_REF punc_edge =
+ punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
+ if (punc_edge != NO_EDGE) {  // the letter itself continues the punctuation dawg
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in punctuation dawg\n");
+ }
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
+ dawg_debug_level > 0, "Extend punctuation dawg: ");
+ if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
+ if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
+ }
+ continue;  // no core dawg for this position, so nothing further to try
+ }
+
+ if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
+ // We can end the main word here.
+ // If we can continue on the punc ref, add that possibility.
+ NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
+ EDGE_REF punc_edge =
+ punc_node == NO_EDGE
+ ? NO_EDGE
+ : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
+ if (punc_edge != NO_EDGE) {
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
+ punc_edge, true),
+ dawg_debug_level > 0, "Return to punctuation dawg: ");
+ if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
+ if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
+ }
+ }
+
+ if (pos.back_to_punc) continue;  // this position already went back to punctuation; do not extend the core word
+
+ // If we are dealing with the pattern dawg, look up all the
+ // possible edges, not only for the exact unichar_id, but also
+ // for all its character classes (alpha, digit, etc).
+ if (dawg->type() == DAWG_TYPE_PATTERN) {
+ ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
+ &curr_perm);
+ // There can't be any successors to dawg that is of type
+ // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
+ continue;
+ }
+
+ // Find the edge out of the node for the unichar_id.
+ NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
+ EDGE_REF edge =
+ (node == NO_EDGE)
+ ? NO_EDGE
+ : dawg->edge_char_of(
+ node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
+
+ if (dawg_debug_level >= 3) {
+ tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
+ pos.dawg_index, node, edge);
+ }
+
+ if (edge != NO_EDGE) { // the unichar was found in the current dawg
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in dawg %d\n", pos.dawg_index);
+ }
+ if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Punctuation constraint not satisfied at end of word.\n");
+ }
+ continue;  // rejected: neither curr_perm nor updated_dawgs is changed for this position
+ }
+ if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
+ if (dawg->end_of_word(edge) &&
+ (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
+ dawg_args->valid_end = true;
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
+ false),
+ dawg_debug_level > 0,
+ "Append current dawg to updated active dawgs: ");
+ }
+ } // end for
+ // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
+ // or if we found the current letter in a non-punctuation dawg. This
+ // allows preserving information on which dawg the "core" word came from.
+ // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
+ if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
+ (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
+ dawg_args->permuter = curr_perm;
+ }
+ if (dawg_debug_level >= 2) {
+ tprintf("Returning %d for permuter code for this character.\n",
+ dawg_args->permuter);
+ }
+ return dawg_args->permuter;  // the PermuterType value is returned as int
+}
+
+// Handles one DawgPosition that sits in a pattern dawg.
+//
+// The candidate ids are unichar_id itself plus whatever
+// dawg->unichar_id_to_patterns() appends for it. For each candidate, first
+// the outgoing edge from the position's starting node and then the
+// self-loop edge on pos.dawg_ref are tried. Every edge found raises
+// *curr_perm to the dawg's permuter if that is higher, sets
+// dawg_args->valid_end when the edge ends a word, and adds the advanced
+// position to dawg_args->updated_dawgs.
+void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
+                               UNICHAR_ID unichar_id, bool word_end,
+                               DawgArgs* dawg_args,
+                               PermuterType* curr_perm) const {
+  const NODE_REF start_node = GetStartingNode(dawg, pos.dawg_ref);
+
+  GenericVector<UNICHAR_ID> candidates;
+  candidates.push_back(unichar_id);
+  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &candidates);
+
+  // Records a single matching edge; NO_EDGE is silently ignored.
+  auto accept_edge = [&](EDGE_REF found) {
+    if (found == NO_EDGE) {
+      return;
+    }
+    if (dawg_debug_level >= 3) {
+      tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
+              pos.dawg_index, start_node, found);
+      tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
+    }
+    if (dawg->permuter() > *curr_perm) {
+      *curr_perm = dawg->permuter();
+    }
+    if (dawg->end_of_word(found)) {
+      dawg_args->valid_end = true;
+    }
+    dawg_args->updated_dawgs->add_unique(
+        DawgPosition(pos.dawg_index, found, pos.punc_index, pos.punc_ref,
+                     pos.back_to_punc),
+        dawg_debug_level > 0,
+        "Append current dawg to updated active dawgs: ");
+  };
+
+  for (int c = 0; c < candidates.size(); ++c) {
+    // Outgoing edge first, self-loop second; each is recorded before the
+    // next lookup is made.
+    accept_edge(dawg->edge_char_of(start_node, candidates[c], word_end));
+    accept_edge(dawg->pattern_loop_edge(pos.dawg_ref, candidates[c], word_end));
+  }
+}
+
+// Fills *active_dawgs with the positions a new word may start from.
+// When hyphenated() is true the saved hyphen_active_dawgs_ are copied (and
+// listed at debug level 3 or above). Otherwise default_dawgs() supplies the
+// start positions, with ambigs_mode passed as its second argument.
+void Dict::init_active_dawgs(DawgPositionVector* active_dawgs,
+                             bool ambigs_mode) const {
+  if (!hyphenated()) {
+    default_dawgs(active_dawgs, ambigs_mode);
+    return;
+  }
+
+  *active_dawgs = hyphen_active_dawgs_;
+  if (dawg_debug_level < 3) {
+    return;
+  }
+  for (int idx = 0; idx < hyphen_active_dawgs_.size(); ++idx) {
+    tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
+            hyphen_active_dawgs_[idx].dawg_index,
+            hyphen_active_dawgs_[idx].dawg_ref);
+  }
+}
+
+// Appends to *dawg_pos_vec the start position of every dawg a fresh word
+// may begin in.
+//
+// Null dawgs are skipped, and so are pattern dawgs when suppress_patterns is
+// set. Punctuation dawgs always get a punctuation-only start position. Any
+// other dawg gets a start position of its own unless punc_dawg_ has a
+// kPatternUnicharID edge from node 0 and kDawgSuccessors lists the dawg's
+// type as a successor of DAWG_TYPE_PUNCTUATION.
+void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec,
+                         bool suppress_patterns) const {
+  const bool punc_usable =
+      punc_dawg_ != nullptr &&
+      punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
+
+  for (int idx = 0; idx < dawgs_.size(); idx++) {
+    if (dawgs_[idx] == nullptr) {
+      continue;
+    }
+    const int kind = dawgs_[idx]->type();
+    if (suppress_patterns && kind == DAWG_TYPE_PATTERN) {
+      continue;
+    }
+
+    if (kind == DAWG_TYPE_PUNCTUATION) {
+      *dawg_pos_vec += DawgPosition(-1, NO_EDGE, idx, NO_EDGE, false);
+      if (dawg_debug_level >= 3) {
+        tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", idx,
+                NO_EDGE);
+      }
+      continue;
+    }
+
+    // Such a dawg is reached through the punctuation dawg instead.
+    if (punc_usable && kDawgSuccessors[DAWG_TYPE_PUNCTUATION][kind]) {
+      continue;
+    }
+    *dawg_pos_vec += DawgPosition(idx, NO_EDGE, -1, NO_EDGE, false);
+    if (dawg_debug_level >= 3) {
+      tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", idx, NO_EDGE);
+    }
+  }
+}
+
+// Considers best_choice for the document dictionary.
+//
+// Nothing is added when a hyphenated word is in progress, when the word is
+// already valid, when it is shorter than two unichars, or when it contains a
+// run of kDocDictMaxRepChars identical unichars. Words below
+// doc_dict_certainty_threshold, and all two-unichar words, must already be
+// in pending_words_ to be promoted. Otherwise they are only recorded there
+// (two-unichar words only if both unichars are upper case), or dropped when
+// below doc_dict_pending_threshold. With save_doc_words set, each promoted
+// word is also appended to "<imagefile>.doc".
+void Dict::add_document_word(const WERD_CHOICE& best_choice) {
+  // Do not add hyphenated word parts to the document dawg.
+  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
+  // called when the first part of the hyphenated word is
+  // discovered and while the second part of the word is recognized.
+  // hyphen_word_ is cleared in cc_recg() before the next word on
+  // the line is recognized.
+  if (hyphen_word_) {
+    return;
+  }
+
+  const int stringlen = best_choice.length();
+  if (valid_word(best_choice) || stringlen < 2) {
+    return;
+  }
+
+  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
+  if (stringlen >= kDocDictMaxRepChars) {
+    UNICHAR_ID run_id = best_choice.unichar_id(0);
+    int run_length = 1;
+    for (int pos = 1; pos < stringlen; ++pos) {
+      const UNICHAR_ID here = best_choice.unichar_id(pos);
+      if (here == run_id) {
+        ++run_length;
+        if (run_length == kDocDictMaxRepChars) return;
+      } else {
+        run_id = here;
+        run_length = 1;
+      }
+    }
+  }
+
+  const float certainty = best_choice.certainty();
+  if (certainty < doc_dict_certainty_threshold || stringlen == 2) {
+    if (certainty < doc_dict_pending_threshold) {
+      return;
+    }
+    if (!pending_words_->word_in_dawg(best_choice)) {
+      // First sighting: remember the word, but do not promote it yet.
+      if (stringlen > 2 ||
+          (stringlen == 2 &&
+           getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
+           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
+        pending_words_->add_word_to_dawg(best_choice);
+      }
+      return;
+    }
+  }
+
+  if (save_doc_words) {
+    STRING doc_path(getCCUtil()->imagefile);
+    doc_path += ".doc";
+    FILE* doc_out = fopen(doc_path.c_str(), "a");
+    if (doc_out == nullptr) {
+      tprintf("Error: Could not open file %s\n", doc_path.c_str());
+      ASSERT_HOST(doc_out);
+    }
+    fprintf(doc_out, "%s\n", best_choice.debug_string().c_str());
+    fclose(doc_out);
+  }
+  document_words_->add_word_to_dawg(best_choice);
+}
+
+// Computes the rating adjustment factor for `word` and stores it with
+// set_adjust_factor(). The rating itself is only rewritten when
+// modify_rating is true.
+//
+// The factor starts at additional_adjust. For words longer than one unichar
+// an x-height penalty is added according to xheight_consistency. Then one
+// segment_penalty_* parameter is added, chosen by nonword, case correctness,
+// punctuation validity and membership in freq_dawg_. In the freq_dawg_ case
+// the word's permuter is also set to FREQ_DAWG_PERM. Han-script words count
+// as having good case and punctuation. The rating is padded by kRatingPad
+// before scaling and unpadded afterwards. With debug set, a one-line trace
+// is printed.
+void Dict::adjust_word(WERD_CHOICE* word, bool nonword,
+                       XHeightConsistencyEnum xheight_consistency,
+                       float additional_adjust, bool modify_rating,
+                       bool debug) {
+  const bool is_han =
+      getUnicharset().han_sid() != getUnicharset().null_sid() &&
+      word->GetTopScriptID() == getUnicharset().han_sid();
+  const bool case_is_ok = is_han || case_ok(*word);
+  const bool punc_is_ok = is_han || !nonword || valid_punctuation(*word);
+
+  float adjust_factor = additional_adjust;
+  float new_rating = word->rating();
+  new_rating += kRatingPad;
+
+  const char* xheight_triggered = "";
+  if (word->length() <= 1) {
+    if (debug) {
+      tprintf("Consistency could not be calculated.\n");
+    }
+  } else {
+    // Calculate x-height and y-offset consistency penalties.
+    switch (xheight_consistency) {
+      case XH_INCONSISTENT:
+        adjust_factor += xheight_penalty_inconsistent;
+        xheight_triggered = ", xhtBAD";
+        break;
+      case XH_SUBNORMAL:
+        adjust_factor += xheight_penalty_subscripts;
+        xheight_triggered = ", xhtSUB";
+        break;
+      case XH_GOOD:
+        // leave the factor alone - all good!
+        break;
+    }
+    // TODO(eger): if nonword is true, but there is a "core" that is a dict
+    // word, negate nonword status.
+  }
+  if (debug) {
+    tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
+            word->unichar_string().c_str(), word->rating(), xheight_triggered);
+  }
+
+  // Add exactly one segment penalty; the rating is scaled once afterwards.
+  if (nonword) {  // non-dictionary word
+    if (case_is_ok && punc_is_ok) {
+      adjust_factor += segment_penalty_dict_nonword;
+      if (debug) tprintf(", W");
+    } else {
+      adjust_factor += segment_penalty_garbage;
+      if (debug) {
+        if (!case_is_ok) tprintf(", C");
+        if (!punc_is_ok) tprintf(", P");
+      }
+    }
+  } else if (!case_is_ok) {  // dictionary word with case issues
+    adjust_factor += segment_penalty_dict_case_bad;
+    if (debug) tprintf(", C");
+  } else if (!is_han && freq_dawg_ != nullptr &&
+             freq_dawg_->word_in_dawg(*word)) {  // frequent dictionary word
+    word->set_permuter(FREQ_DAWG_PERM);
+    adjust_factor += segment_penalty_dict_frequent_word;
+    if (debug) tprintf(", F");
+  } else {  // dictionary word with good case
+    adjust_factor += segment_penalty_dict_case_ok;
+    if (debug) tprintf(", ");
+  }
+  new_rating *= adjust_factor;
+
+  new_rating -= kRatingPad;
+  if (modify_rating) {
+    word->set_rating(new_rating);
+  }
+  if (debug) {
+    tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
+  }
+  word->set_adjust_factor(adjust_factor);
+}
+
+// Runs `word` through the loaded dawgs and returns the resulting permuter
+// code, or NO_PERM when the word is empty or valid_word_permuter() rejects
+// the code for the given numbers_ok.
+//
+// When a hyphenated word is in progress and it shares the word's unicharset,
+// the saved hyphen part is prepended first. Checking always starts at index
+// hyphen_base_size(). Letters are fed one at a time to letter_is_okay_, and
+// the scan stops at the first letter that yields a zero result.
+int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const {
+  WERD_CHOICE joined_word(word.unicharset());
+  const WERD_CHOICE* target = &word;
+  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
+    copy_hyphen_info(&joined_word);
+    joined_word += word;
+    target = &joined_word;
+  }
+  if (target->length() == 0) {
+    return NO_PERM;
+  }
+
+  // Two position vectors: one holds the current active dawgs, the other
+  // receives the updated ones. Their roles are exchanged after each letter.
+  DawgPositionVector position_sets[2];
+  init_active_dawgs(&(position_sets[0]), false);
+  DawgArgs dawg_args(&(position_sets[0]), &(position_sets[1]), NO_PERM);
+
+  const int last_index = target->length() - 1;
+  for (int i = hyphen_base_size(); i <= last_index; ++i) {
+    const auto letter_result =
+        (this->*letter_is_okay_)(&dawg_args, *target->unicharset(),
+                                 target->unichar_id(i), i == last_index);
+    if (!letter_result) {
+      break;
+    }
+    // Exchange the two vectors: what was just filled in becomes the active
+    // set for the next letter.
+    auto* just_consumed = dawg_args.active_dawgs;
+    dawg_args.active_dawgs = dawg_args.updated_dawgs;
+    dawg_args.updated_dawgs = just_consumed;
+  }
+
+  if (!valid_word_permuter(dawg_args.permuter, numbers_ok)) {
+    return NO_PERM;
+  }
+  return dawg_args.permuter;
+}
+
+// Returns true if "word1 word2" is found in bigram_dawg_ after
+// normalisation; returns false when no bigram dawg is loaded.
+//
+// Each word is reduced to the [start, end) range reported by
+// punct_stripped(). Every unichar in it is replaced by its normed ids,
+// except that a unichar whose single normed id is a digit becomes
+// question_unichar_id_. The two cores are joined with UNICHAR_SPACE. A word
+// with an empty core decides the result by itself: true if it is shorter
+// than three unichars.
+bool Dict::valid_bigram(const WERD_CHOICE& word1,
+                        const WERD_CHOICE& word2) const {
+  if (bigram_dawg_ == nullptr) {
+    return false;
+  }
+
+  // Extract the core word from the middle of each word with any digits
+  // replaced with question marks.
+  int w1start, w1end, w2start, w2end;
+  word1.punct_stripped(&w1start, &w1end);
+  word2.punct_stripped(&w2start, &w2end);
+
+  // We don't want to penalize a single guillemet, hyphen, etc.
+  // But our bigram list doesn't have any information about punctuation.
+  if (w1start >= w1end) return word1.length() < 3;
+  if (w2start >= w2end) return word2.length() < 3;
+
+  const UNICHARSET& uchset = getUnicharset();
+  std::vector<UNICHAR_ID> bigram_string;
+  bigram_string.reserve(w1end + w2end + 1);
+
+  // Appends the normalised ids of source[first, limit) to bigram_string.
+  auto append_core = [&](const WERD_CHOICE& source, int first, int limit) {
+    for (int pos = first; pos < limit; pos++) {
+      const auto& normed = uchset.normed_ids(source.unichar_id(pos));
+      const bool lone_digit =
+          normed.size() == 1 && uchset.get_isdigit(normed[0]);
+      if (lone_digit) {
+        bigram_string.push_back(question_unichar_id_);
+      } else {
+        bigram_string.insert(bigram_string.end(), normed.begin(), normed.end());
+      }
+    }
+  };
+
+  append_core(word1, w1start, w1end);
+  bigram_string.push_back(UNICHAR_SPACE);
+  append_core(word2, w2start, w2end);
+
+  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
+  for (const UNICHAR_ID id : bigram_string) {
+    normalized_word.append_unichar_id_space_allocated(id, 1, 0.0f, 0.0f);
+  }
+  return bigram_dawg_->word_in_dawg(normalized_word);
+}
+
+// Returns true if the punctuation layout of `word` is found in one of the
+// loaded punctuation dawgs.
+//
+// The word is reduced to a pattern first. Punctuation unichars are kept as
+// they are, and every maximal run of alphabetic or digit unichars collapses
+// into a single Dawg::kPatternUnicharID. A unichar that is neither
+// punctuation, alphabetic nor digit makes the word invalid, as does an
+// empty word.
+bool Dict::valid_punctuation(const WERD_CHOICE& word) {
+  // The function returns bool, so report failure as false rather than as the
+  // PermuterType enumerator NO_PERM that was returned here before.
+  if (word.length() == 0) return false;
+
+  WERD_CHOICE new_word(word.unicharset());
+  const int last_index = word.length() - 1;
+  for (int i = 0; i <= last_index; ++i) {
+    const UNICHAR_ID unichar_id = word.unichar_id(i);
+    if (getUnicharset().get_ispunctuation(unichar_id)) {
+      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
+      continue;
+    }
+    if (!getUnicharset().get_isalpha(unichar_id) &&
+        !getUnicharset().get_isdigit(unichar_id)) {
+      return false;  // neither punc, nor alpha, nor digit
+    }
+    // Alphabetic or digit: add one placeholder per run, not one per unichar.
+    const int new_len = new_word.length();
+    if (new_len == 0 ||
+        new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
+      new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
+    }
+  }
+
+  for (int i = 0; i < dawgs_.size(); ++i) {
+    if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
+        dawgs_[i]->word_in_dawg(new_word)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Returns true if the language is space-delimited (not CJ, or T), i.e. when
+/// the unicharset reports no positive script id for Han, Katakana or Thai.
+bool Dict::IsSpaceDelimitedLang() const {
+  const UNICHARSET& charset = getUnicharset();
+  const bool has_undelimited_script = charset.han_sid() > 0 ||
+                                      charset.katakana_sid() > 0 ||
+                                      charset.thai_sid() > 0;
+  return !has_undelimited_script;
+}
+
+} // namespace tesseract