diff options
Diffstat (limited to 'tesseract/src/dict/dict.cpp')
-rw-r--r-- | tesseract/src/dict/dict.cpp | 888 |
1 file changed, 888 insertions, 0 deletions
diff --git a/tesseract/src/dict/dict.cpp b/tesseract/src/dict/dict.cpp new file mode 100644 index 00000000..eb69b569 --- /dev/null +++ b/tesseract/src/dict/dict.cpp @@ -0,0 +1,888 @@ +/////////////////////////////////////////////////////////////////////// +// File: dict.cpp +// Description: dict class. +// Author: Samuel Charron +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "dict.h" + +#include "tprintf.h" + +#include <cstdio> + +namespace tesseract { + +class Image; + +Dict::Dict(CCUtil* ccutil) + : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), + probability_in_context_(&tesseract::Dict::def_probability_in_context), + ccutil_(ccutil), + wildcard_unichar_id_(INVALID_UNICHAR_ID), + apostrophe_unichar_id_(INVALID_UNICHAR_ID), + question_unichar_id_(INVALID_UNICHAR_ID), + slash_unichar_id_(INVALID_UNICHAR_ID), + hyphen_unichar_id_(INVALID_UNICHAR_ID), + STRING_MEMBER(user_words_file, "", "A filename of user-provided words.", + getCCUtil()->params()), + STRING_INIT_MEMBER(user_words_suffix, "", + "A suffix of user-provided words located in tessdata.", + getCCUtil()->params()), + STRING_MEMBER(user_patterns_file, "", + "A filename of user-provided patterns.", + getCCUtil()->params()), + STRING_INIT_MEMBER(user_patterns_suffix, "", + "A suffix of user-provided patterns located in " + "tessdata.", + getCCUtil()->params()), + 
BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_punc_dawg, true, + "Load dawg with punctuation" + " patterns.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_number_dawg, true, + "Load dawg with number" + " patterns.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_bigram_dawg, true, + "Load dawg with special word " + "bigrams.", + getCCUtil()->params()), + double_MEMBER(xheight_penalty_subscripts, 0.125, + "Score penalty (0.1 = 10%) added if there are subscripts " + "or superscripts in a word, but it is otherwise OK.", + getCCUtil()->params()), + double_MEMBER(xheight_penalty_inconsistent, 0.25, + "Score penalty (0.1 = 10%) added if an xheight is " + "inconsistent.", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_frequent_word, 1.0, + "Score multiplier for word matches which have good case and" + " are frequent in the given language (lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_case_ok, 1.1, + "Score multiplier for word matches that have good case " + "(lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_case_bad, 1.3125, + "Default score multiplier for word matches, which may have " + "case issues (lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_nonword, 1.25, + "Score multiplier for glyph fragment segmentations which " + "do not match a dictionary word (lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_garbage, 1.50, + "Score multiplier for poorly cased strings that are not in" + " the dictionary and generally look like garbage (lower is" + " better).", + getCCUtil()->params()), + STRING_MEMBER(output_ambig_words_file, "", + "Output file for 
ambiguities found in the dictionary", + getCCUtil()->params()), + INT_MEMBER(dawg_debug_level, 0, + "Set to 1 for general debug info" + ", to 2 for more details, to 3 to see all the debug messages", + getCCUtil()->params()), + INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", + getCCUtil()->params()), + BOOL_MEMBER(use_only_first_uft8_step, false, + "Use only the first UTF8 step of the given string" + " when computing log probabilities.", + getCCUtil()->params()), + double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", + getCCUtil()->params()), + double_MEMBER(stopper_nondict_certainty_base, -2.50, + "Certainty threshold for non-dict words", + getCCUtil()->params()), + double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, + "Reject certainty offset", getCCUtil()->params()), + INT_MEMBER(stopper_smallword_size, 2, + "Size of dict word to be treated as non-dict word", + getCCUtil()->params()), + double_MEMBER(stopper_certainty_per_char, -0.50, + "Certainty to add" + " for each dict char above small word size.", + getCCUtil()->params()), + double_MEMBER(stopper_allowable_character_badness, 3.0, + "Max certaintly variation allowed in a word (in sigma)", + getCCUtil()->params()), + INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", + getCCUtil()->params()), + BOOL_MEMBER(stopper_no_acceptable_choices, false, + "Make AcceptableChoice() always return false. Useful" + " when there is a need to explore all segmentations", + getCCUtil()->params()), + INT_MEMBER(tessedit_truncate_wordchoice_log, 10, + "Max words to keep in list", getCCUtil()->params()), + STRING_MEMBER(word_to_debug, "", + "Word for which stopper debug" + " information should be printed to stdout", + getCCUtil()->params()), + BOOL_MEMBER(segment_nonalphabetic_script, false, + "Don't use any alphabetic-specific tricks." 
+ " Set to true in the traineddata config file for" + " scripts that are cursive or inherently fixed-pitch", + getCCUtil()->params()), + BOOL_MEMBER(save_doc_words, 0, "Save Document Words", + getCCUtil()->params()), + double_MEMBER(doc_dict_pending_threshold, 0.0, + "Worst certainty for using pending dictionary", + getCCUtil()->params()), + double_MEMBER(doc_dict_certainty_threshold, -2.25, + "Worst certainty for words that can be inserted into the" + " document dictionary", + getCCUtil()->params()), + INT_MEMBER(max_permuter_attempts, 10000, + "Maximum number of different" + " character choices to consider during permutation." + " This limit is especially useful when user patterns" + " are specified, since overly generic patterns can result in" + " dawg search exploring an overly large number of options.", + getCCUtil()->params()) { + reject_offset_ = 0.0; + go_deeper_fxn_ = nullptr; + hyphen_word_ = nullptr; + last_word_on_line_ = false; + document_words_ = nullptr; + dawg_cache_ = nullptr; + dawg_cache_is_ours_ = false; + pending_words_ = nullptr; + bigram_dawg_ = nullptr; + freq_dawg_ = nullptr; + punc_dawg_ = nullptr; + unambig_dawg_ = nullptr; + wordseg_rating_adjust_factor_ = -1.0f; + output_ambig_words_file_ = nullptr; +} + +Dict::~Dict() { + End(); + delete hyphen_word_; + if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_); +} + +DawgCache* Dict::GlobalDawgCache() { + // This global cache (a singleton) will outlive every Tesseract instance + // (even those that someone else might declare as global statics). + static DawgCache cache; + return &cache; +} + +// Sets up ready for a Load or LoadLSTM. 
+void Dict::SetupForLoad(DawgCache* dawg_cache) { + if (dawgs_.size() != 0) this->End(); + + apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); + question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol); + slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol); + hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); + + if (dawg_cache != nullptr) { + dawg_cache_ = dawg_cache; + dawg_cache_is_ours_ = false; + } else { + dawg_cache_ = new DawgCache(); + dawg_cache_is_ours_ = true; + } +} + +// Loads the dawgs needed by Tesseract. Call FinishLoad() after. +void Dict::Load(const STRING& lang, TessdataManager* data_file) { + // Load dawgs_. + if (load_punc_dawg) { + punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, + dawg_debug_level, data_file); + if (punc_dawg_) dawgs_ += punc_dawg_; + } + if (load_system_dawg) { + Dawg* system_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file); + if (system_dawg) dawgs_ += system_dawg; + } + if (load_number_dawg) { + Dawg* number_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file); + if (number_dawg) dawgs_ += number_dawg; + } + if (load_bigram_dawg) { + bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, + dawg_debug_level, data_file); + // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the + // dawgs_!! 
+ } + if (load_freq_dawg) { + freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, + dawg_debug_level, data_file); + if (freq_dawg_) dawgs_ += freq_dawg_; + } + if (load_unambig_dawg) { + unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, + dawg_debug_level, data_file); + if (unambig_dawg_) dawgs_ += unambig_dawg_; + } + + STRING name; + if (!user_words_suffix.empty() || !user_words_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + if (!user_words_file.empty()) { + name = user_words_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; + } + if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(), + Trie::RRP_REVERSE_IF_HAS_RTL)) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + getUnicharset().size(), dawg_debug_level); + trie_ptr->initialize_patterns(&(getUnicharset())); + if (!user_patterns_file.empty()) { + name = user_patterns_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; + } + if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + dawgs_ += document_words_; + + // This dawg is temporary and should not be searched by letter_is_ok. + pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, + getUnicharset().size(), dawg_debug_level); +} + +// Loads the dawgs needed by the LSTM model. Call FinishLoad() after. +void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) { + // Load dawgs_. 
+ if (load_punc_dawg) { + punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, + dawg_debug_level, data_file); + if (punc_dawg_) dawgs_ += punc_dawg_; + } + if (load_system_dawg) { + Dawg* system_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file); + if (system_dawg) dawgs_ += system_dawg; + } + if (load_number_dawg) { + Dawg* number_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); + if (number_dawg) dawgs_ += number_dawg; + } + + // stolen from Dict::Load (but needs params_ from Tesseract + // langdata/config/api): + STRING name; + if (!user_words_suffix.empty() || !user_words_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + if (!user_words_file.empty()) { + name = user_words_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; + } + if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(), + Trie::RRP_REVERSE_IF_HAS_RTL)) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + getUnicharset().size(), dawg_debug_level); + trie_ptr->initialize_patterns(&(getUnicharset())); + if (!user_patterns_file.empty()) { + name = user_patterns_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; + } + if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } +} + +// Completes the loading process after Load() and/or LoadLSTM(). +// Returns false if no dictionaries were loaded. 
bool Dict::FinishLoad() {
  if (dawgs_.empty()) return false;
  // Construct a list of corresponding successors for each dawg. Each entry, i,
  // in the successors_ vector is a vector of integers that represent the
  // indices into the dawgs_ vector of the successors for dawg i.
  // Compatibility is decided by the static kDawgSuccessors type table,
  // restricted to dawgs of the same language.
  successors_.reserve(dawgs_.size());
  for (int i = 0; i < dawgs_.size(); ++i) {
    const Dawg* dawg = dawgs_[i];
    auto* lst = new SuccessorList();
    for (int j = 0; j < dawgs_.size(); ++j) {
      const Dawg* other = dawgs_[j];
      if (dawg != nullptr && other != nullptr &&
          (dawg->lang() == other->lang()) &&
          kDawgSuccessors[dawg->type()][other->type()])
        *lst += j;
    }
    successors_ += lst;
  }
  return true;
}

// Releases the dawgs, returning cached ones to the cache (deleting only
// those the cache does not own) and destroying the cache itself if this
// Dict created it in SetupForLoad().
void Dict::End() {
  if (dawgs_.size() == 0) return;  // Not safe to call twice.
  for (int i = 0; i < dawgs_.size(); i++) {
    if (!dawg_cache_->FreeDawg(dawgs_[i])) {
      delete dawgs_[i];
    }
  }
  // bigram_dawg_ is cache-managed but was never added to dawgs_.
  dawg_cache_->FreeDawg(bigram_dawg_);
  if (dawg_cache_is_ours_) {
    delete dawg_cache_;
    dawg_cache_ = nullptr;
  }
  successors_.delete_data_pointers();
  dawgs_.clear();
  successors_.clear();
  // document_words_ was owned via dawgs_, so only the pointer is reset here.
  document_words_ = nullptr;
  delete pending_words_;
  pending_words_ = nullptr;
}

// Returns true if in light of the current state unichar_id is allowed
// according to at least one of the dawgs in the dawgs_ vector.
// See more extensive comments in dict.h where this function is declared.
int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
                             UNICHAR_ID unichar_id, bool word_end) const {
  auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);

  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));

  if (dawg_debug_level >= 3) {
    tprintf(
        "def_letter_is_okay: current unichar=%s word_end=%d"
        " num active dawgs=%d\n",
        getUnicharset().debug_str(unichar_id).c_str(), word_end,
        dawg_args->active_dawgs->size());
  }

  // Do not accept words that contain kPatternUnicharID.
  // (otherwise pattern dawgs would not function correctly).
  // Do not accept words containing INVALID_UNICHAR_IDs.
  if (unichar_id == Dawg::kPatternUnicharID ||
      unichar_id == INVALID_UNICHAR_ID) {
    dawg_args->permuter = NO_PERM;
    return NO_PERM;
  }

  // Initialization.
  PermuterType curr_perm = NO_PERM;
  dawg_args->updated_dawgs->clear();
  dawg_args->valid_end = false;

  // Go over the active_dawgs vector and insert DawgPosition records
  // with the updated ref (an edge with the corresponding unichar id) into
  // dawg_args->updated_pos.
  for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
    const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
    // Negative indices mean "no dawg on this side of the position".
    const Dawg* punc_dawg =
        pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
    const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;

    if (!dawg && !punc_dawg) {
      // shouldn't happen.
      tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
      continue;
    }
    if (!dawg) {
      // We're in the punctuation dawg. A core dawg has not been chosen.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      EDGE_REF punc_transition_edge =
          punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
      if (punc_transition_edge != NO_EDGE) {
        // Find all successors, and see which can transition.
        const SuccessorList& slist = *(successors_[pos.punc_index]);
        for (int s = 0; s < slist.size(); ++s) {
          int sdawg_index = slist[s];
          const Dawg* sdawg = dawgs_[sdawg_index];
          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
          if (dawg_edge != NO_EDGE) {
            if (dawg_debug_level >= 3) {
              tprintf("Letter found in dawg %d\n", sdawg_index);
            }
            dawg_args->updated_dawgs->add_unique(
                DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
                             punc_transition_edge, false),
                dawg_debug_level > 0,
                "Append transition from punc dawg to current dawgs: ");
            if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
            if (sdawg->end_of_word(dawg_edge) &&
                punc_dawg->end_of_word(punc_transition_edge))
              dawg_args->valid_end = true;
          }
        }
      }
      // The letter may also extend the punctuation dawg itself.
      EDGE_REF punc_edge =
          punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        if (dawg_debug_level >= 3) {
          tprintf("Letter found in punctuation dawg\n");
        }
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
            dawg_debug_level > 0, "Extend punctuation dawg: ");
        if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
        if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
      }
      continue;
    }

    if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
      // We can end the main word here.
      // If we can continue on the punc ref, add that possibility.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      EDGE_REF punc_edge =
          punc_node == NO_EDGE
              ? NO_EDGE
              : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
                         punc_edge, true),
            dawg_debug_level > 0, "Return to punctuation dawg: ");
        if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
        if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
      }
    }

    // Once we've returned to trailing punctuation, the core dawg is done.
    if (pos.back_to_punc) continue;

    // If we are dealing with the pattern dawg, look up all the
    // possible edges, not only for the exact unichar_id, but also
    // for all its character classes (alpha, digit, etc).
    if (dawg->type() == DAWG_TYPE_PATTERN) {
      ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
                          &curr_perm);
      // There can't be any successors to dawg that is of type
      // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
      continue;
    }

    // Find the edge out of the node for the unichar_id.
    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
    EDGE_REF edge =
        (node == NO_EDGE)
            ? NO_EDGE
            : dawg->edge_char_of(
                  node, char_for_dawg(unicharset, unichar_id, dawg), word_end);

    if (dawg_debug_level >= 3) {
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
              pos.dawg_index, node, edge);
    }

    if (edge != NO_EDGE) {  // the unichar was found in the current dawg
      if (dawg_debug_level >= 3) {
        tprintf("Letter found in dawg %d\n", pos.dawg_index);
      }
      if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
        if (dawg_debug_level >= 3) {
          tprintf("Punctuation constraint not satisfied at end of word.\n");
        }
        continue;
      }
      if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
      if (dawg->end_of_word(edge) &&
          (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
        dawg_args->valid_end = true;
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
                       false),
          dawg_debug_level > 0,
          "Append current dawg to updated active dawgs: ");
    }
  }  // end for
  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
  // or if we found the current letter in a non-punctuation dawg. This
  // allows preserving information on which dawg the "core" word came from.
  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
      (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
    dawg_args->permuter = curr_perm;
  }
  if (dawg_debug_level >= 2) {
    tprintf("Returning %d for permuter code for this character.\n",
            dawg_args->permuter);
  }
  return dawg_args->permuter;
}

// Expands a position in a pattern dawg: tries the literal unichar_id plus
// every pattern class it maps to, along both outgoing edges and self-loops.
void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
                               UNICHAR_ID unichar_id, bool word_end,
                               DawgArgs* dawg_args,
                               PermuterType* curr_perm) const {
  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
  // Try to find the edge corresponding to the exact unichar_id and to all the
  // edges corresponding to the character class of unichar_id.
  GenericVector<UNICHAR_ID> unichar_id_patterns;
  unichar_id_patterns.push_back(unichar_id);
  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
                               &unichar_id_patterns);
  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
    // On the first iteration check all the outgoing edges.
    // On the second iteration check all self-loops.
    for (int k = 0; k < 2; ++k) {
      EDGE_REF edge =
          (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
                   : dawg->pattern_loop_edge(pos.dawg_ref,
                                             unichar_id_patterns[i], word_end);
      if (edge == NO_EDGE) continue;
      if (dawg_debug_level >= 3) {
        tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
                pos.dawg_index, node, edge);
        tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
      }
      if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
      if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
                       pos.back_to_punc),
          dawg_debug_level > 0,
          "Append current dawg to updated active dawgs: ");
    }
  }
}

// Fill the given active_dawgs vector with dawgs that could contain the
// beginning of the word. If hyphenated() returns true, copy the entries
// from hyphen_active_dawgs_ instead.
+void Dict::init_active_dawgs(DawgPositionVector* active_dawgs, + bool ambigs_mode) const { + int i; + if (hyphenated()) { + *active_dawgs = hyphen_active_dawgs_; + if (dawg_debug_level >= 3) { + for (i = 0; i < hyphen_active_dawgs_.size(); ++i) { + tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", + hyphen_active_dawgs_[i].dawg_index, + hyphen_active_dawgs_[i].dawg_ref); + } + } + } else { + default_dawgs(active_dawgs, ambigs_mode); + } +} + +void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec, + bool suppress_patterns) const { + bool punc_dawg_available = + (punc_dawg_ != nullptr) && + punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; + + for (int i = 0; i < dawgs_.size(); i++) { + if (dawgs_[i] != nullptr && + !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { + int dawg_ty = dawgs_[i]->type(); + bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; + if (dawg_ty == DAWG_TYPE_PUNCTUATION) { + *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false); + if (dawg_debug_level >= 3) { + tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, + NO_EDGE); + } + } else if (!punc_dawg_available || !subsumed_by_punc) { + *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false); + if (dawg_debug_level >= 3) { + tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); + } + } + } + } +} + +void Dict::add_document_word(const WERD_CHOICE& best_choice) { + // Do not add hyphenated word parts to the document dawg. + // hyphen_word_ will be non-nullptr after the set_hyphen_word() is + // called when the first part of the hyphenated word is + // discovered and while the second part of the word is recognized. + // hyphen_word_ is cleared in cc_recg() before the next word on + // the line is recognized. 
+ if (hyphen_word_) return; + + int stringlen = best_choice.length(); + + if (valid_word(best_choice) || stringlen < 2) return; + + // Discard words that contain >= kDocDictMaxRepChars repeating unichars. + if (best_choice.length() >= kDocDictMaxRepChars) { + int num_rep_chars = 1; + UNICHAR_ID uch_id = best_choice.unichar_id(0); + for (int i = 1; i < best_choice.length(); ++i) { + if (best_choice.unichar_id(i) != uch_id) { + num_rep_chars = 1; + uch_id = best_choice.unichar_id(i); + } else { + ++num_rep_chars; + if (num_rep_chars == kDocDictMaxRepChars) return; + } + } + } + + if (best_choice.certainty() < doc_dict_certainty_threshold || + stringlen == 2) { + if (best_choice.certainty() < doc_dict_pending_threshold) return; + + if (!pending_words_->word_in_dawg(best_choice)) { + if (stringlen > 2 || + (stringlen == 2 && + getUnicharset().get_isupper(best_choice.unichar_id(0)) && + getUnicharset().get_isupper(best_choice.unichar_id(1)))) { + pending_words_->add_word_to_dawg(best_choice); + } + return; + } + } + + if (save_doc_words) { + STRING filename(getCCUtil()->imagefile); + filename += ".doc"; + FILE* doc_word_file = fopen(filename.c_str(), "a"); + if (doc_word_file == nullptr) { + tprintf("Error: Could not open file %s\n", filename.c_str()); + ASSERT_HOST(doc_word_file); + } + fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str()); + fclose(doc_word_file); + } + document_words_->add_word_to_dawg(best_choice); +} + +void Dict::adjust_word(WERD_CHOICE* word, bool nonword, + XHeightConsistencyEnum xheight_consistency, + float additional_adjust, bool modify_rating, + bool debug) { + bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && + word->GetTopScriptID() == getUnicharset().han_sid()); + bool case_is_ok = (is_han || case_ok(*word)); + bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); + + float adjust_factor = additional_adjust; + float new_rating = word->rating(); + new_rating += kRatingPad; + const char* 
xheight_triggered = ""; + if (word->length() > 1) { + // Calculate x-height and y-offset consistency penalties. + switch (xheight_consistency) { + case XH_INCONSISTENT: + adjust_factor += xheight_penalty_inconsistent; + xheight_triggered = ", xhtBAD"; + break; + case XH_SUBNORMAL: + adjust_factor += xheight_penalty_subscripts; + xheight_triggered = ", xhtSUB"; + break; + case XH_GOOD: + // leave the factor alone - all good! + break; + } + // TODO(eger): if nonword is true, but there is a "core" that is a dict + // word, negate nonword status. + } else { + if (debug) { + tprintf("Consistency could not be calculated.\n"); + } + } + if (debug) { + tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", + word->unichar_string().c_str(), word->rating(), xheight_triggered); + } + + if (nonword) { // non-dictionary word + if (case_is_ok && punc_is_ok) { + adjust_factor += segment_penalty_dict_nonword; + new_rating *= adjust_factor; + if (debug) tprintf(", W"); + } else { + adjust_factor += segment_penalty_garbage; + new_rating *= adjust_factor; + if (debug) { + if (!case_is_ok) tprintf(", C"); + if (!punc_is_ok) tprintf(", P"); + } + } + } else { // dictionary word + if (case_is_ok) { + if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) { + word->set_permuter(FREQ_DAWG_PERM); + adjust_factor += segment_penalty_dict_frequent_word; + new_rating *= adjust_factor; + if (debug) tprintf(", F"); + } else { + adjust_factor += segment_penalty_dict_case_ok; + new_rating *= adjust_factor; + if (debug) tprintf(", "); + } + } else { + adjust_factor += segment_penalty_dict_case_bad; + new_rating *= adjust_factor; + if (debug) tprintf(", C"); + } + } + new_rating -= kRatingPad; + if (modify_rating) word->set_rating(new_rating); + if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); + word->set_adjust_factor(adjust_factor); +} + +int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const { + const WERD_CHOICE* word_ptr = &word; + WERD_CHOICE 
temp_word(word.unicharset()); + if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) { + copy_hyphen_info(&temp_word); + temp_word += word; + word_ptr = &temp_word; + } + if (word_ptr->length() == 0) return NO_PERM; + // Allocate vectors for holding current and updated + // active_dawgs and initialize them. + DawgPositionVector active_dawgs[2]; + init_active_dawgs(&(active_dawgs[0]), false); + DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); + int last_index = word_ptr->length() - 1; + // Call letter_is_okay for each letter in the word. + for (int i = hyphen_base_size(); i <= last_index; ++i) { + if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), + word_ptr->unichar_id(i), i == last_index))) + break; + // Swap active_dawgs, constraints with the corresponding updated vector. + if (dawg_args.updated_dawgs == &(active_dawgs[1])) { + dawg_args.updated_dawgs = &(active_dawgs[0]); + ++(dawg_args.active_dawgs); + } else { + ++(dawg_args.updated_dawgs); + dawg_args.active_dawgs = &(active_dawgs[0]); + } + } + return valid_word_permuter(dawg_args.permuter, numbers_ok) + ? dawg_args.permuter + : NO_PERM; +} + +bool Dict::valid_bigram(const WERD_CHOICE& word1, + const WERD_CHOICE& word2) const { + if (bigram_dawg_ == nullptr) return false; + + // Extract the core word from the middle of each word with any digits + // replaced with question marks. + int w1start, w1end, w2start, w2end; + word1.punct_stripped(&w1start, &w1end); + word2.punct_stripped(&w2start, &w2end); + + // We don't want to penalize a single guillemet, hyphen, etc. + // But our bigram list doesn't have any information about punctuation. 
+ if (w1start >= w1end) return word1.length() < 3; + if (w2start >= w2end) return word2.length() < 3; + + const UNICHARSET& uchset = getUnicharset(); + std::vector<UNICHAR_ID> bigram_string; + bigram_string.reserve(w1end + w2end + 1); + for (int i = w1start; i < w1end; i++) { + const auto &normed_ids = + getUnicharset().normed_ids(word1.unichar_id(i)); + if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) + bigram_string.push_back(question_unichar_id_); + else + bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); + } + bigram_string.push_back(UNICHAR_SPACE); + for (int i = w2start; i < w2end; i++) { + const auto &normed_ids = + getUnicharset().normed_ids(word2.unichar_id(i)); + if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) + bigram_string.push_back(question_unichar_id_); + else + bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); + } + WERD_CHOICE normalized_word(&uchset, bigram_string.size()); + for (int i = 0; i < bigram_string.size(); ++i) { + normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f, + 0.0f); + } + return bigram_dawg_->word_in_dawg(normalized_word); +} + +bool Dict::valid_punctuation(const WERD_CHOICE& word) { + if (word.length() == 0) return NO_PERM; + int i; + WERD_CHOICE new_word(word.unicharset()); + int last_index = word.length() - 1; + int new_len = 0; + for (i = 0; i <= last_index; ++i) { + UNICHAR_ID unichar_id = (word.unichar_id(i)); + if (getUnicharset().get_ispunctuation(unichar_id)) { + new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); + } else if (!getUnicharset().get_isalpha(unichar_id) && + !getUnicharset().get_isdigit(unichar_id)) { + return false; // neither punc, nor alpha, nor digit + } else if ((new_len = new_word.length()) == 0 || + new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) { + new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); + } + } + for (i = 0; i < dawgs_.size(); ++i) { + if 
(dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && + dawgs_[i]->word_in_dawg(new_word)) + return true; + } + return false; +} + +/// Returns true if the language is space-delimited (not CJ, or T). +bool Dict::IsSpaceDelimitedLang() const { + const UNICHARSET& u_set = getUnicharset(); + if (u_set.han_sid() > 0) return false; + if (u_set.katakana_sid() > 0) return false; + if (u_set.thai_sid() > 0) return false; + return true; +} + +} // namespace tesseract |