diff options
Diffstat (limited to 'tesseract/src/ccmain/fixspace.cpp')
-rw-r--r-- | tesseract/src/ccmain/fixspace.cpp | 885 |
1 files changed, 885 insertions, 0 deletions
diff --git a/tesseract/src/ccmain/fixspace.cpp b/tesseract/src/ccmain/fixspace.cpp new file mode 100644 index 00000000..c15e99d3 --- /dev/null +++ b/tesseract/src/ccmain/fixspace.cpp @@ -0,0 +1,885 @@ +/****************************************************************** + * File: fixspace.cpp (Formerly fixspace.c) + * Description: Implements a pass over the page res, exploring the alternative + * spacing possibilities, trying to use context to improve the + * word spacing + * Author: Phil Cheatle + * + * (C) Copyright 1993, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "fixspace.h" + +#include "blobs.h" // for TWERD, TBLOB, TESSLINE +#include "boxword.h" // for BoxWord +#include "errcode.h" // for ASSERT_HOST +#include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset +#include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST +#include "params.h" // for IntParam, StringParam, BoolParam, Doub... +#include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM +#include "rect.h" // for TBOX +#include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB +#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData +#include "tessvars.h" // for debug_fp +#include "tprintf.h" // for tprintf +#include "unicharset.h" // for UNICHARSET +#include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP + +#include <tesseract/ocrclass.h> // for ETEXT_DESC +#include "strngs.h" // for STRING +#include <tesseract/unichar.h> // for UNICHAR_ID + +#include <cstdint> // for INT16_MAX, int16_t, int32_t + +namespace tesseract { + +class BLOCK; +class ROW; + +#define PERFECT_WERDS 999 + +/********************************************************************** + * c_blob_comparator() + * + * Blob comparator used to sort a blob list so that blobs are in increasing + * order of left edge. + **********************************************************************/ + +static int c_blob_comparator( // sort blobs + const void *blob1p, // ptr to ptr to blob1 + const void *blob2p // ptr to ptr to blob2 + ) { + const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB* const*>(blob1p); + const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB* const*>(blob2p); + + return blob1->bounding_box ().left () - blob2->bounding_box ().left (); +} + +/** + * @name fix_fuzzy_spaces() + * Walk over the page finding sequences of words joined by fuzzy spaces. Extract + * them as a sublist, process the sublist to find the optimal arrangement of + * spaces then replace the sublist in the ROW_RES. + * + * @param monitor progress monitor + * @param word_count count of words in doc + * @param[out] page_res + */ +void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, + int32_t word_count, + PAGE_RES *page_res) { + BLOCK_RES_IT block_res_it; + ROW_RES_IT row_res_it; + WERD_RES_IT word_res_it_from; + WERD_RES_IT word_res_it_to; + WERD_RES *word_res; + WERD_RES_LIST fuzzy_space_words; + int16_t new_length; + bool prevent_null_wd_fixsp; // DON'T process blobless wds + int32_t word_index; // current word + + block_res_it.set_to_list(&page_res->block_res_list); + word_index = 0; + for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); + block_res_it.forward()) { + row_res_it.set_to_list(&block_res_it.data()->row_res_list); + for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); + row_res_it.forward()) { + word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); + while (!word_res_it_from.at_last()) { + word_res = word_res_it_from.data(); + while (!word_res_it_from.at_last() && + !(word_res->combination || + word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || + word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { + fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, + block_res_it.data()->block); + word_res = word_res_it_from.forward(); + word_index++; + if (monitor != nullptr) { + monitor->ocr_alive = true; + monitor->progress = 90 + 5 * word_index / word_count; + if (monitor->deadline_exceeded() || + (monitor->cancel != nullptr && + (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) + return; + } + } + + if (!word_res_it_from.at_last()) { + word_res_it_to = word_res_it_from; + prevent_null_wd_fixsp = + word_res->word->cblob_list()->empty(); + if (check_debug_pt(word_res, 60)) + debug_fix_space_level.set_value(10); + word_res_it_to.forward(); + word_index++; + if (monitor != nullptr) { + monitor->ocr_alive = true; + monitor->progress = 90 + 5 * word_index / word_count; + if (monitor->deadline_exceeded() || + (monitor->cancel != nullptr && + (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) + return; + } + while (!word_res_it_to.at_last () && + (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || + word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { + if (check_debug_pt(word_res, 60)) + debug_fix_space_level.set_value(10); + if (word_res->word->cblob_list()->empty()) + prevent_null_wd_fixsp = true; + word_res = word_res_it_to.forward(); + } + if (check_debug_pt(word_res, 60)) + debug_fix_space_level.set_value(10); + if (word_res->word->cblob_list()->empty()) + prevent_null_wd_fixsp = true; + if (prevent_null_wd_fixsp) { + word_res_it_from = word_res_it_to; + } else { + fuzzy_space_words.assign_to_sublist(&word_res_it_from, + &word_res_it_to); + fix_fuzzy_space_list(fuzzy_space_words, + row_res_it.data()->row, + block_res_it.data()->block); + new_length = fuzzy_space_words.length(); + word_res_it_from.add_list_before(&fuzzy_space_words); + for (; + !word_res_it_from.at_last() && new_length > 0; + new_length--) { + word_res_it_from.forward(); + } + } + if (test_pt) + debug_fix_space_level.set_value(0); + } + fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, + block_res_it.data()->block); + // Last word in row + } + } + } +} + +void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, + ROW *row, + BLOCK* block) { + int16_t best_score; + WERD_RES_LIST current_perm; + int16_t current_score; + bool improved = false; + + best_score = eval_word_spacing(best_perm); // default score + dump_words(best_perm, best_score, 1, improved); + + if (best_score != PERFECT_WERDS) + initialise_search(best_perm, current_perm); + + while ((best_score != PERFECT_WERDS) && !current_perm.empty()) { + match_current_words(current_perm, row, block); + current_score = eval_word_spacing(current_perm); + dump_words(current_perm, current_score, 2, improved); + if (current_score > best_score) { + best_perm.clear(); + best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); + best_score = current_score; + improved = true; + } + if (current_score < PERFECT_WERDS) + transform_to_next_perm(current_perm); + } + dump_words(best_perm, best_score, 3, improved); +} + +void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { + WERD_RES_IT src_it(&src_list); + WERD_RES_IT new_it(&new_list); + WERD_RES *src_wd; + WERD_RES *new_wd; + + for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { + src_wd = src_it.data(); + if (!src_wd->combination) { + new_wd = WERD_RES::deep_copy(src_wd); + new_wd->combination = false; + new_wd->part_of_combo = false; + new_it.add_after_then_move(new_wd); + } + } +} + +void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, + BLOCK* block) { + WERD_RES_IT word_it(&words); + WERD_RES *word; + // Since we are not using PAGE_RES to iterate over words, we need to update + // prev_word_best_choice_ before calling classify_word_pass2(). + prev_word_best_choice_ = nullptr; + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); + if ((!word->part_of_combo) && (word->box_word == nullptr)) { + WordData word_data(block, row, word); + SetupWordPassN(2, &word_data); + classify_word_and_language(2, nullptr, &word_data); + } + prev_word_best_choice_ = word->best_choice; + } +} + +/** + * @name eval_word_spacing() + * The basic measure is the number of characters in contextually confirmed + * words. (I.e the word is done) + * If all words are contextually confirmed the evaluation is deemed perfect. + * + * Some fiddles are done to handle "1"s as these are VERY frequent causes of + * fuzzy spaces. The problem with the basic measure is that "561 63" would score + * the same as "56163", though given our knowledge that the space is fuzzy, and + * that there is a "1" next to the fuzzy space, we need to ensure that "56163" + * is preferred. + * + * The solution is to NOT COUNT the score of any word which has a digit at one + * end and a "1Il" as the character the other side of the space. + * + * Conversely, any character next to a "1" within a word is counted as a positive + * score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of + * the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 + * sides of a "1" joined. + * + * The joined 1 rule is applied to any word REGARDLESS of contextual + * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contexutally + * confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2. + * + */ +int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { + WERD_RES_IT word_res_it(&word_res_list); + int16_t total_score = 0; + int16_t word_count = 0; + int16_t done_word_count = 0; + int16_t word_len; + int16_t i; + int16_t offset; + WERD_RES *word; // current word + int16_t prev_word_score = 0; + bool prev_word_done = false; + bool prev_char_1 = false; // prev ch a "1/I/l"? + bool prev_char_digit = false; // prev ch 2..9 or 0 + bool current_char_1 = false; + bool current_word_ok_so_far; + STRING punct_chars = "!\"`',.:;"; + bool prev_char_punct = false; + bool current_char_punct = false; + bool word_done = false; + + do { + word = word_res_it.data(); + word_done = fixspace_thinks_word_done(word); + word_count++; + if (word->tess_failed) { + total_score += prev_word_score; + if (prev_word_done) + done_word_count++; + prev_word_score = 0; + prev_char_1 = false; + prev_char_digit = false; + prev_word_done = false; + } else { + /* + Can we add the prev word score and potentially count this word? + Yes IF it didn't end in a 1 when the first char of this word is a digit + AND it didn't end in a digit when the first char of this word is a 1 + */ + word_len = word->reject_map.length(); + current_word_ok_so_far = false; + if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) || + (prev_char_digit && ( + (word_done && + word->best_choice->unichar_lengths().c_str()[0] == 1 && + word->best_choice->unichar_string()[0] == '1') || + (!word_done && STRING(conflict_set_I_l_1).contains( + word->best_choice->unichar_string()[0])))))) { + total_score += prev_word_score; + if (prev_word_done) + done_word_count++; + current_word_ok_so_far = word_done; + } + + if (current_word_ok_so_far) { + prev_word_done = true; + prev_word_score = word_len; + } else { + prev_word_done = false; + prev_word_score = 0; + } + + /* Add 1 to total score for every joined 1 regardless of context and + rejtn */ + for (i = 0, prev_char_1 = false; i < word_len; i++) { + current_char_1 = word->best_choice->unichar_string()[i] == '1'; + if (prev_char_1 || (current_char_1 && (i > 0))) + total_score++; + prev_char_1 = current_char_1; + } + + /* Add 1 to total score for every joined punctuation regardless of context + and rejtn */ + if (tessedit_prefer_joined_punct) { + for (i = 0, offset = 0, prev_char_punct = false; i < word_len; + offset += word->best_choice->unichar_lengths()[i++]) { + current_char_punct = + punct_chars.contains(word->best_choice->unichar_string()[offset]); + if (prev_char_punct || (current_char_punct && i > 0)) + total_score++; + prev_char_punct = current_char_punct; + } + } + prev_char_digit = digit_or_numeric_punct(word, word_len - 1); + for (i = 0, offset = 0; i < word_len - 1; + offset += word->best_choice->unichar_lengths()[i++]); + prev_char_1 = + ((word_done && (word->best_choice->unichar_string()[offset] == '1')) + || (!word_done && STRING(conflict_set_I_l_1).contains( + word->best_choice->unichar_string()[offset]))); + } + /* Find next word */ + do { + word_res_it.forward(); + } while (word_res_it.data()->part_of_combo); + } while (!word_res_it.at_first()); + total_score += prev_word_score; + if (prev_word_done) + done_word_count++; + if (done_word_count == word_count) + return PERFECT_WERDS; + else + return total_score; +} + +bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { + int i; + int offset; + + for (i = 0, offset = 0; i < char_position; + offset += word->best_choice->unichar_lengths()[i++]); + return ( + word->uch_set->get_isdigit( + word->best_choice->unichar_string().c_str() + offset, + word->best_choice->unichar_lengths()[i]) || + (word->best_choice->permuter() == NUMBER_PERM && + STRING(numeric_punctuation).contains( + word->best_choice->unichar_string().c_str()[offset]))); +} + +/** + * @name transform_to_next_perm() + * Examines the current word list to find the smallest word gap size. Then walks + * the word list closing any gaps of this size by either inserted new + * combination words, or extending existing ones. + * + * The routine COULD be limited to stop it building words longer than N blobs. + * + * If there are no more gaps then it DELETES the entire list and returns the + * empty list to cause termination. + */ +void transform_to_next_perm(WERD_RES_LIST &words) { + WERD_RES_IT word_it(&words); + WERD_RES_IT prev_word_it(&words); + WERD_RES *word; + WERD_RES *prev_word; + WERD_RES *combo; + WERD *copy_word; + int16_t prev_right = -INT16_MAX; + TBOX box; + int16_t gap; + int16_t min_gap = INT16_MAX; + + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); + if (!word->part_of_combo) { + box = word->word->bounding_box(); + if (prev_right > -INT16_MAX) { + gap = box.left() - prev_right; + if (gap < min_gap) + min_gap = gap; + } + prev_right = box.right(); + } + } + if (min_gap < INT16_MAX) { + prev_right = -INT16_MAX; // back to start + word_it.set_to_list(&words); + // Note: we can't use cycle_pt due to inserted combos at start of list. + for (; (prev_right == -INT16_MAX) || !word_it.at_first(); + word_it.forward()) { + word = word_it.data(); + if (!word->part_of_combo) { + box = word->word->bounding_box(); + if (prev_right > -INT16_MAX) { + gap = box.left() - prev_right; + if (gap <= min_gap) { + prev_word = prev_word_it.data(); + if (prev_word->combination) { + combo = prev_word; + } else { + /* Make a new combination and insert before + * the first word being joined. */ + copy_word = new WERD; + *copy_word = *(prev_word->word); + // deep copy + combo = new WERD_RES(copy_word); + combo->combination = true; + combo->x_height = prev_word->x_height; + prev_word->part_of_combo = true; + prev_word_it.add_before_then_move(combo); + } + combo->word->set_flag(W_EOL, word->word->flag(W_EOL)); + if (word->combination) { + combo->word->join_on(word->word); + // Move blobs to combo + // old combo no longer needed + delete word_it.extract(); + } else { + // Copy current wd to combo + combo->copy_on(word); + word->part_of_combo = true; + } + combo->done = false; + combo->ClearResults(); + } else { + prev_word_it = word_it; // catch up + } + } + prev_right = box.right(); + } + } + } else { + words.clear(); // signal termination + } +} + +void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, + int16_t mode, bool improved) { + WERD_RES_IT word_res_it(&perm); + + if (debug_fix_space_level > 0) { + if (mode == 1) { + stats_.dump_words_str = ""; + for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); + word_res_it.forward()) { + if (!word_res_it.data()->part_of_combo) { + stats_.dump_words_str += + word_res_it.data()->best_choice->unichar_string(); + stats_.dump_words_str += ' '; + } + } + } + + if (debug_fix_space_level > 1) { + switch (mode) { + case 1: + tprintf("EXTRACTED (%d): \"", score); + break; + case 2: + tprintf("TESTED (%d): \"", score); + break; + case 3: + tprintf("RETURNED (%d): \"", score); + break; + } + + for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); + word_res_it.forward()) { + if (!word_res_it.data()->part_of_combo) { + tprintf("%s/%1d ", + word_res_it.data()->best_choice->unichar_string().c_str(), + static_cast<int>(word_res_it.data()->best_choice->permuter())); + } + } + tprintf("\"\n"); + } else if (improved) { + tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str()); + for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); + word_res_it.forward()) { + if (!word_res_it.data()->part_of_combo) { + tprintf("%s/%1d ", + word_res_it.data()->best_choice->unichar_string().c_str(), + static_cast<int>(word_res_it.data()->best_choice->permuter())); + } + } + tprintf("\"\n"); + } + } +} + +bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) { + if (word->done) + return true; + + /* + Use all the standard pass 2 conditions for mode 5 in set_done() in + reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T + CARE WHETHER WE HAVE of/at on/an etc. + */ + if (fixsp_done_mode > 0 && + (word->tess_accepted || + (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || + fixsp_done_mode == 3) && + (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) && + ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) || + (word->best_choice->permuter() == FREQ_DAWG_PERM) || + (word->best_choice->permuter() == USER_DAWG_PERM) || + (word->best_choice->permuter() == NUMBER_PERM))) { + return true; + } else { + return false; + } +} + + +/** + * @name fix_sp_fp_word() + * Test the current word to see if it can be split by deleting noise blobs. If + * so, do the business. + * Return with the iterator pointing to the same place if the word is unchanged, + * or the last of the replacement words. + */ +void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, + BLOCK* block) { + WERD_RES *word_res; + WERD_RES_LIST sub_word_list; + WERD_RES_IT sub_word_list_it(&sub_word_list); + int16_t blob_index; + int16_t new_length; + float junk; + + word_res = word_res_it.data(); + if (word_res->word->flag(W_REP_CHAR) || + word_res->combination || + word_res->part_of_combo || + !word_res->word->flag(W_DONT_CHOP)) + return; + + blob_index = worst_noise_blob(word_res, &junk); + if (blob_index < 0) + return; + + if (debug_fix_space_level > 1) { + tprintf("FP fixspace working on \"%s\"\n", + word_res->best_choice->unichar_string().c_str()); + } + word_res->word->rej_cblob_list()->sort(c_blob_comparator); + sub_word_list_it.add_after_stay_put(word_res_it.extract()); + fix_noisy_space_list(sub_word_list, row, block); + new_length = sub_word_list.length(); + word_res_it.add_list_before(&sub_word_list); + for (; !word_res_it.at_last() && new_length > 1; new_length--) { + word_res_it.forward(); + } +} + +void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, + BLOCK* block) { + int16_t best_score; + WERD_RES_IT best_perm_it(&best_perm); + WERD_RES_LIST current_perm; + WERD_RES_IT current_perm_it(¤t_perm); + WERD_RES *old_word_res; + int16_t current_score; + bool improved = false; + + best_score = fp_eval_word_spacing(best_perm); // default score + + dump_words(best_perm, best_score, 1, improved); + + old_word_res = best_perm_it.data(); + // Even deep_copy doesn't copy the underlying WERD unless its combination + // flag is true!. + old_word_res->combination = true; // Kludge to force deep copy + current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res)); + old_word_res->combination = false; // Undo kludge + + break_noisiest_blob_word(current_perm); + + while (best_score != PERFECT_WERDS && !current_perm.empty()) { + match_current_words(current_perm, row, block); + current_score = fp_eval_word_spacing(current_perm); + dump_words(current_perm, current_score, 2, improved); + if (current_score > best_score) { + best_perm.clear(); + best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); + best_score = current_score; + improved = true; + } + if (current_score < PERFECT_WERDS) { + break_noisiest_blob_word(current_perm); + } + } + dump_words(best_perm, best_score, 3, improved); +} + + +/** + * break_noisiest_blob_word() + * Find the word with the blob which looks like the worst noise. + * Break the word into two, deleting the noise blob. + */ +void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { + WERD_RES_IT word_it(&words); + WERD_RES_IT worst_word_it; + float worst_noise_score = 9999; + int worst_blob_index = -1; // Noisiest blob of noisiest wd + int blob_index; // of wds noisiest blob + float noise_score; // of wds noisiest blob + WERD_RES *word_res; + C_BLOB_IT blob_it; + C_BLOB_IT rej_cblob_it; + C_BLOB_LIST new_blob_list; + C_BLOB_IT new_blob_it; + C_BLOB_IT new_rej_cblob_it; + WERD *new_word; + int16_t start_of_noise_blob; + int16_t i; + + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + blob_index = worst_noise_blob(word_it.data(), &noise_score); + if (blob_index > -1 && worst_noise_score > noise_score) { + worst_noise_score = noise_score; + worst_blob_index = blob_index; + worst_word_it = word_it; + } + } + if (worst_blob_index < 0) { + words.clear(); // signal termination + return; + } + + /* Now split the worst_word_it */ + + word_res = worst_word_it.data(); + + /* Move blobs before noise blob to a new bloblist */ + + new_blob_it.set_to_list(&new_blob_list); + blob_it.set_to_list(word_res->word->cblob_list()); + for (i = 0; i < worst_blob_index; i++, blob_it.forward()) { + new_blob_it.add_after_then_move(blob_it.extract()); + } + start_of_noise_blob = blob_it.data()->bounding_box().left(); + delete blob_it.extract(); // throw out noise blob + + new_word = new WERD(&new_blob_list, word_res->word); + new_word->set_flag(W_EOL, false); + word_res->word->set_flag(W_BOL, false); + word_res->word->set_blanks(1); // After break + + new_rej_cblob_it.set_to_list(new_word->rej_cblob_list()); + rej_cblob_it.set_to_list(word_res->word->rej_cblob_list()); + for (; + (!rej_cblob_it.empty() && + (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob)); + rej_cblob_it.forward()) { + new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract()); + } + + auto* new_word_res = new WERD_RES(new_word); + new_word_res->combination = true; + worst_word_it.add_before_then_move(new_word_res); + + word_res->ClearResults(); +} + +int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, + float *worst_noise_score) { + float noise_score[512]; + int i; + int min_noise_blob; // 1st contender + int max_noise_blob; // last contender + int non_noise_count; + int worst_noise_blob; // Worst blob + float small_limit = kBlnXHeight * fixsp_small_outlines_size; + float non_noise_limit = kBlnXHeight * 0.8; + + if (word_res->rebuild_word == nullptr) + return -1; // Can't handle cube words. + + // Normalised. + int blob_count = word_res->box_word->length(); + ASSERT_HOST(blob_count <= 512); + if (blob_count < 5) + return -1; // too short to split + + /* Get the noise scores for all blobs */ + + #ifndef SECURE_NAMES + if (debug_fix_space_level > 5) + tprintf("FP fixspace Noise metrics for \"%s\": ", + word_res->best_choice->unichar_string().c_str()); + #endif + + for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) { + TBLOB* blob = word_res->rebuild_word->blobs[i]; + if (word_res->reject_map[i].accepted()) + noise_score[i] = non_noise_limit; + else + noise_score[i] = blob_noise_score(blob); + + if (debug_fix_space_level > 5) + tprintf("%1.1f ", noise_score[i]); + } + if (debug_fix_space_level > 5) + tprintf("\n"); + + /* Now find the worst one which is far enough away from the end of the word */ + + non_noise_count = 0; + for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) { + if (noise_score[i] >= non_noise_limit) { + non_noise_count++; + } + } + if (non_noise_count < fixsp_non_noise_limit) + return -1; + + min_noise_blob = i; + + non_noise_count = 0; + for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; + i--) { + if (noise_score[i] >= non_noise_limit) { + non_noise_count++; + } + } + if (non_noise_count < fixsp_non_noise_limit) + return -1; + + max_noise_blob = i; + + if (min_noise_blob > max_noise_blob) + return -1; + + *worst_noise_score = small_limit; + worst_noise_blob = -1; + for (i = min_noise_blob; i <= max_noise_blob; i++) { + if (noise_score[i] < *worst_noise_score) { + worst_noise_blob = i; + *worst_noise_score = noise_score[i]; + } + } + return worst_noise_blob; +} + +float Tesseract::blob_noise_score(TBLOB *blob) { + TBOX box; // BB of outline + int16_t outline_count = 0; + int16_t max_dimension; + int16_t largest_outline_dimension = 0; + + for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) { + outline_count++; + box = ol->bounding_box(); + if (box.height() > box.width()) { + max_dimension = box.height(); + } else { + max_dimension = box.width(); + } + + if (largest_outline_dimension < max_dimension) + largest_outline_dimension = max_dimension; + } + + if (outline_count > 5) { + // penalise LOTS of blobs + largest_outline_dimension *= 2; + } + + box = blob->bounding_box(); + if (box.bottom() > kBlnBaselineOffset * 4 || + box.top() < kBlnBaselineOffset / 2) { + // Lax blob is if high or low + largest_outline_dimension /= 2; + } + + return largest_outline_dimension; +} + +void fixspace_dbg(WERD_RES *word) { + TBOX box = word->word->bounding_box(); + const bool show_map_detail = false; + int16_t i; + + box.print(); + tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); + tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", + word->word->cblob_list()->length(), + word->rebuild_word->NumBlobs(), + word->box_word->length()); + word->reject_map.print(debug_fp); + tprintf("\n"); + if (show_map_detail) { + tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); + for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { + tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); + word->reject_map[i].full_print(debug_fp); + } + } + + tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); + tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); +} + + +/** + * fp_eval_word_spacing() + * Evaluation function for fixed pitch word lists. + * + * Basically, count the number of "nice" characters - those which are in tess + * acceptable words or in dict words and are not rejected. + * Penalise any potential noise chars + */ +int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { + WERD_RES_IT word_it(&word_res_list); + WERD_RES *word; + int16_t score = 0; + int16_t i; + float small_limit = kBlnXHeight * fixsp_small_outlines_size; + + for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { + word = word_it.data(); + if (word->rebuild_word == nullptr) + continue; // Can't handle cube words. + if (word->done || + word->tess_accepted || + word->best_choice->permuter() == SYSTEM_DAWG_PERM || + word->best_choice->permuter() == FREQ_DAWG_PERM || + word->best_choice->permuter() == USER_DAWG_PERM || + safe_dict_word(word) > 0) { + int num_blobs = word->rebuild_word->NumBlobs(); + UNICHAR_ID space = word->uch_set->unichar_to_id(" "); + for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) { + TBLOB* blob = word->rebuild_word->blobs[i]; + if (word->best_choice->unichar_id(i) == space || + blob_noise_score(blob) < small_limit) { + score -= 1; // penalise possibly erroneous non-space + } else if (word->reject_map[i].accepted()) { + score++; + } + } + } + } + if (score < 0) + score = 0; + return score; +} + +} // namespace tesseract |