diff options
Diffstat (limited to 'tesseract/src/ccmain/paragraphs.cpp')
-rw-r--r-- | tesseract/src/ccmain/paragraphs.cpp | 2590 |
1 files changed, 2590 insertions, 0 deletions
diff --git a/tesseract/src/ccmain/paragraphs.cpp b/tesseract/src/ccmain/paragraphs.cpp new file mode 100644 index 00000000..28576579 --- /dev/null +++ b/tesseract/src/ccmain/paragraphs.cpp @@ -0,0 +1,2590 @@ +/********************************************************************** + * File: paragraphs.cpp + * Description: Paragraph detection for tesseract. + * Author: David Eger + * + * (C) Copyright 2011, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "paragraphs.h" + +#include "genericvector.h" // for GenericVector, GenericVectorEqEq +#include "helpers.h" // for UpdateRange, ClipToRange +#include "host.h" // for NearlyEqual +#include "mutableiterator.h" // for MutableIterator +#include "ocrblock.h" // for BLOCK +#include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA... +#include "ocrrow.h" // for ROW +#include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO... +#include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels +#include "pdblock.h" // for PDBLK +#include "polyblk.h" // for POLY_BLOCK +#include "ratngs.h" // for WERD_CHOICE +#include "rect.h" // for TBOX +#include "statistc.h" // for STATS +#include "strngs.h" // for STRING +#include "tprintf.h" // for tprintf +#include "unicharset.h" // for UNICHARSET +#include "werd.h" // for WERD, W_REP_CHAR + +#include <tesseract/pageiterator.h> // for PageIterator +#include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R... +#include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID + +#include <cctype> // for isspace +#include <cmath> // for abs +#include <cstdio> // for snprintf +#include <cstdlib> // for abs +#include <cstring> // for strchr, strlen +#include <algorithm> // for max +#include <memory> // for unique_ptr + +static const char * const kRLE = "\u202A"; // Right-to-Left Embedding +static const char * const kPDF = "\u202C"; // Pop Directional Formatting + +namespace tesseract { + +// Special "weak" ParagraphModels. +const ParagraphModel *kCrownLeft + = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F)); +const ParagraphModel *kCrownRight + = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F)); + +// Do the text and geometry of two rows support a paragraph break between them? +static bool LikelyParagraphStart(const RowScratchRegisters &before, + const RowScratchRegisters &after, + tesseract::ParagraphJustification j); + +// Given the width of a typical space between words, what is the threshold +// by which by which we think left and right alignments for paragraphs +// can vary and still be aligned. +static int Epsilon(int space_pix) { + return space_pix * 4 / 5; +} + +static bool AcceptableRowArgs( + int debug_level, int min_num_rows, const char *function_name, + const GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end) { + if (row_start < 0 || row_end > rows->size() || row_start > row_end) { + tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", + row_start, row_end, rows->size()); + return false; + } + if (row_end - row_start < min_num_rows) { + if (debug_level > 1) { + tprintf("# Too few rows[%d, %d) for %s.\n", + row_start, row_end, function_name); + } + return false; + } + return true; +} + +// =============================== Debug Code ================================ + +// Convert an integer to a decimal string. +static STRING StrOf(int num) { + char buffer[30]; + snprintf(buffer, sizeof(buffer), "%d", num); + return STRING(buffer); +} + +// Given a row-major matrix of unicode text and a column separator, print +// a formatted table. For ASCII, we get good column alignment. +static void PrintTable(const std::vector<std::vector<STRING> > &rows, + const STRING &colsep) { + std::vector<int> max_col_widths; + for (const auto& row : rows) { + int num_columns = row.size(); + for (int c = 0; c < num_columns; c++) { + int num_unicodes = 0; + for (int i = 0; i < row[c].size(); i++) { + if ((row[c][i] & 0xC0) != 0x80) num_unicodes++; + } + if (c >= max_col_widths.size()) { + max_col_widths.push_back(num_unicodes); + } else { + if (num_unicodes > max_col_widths[c]) + max_col_widths[c] = num_unicodes; + } + } + } + + std::vector<STRING> col_width_patterns; + for (int c = 0; c < max_col_widths.size(); c++) { + col_width_patterns.push_back( + STRING("%-") + StrOf(max_col_widths[c]) + "s"); + } + + for (int r = 0; r < rows.size(); r++) { + for (int c = 0; c < rows[r].size(); c++) { + if (c > 0) + tprintf("%s", colsep.c_str()); + tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str()); + } + tprintf("\n"); + } +} + +static STRING RtlEmbed(const STRING &word, bool rtlify) { + if (rtlify) + return STRING(kRLE) + word + STRING(kPDF); + return word; +} + +// Print the current thoughts of the paragraph detector. +static void PrintDetectorState(const ParagraphTheory &theory, + const GenericVector<RowScratchRegisters> &rows) { + std::vector<std::vector<STRING> > output; + output.push_back(std::vector<STRING>()); + output.back().push_back("#row"); + output.back().push_back("space"); + output.back().push_back(".."); + output.back().push_back("lword[widthSEL]"); + output.back().push_back("rword[widthSEL]"); + RowScratchRegisters::AppendDebugHeaderFields(&output.back()); + output.back().push_back("text"); + + for (int i = 0; i < rows.size(); i++) { + output.push_back(std::vector<STRING>()); + std::vector<STRING> &row = output.back(); + const RowInfo& ri = *rows[i].ri_; + row.push_back(StrOf(i)); + row.push_back(StrOf(ri.average_interword_space)); + row.push_back(ri.has_leaders ? ".." : " "); + row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + + "[" + StrOf(ri.lword_box.width()) + + (ri.lword_likely_starts_idea ? "S" : "s") + + (ri.lword_likely_ends_idea ? "E" : "e") + + (ri.lword_indicates_list_item ? "L" : "l") + + "]"); + row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + + "[" + StrOf(ri.rword_box.width()) + + (ri.rword_likely_starts_idea ? "S" : "s") + + (ri.rword_likely_ends_idea ? "E" : "e") + + (ri.rword_indicates_list_item ? "L" : "l") + + "]"); + rows[i].AppendDebugInfo(theory, &row); + row.push_back(RtlEmbed(ri.text, !ri.ltr)); + } + PrintTable(output, " "); + + tprintf("Active Paragraph Models:\n"); + unsigned m = 0; + for (const auto& model : theory.models()) { + tprintf(" %d: %s\n", ++m, model->ToString().c_str()); + } +} + +static void DebugDump( + bool should_print, + const STRING &phase, + const ParagraphTheory &theory, + const GenericVector<RowScratchRegisters> &rows) { + if (!should_print) + return; + tprintf("# %s\n", phase.c_str()); + PrintDetectorState(theory, rows); +} + +// Print out the text for rows[row_start, row_end) +static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, + int row_start, int row_end) { + tprintf("======================================\n"); + for (int row = row_start; row < row_end; row++) { + tprintf("%s\n", rows[row].ri_->text.c_str()); + } + tprintf("======================================\n"); +} + +// ============= Brain Dead Language Model (ASCII Version) =================== + +static bool IsLatinLetter(int ch) { + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); +} + +static bool IsDigitLike(int ch) { + return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I'; +} + +static bool IsOpeningPunct(int ch) { + return strchr("'\"({[", ch) != nullptr; +} + +static bool IsTerminalPunct(int ch) { + return strchr(":'\".?!]})", ch) != nullptr; +} + +// Return a pointer after consuming as much text as qualifies as roman numeral. +static const char *SkipChars(const char *str, const char *toskip) { + while (*str != '\0' && strchr(toskip, *str)) { str++; } + return str; +} + +static const char *SkipChars(const char *str, bool (*skip)(int)) { + while (*str != '\0' && skip(*str)) { str++; } + return str; +} + +static const char *SkipOne(const char *str, const char *toskip) { + if (*str != '\0' && strchr(toskip, *str)) return str + 1; + return str; +} + +// Return whether it is very likely that this is a numeral marker that could +// start a list item. Some examples include: +// A I iii. VI (2) 3.5. [C-4] +static bool LikelyListNumeral(const STRING &word) { + const char *kRomans = "ivxlmdIVXLMD"; + const char *kDigits = "012345789"; + const char *kOpen = "[{("; + const char *kSep = ":;-.,"; + const char *kClose = "]})"; + + int num_segments = 0; + const char *pos = word.c_str(); + while (*pos != '\0' && num_segments < 3) { + // skip up to two open parens. + const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen); + const char *numeral_end = SkipChars(numeral_start, kRomans); + if (numeral_end != numeral_start) { + // Got Roman Numeral. Great. + } else { + numeral_end = SkipChars(numeral_start, kDigits); + if (numeral_end == numeral_start) { + // If there's a single latin letter, we can use that. + numeral_end = SkipChars(numeral_start, IsLatinLetter); + if (numeral_end - numeral_start != 1) + break; + } + } + // We got some sort of numeral. + num_segments++; + // Skip any trailing parens or punctuation. + pos = SkipChars(SkipChars(numeral_end, kClose), kSep); + if (pos == numeral_end) + break; + } + return *pos == '\0'; +} + +static bool LikelyListMark(const STRING &word) { + const char *kListMarks = "0Oo*.,+."; + return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr; +} + +bool AsciiLikelyListItem(const STRING &word) { + return LikelyListMark(word) || LikelyListNumeral(word); +} + +// ========== Brain Dead Language Model (Tesseract Version) ================ + +// Return the first Unicode Codepoint from werd[pos]. +int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { + if (!u || !werd || pos > werd->length()) + return 0; + return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); +} + +// A useful helper class for finding the first j >= i so that word[j] +// does not have given character type. +class UnicodeSpanSkipper { + public: + UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) + : u_(unicharset), word_(word) { wordlen_ = word->length(); } + + // Given an input position, return the first position >= pos not punc. + int SkipPunc(int pos); + // Given an input position, return the first position >= pos not digit. + int SkipDigits(int pos); + // Given an input position, return the first position >= pos not roman. + int SkipRomans(int pos); + // Given an input position, return the first position >= pos not alpha. + int SkipAlpha(int pos); + + private: + const UNICHARSET *u_; + const WERD_CHOICE *word_; + int wordlen_; +}; + +int UnicodeSpanSkipper::SkipPunc(int pos) { + while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++; + return pos; +} + +int UnicodeSpanSkipper::SkipDigits(int pos) { + while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) || + IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++; + return pos; +} + +int UnicodeSpanSkipper::SkipRomans(int pos) { + const char *kRomans = "ivxlmdIVXLMD"; + while (pos < wordlen_) { + int ch = UnicodeFor(u_, word_, pos); + if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) break; + pos++; + } + return pos; +} + +int UnicodeSpanSkipper::SkipAlpha(int pos) { + while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++; + return pos; +} + +static bool LikelyListMarkUnicode(int ch) { + if (ch < 0x80) { + STRING single_ch; + single_ch += ch; + return LikelyListMark(single_ch); + } + switch (ch) { + // TODO(eger) expand this list of unicodes as needed. + case 0x00B0: // degree sign + case 0x2022: // bullet + case 0x25E6: // white bullet + case 0x00B7: // middle dot + case 0x25A1: // white square + case 0x25A0: // black square + case 0x25AA: // black small square + case 0x2B1D: // black very small square + case 0x25BA: // black right-pointing pointer + case 0x25CF: // black circle + case 0x25CB: // white circle + return true; + default: + break; // fall through + } + return false; +} + +// Return whether it is very likely that this is a numeral marker that could +// start a list item. Some examples include: +// A I iii. VI (2) 3.5. [C-4] +static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { + if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) + return true; + + UnicodeSpanSkipper m(u, werd); + int num_segments = 0; + int pos = 0; + while (pos < werd->length() && num_segments < 3) { + int numeral_start = m.SkipPunc(pos); + if (numeral_start > pos + 1) break; + int numeral_end = m.SkipRomans(numeral_start); + if (numeral_end == numeral_start) { + numeral_end = m.SkipDigits(numeral_start); + if (numeral_end == numeral_start) { + // If there's a single latin letter, we can use that. + numeral_end = m.SkipAlpha(numeral_start); + if (numeral_end - numeral_start != 1) + break; + } + } + // We got some sort of numeral. + num_segments++; + // Skip any trailing punctuation. + pos = m.SkipPunc(numeral_end); + if (pos == numeral_end) + break; + } + return pos == werd->length(); +} + +// ========= Brain Dead Language Model (combined entry points) ================ + +// Given the leftmost word of a line either as a Tesseract unicharset + werd +// or a utf8 string, set the following attributes for it: +// is_list - this word might be a list number or bullet. +// starts_idea - this word is likely to start a sentence. +// ends_idea - this word is likely to end a sentence. +void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, + const STRING &utf8, + bool *is_list, bool *starts_idea, bool *ends_idea) { + *is_list = false; + *starts_idea = false; + *ends_idea = false; + if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty + *ends_idea = true; + return; + } + + if (unicharset && werd) { // We have a proper werd and unicharset so use it. + if (UniLikelyListItem(unicharset, werd)) { + *is_list = true; + *starts_idea = true; + *ends_idea = true; + } + if (unicharset->get_isupper(werd->unichar_id(0))) { + *starts_idea = true; + } + if (unicharset->get_ispunctuation(werd->unichar_id(0))) { + *starts_idea = true; + *ends_idea = true; + } + } else { // Assume utf8 is mostly ASCII + if (AsciiLikelyListItem(utf8)) { + *is_list = true; + *starts_idea = true; + } + int start_letter = utf8[0]; + if (IsOpeningPunct(start_letter)) { + *starts_idea = true; + } + if (IsTerminalPunct(start_letter)) { + *ends_idea = true; + } + if (start_letter >= 'A' && start_letter <= 'Z') { + *starts_idea = true; + } + } +} + +// Given the rightmost word of a line either as a Tesseract unicharset + werd +// or a utf8 string, set the following attributes for it: +// is_list - this word might be a list number or bullet. +// starts_idea - this word is likely to start a sentence. +// ends_idea - this word is likely to end a sentence. +void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, + const STRING &utf8, + bool *is_list, bool *starts_idea, bool *ends_idea) { + *is_list = false; + *starts_idea = false; + *ends_idea = false; + if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty + *ends_idea = true; + return; + } + + if (unicharset && werd) { // We have a proper werd and unicharset so use it. + if (UniLikelyListItem(unicharset, werd)) { + *is_list = true; + *starts_idea = true; + } + UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1); + if (unicharset->get_ispunctuation(last_letter)) { + *ends_idea = true; + } + } else { // Assume utf8 is mostly ASCII + if (AsciiLikelyListItem(utf8)) { + *is_list = true; + *starts_idea = true; + } + int last_letter = utf8[utf8.size() - 1]; + if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) { + *ends_idea = true; + } + } +} + +// =============== Implementation of RowScratchRegisters ===================== +/* static */ +void RowScratchRegisters::AppendDebugHeaderFields( + std::vector<STRING> *header) { + header->push_back("[lmarg,lind;rind,rmarg]"); + header->push_back("model"); +} + +void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory, + std::vector<STRING> *dbg) const { + char s[30]; + snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", + lmargin_, lindent_, rindent_, rmargin_); + dbg->push_back(s); + STRING model_string; + model_string += static_cast<char>(GetLineType()); + model_string += ":"; + + int model_numbers = 0; + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].model == nullptr) + continue; + if (model_numbers > 0) + model_string += ","; + if (StrongModel(hypotheses_[h].model)) { + model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model)); + } else if (hypotheses_[h].model == kCrownLeft) { + model_string += "CrL"; + } else if (hypotheses_[h].model == kCrownRight) { + model_string += "CrR"; + } + model_numbers++; + } + if (model_numbers == 0) + model_string += "0"; + + dbg->push_back(model_string); +} + +void RowScratchRegisters::Init(const RowInfo &row) { + ri_ = &row; + lmargin_ = 0; + lindent_ = row.pix_ldistance; + rmargin_ = 0; + rindent_ = row.pix_rdistance; +} + +LineType RowScratchRegisters::GetLineType() const { + if (hypotheses_.empty()) + return LT_UNKNOWN; + bool has_start = false; + bool has_body = false; + for (int i = 0; i < hypotheses_.size(); i++) { + switch (hypotheses_[i].ty) { + case LT_START: has_start = true; break; + case LT_BODY: has_body = true; break; + default: + tprintf("Encountered bad value in hypothesis list: %c\n", + hypotheses_[i].ty); + break; + } + } + if (has_start && has_body) + return LT_MULTIPLE; + return has_start ? LT_START : LT_BODY; +} + +LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { + if (hypotheses_.empty()) + return LT_UNKNOWN; + bool has_start = false; + bool has_body = false; + for (int i = 0; i < hypotheses_.size(); i++) { + if (hypotheses_[i].model != model) + continue; + switch (hypotheses_[i].ty) { + case LT_START: has_start = true; break; + case LT_BODY: has_body = true; break; + default: + tprintf("Encountered bad value in hypothesis list: %c\n", + hypotheses_[i].ty); + break; + } + } + if (has_start && has_body) + return LT_MULTIPLE; + return has_start ? LT_START : LT_BODY; +} + +void RowScratchRegisters::SetStartLine() { + LineType current_lt = GetLineType(); + if (current_lt != LT_UNKNOWN && current_lt != LT_START) { + tprintf("Trying to set a line to be START when it's already BODY.\n"); + } + if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { + hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr)); + } +} + +void RowScratchRegisters::SetBodyLine() { + LineType current_lt = GetLineType(); + if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { + tprintf("Trying to set a line to be BODY when it's already START.\n"); + } + if (current_lt == LT_UNKNOWN || current_lt == LT_START) { + hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr)); + } +} + +void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { + hypotheses_.push_back_new(LineHypothesis(LT_START, model)); + int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr)); + if (old_idx >= 0) + hypotheses_.remove(old_idx); +} + +void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { + hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); + int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr)); + if (old_idx >= 0) + hypotheses_.remove(old_idx); +} + +void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) + models->push_back_new(hypotheses_[h].model); + } +} + +void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (StrongModel(hypotheses_[h].model)) + models->push_back_new(hypotheses_[h].model); + } +} + +void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { + for (int h = 0; h < hypotheses_.size(); h++) { + if (hypotheses_[h].model != nullptr) + models->push_back_new(hypotheses_[h].model); + } +} + +const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { + if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) + return nullptr; + return hypotheses_[0].model; +} + +const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { + if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) + return nullptr; + return hypotheses_[0].model; +} + +// Discard any hypotheses whose model is not in the given list. +void RowScratchRegisters::DiscardNonMatchingHypotheses( + const SetOfModels &models) { + if (models.empty()) + return; + for (int h = hypotheses_.size() - 1; h >= 0; h--) { + if (!models.contains(hypotheses_[h].model)) { + hypotheses_.remove(h); + } + } +} + +// ============ Geometry based Paragraph Detection Algorithm ================= + +struct Cluster { + Cluster() : center(0), count(0) {} + Cluster(int cen, int num) : center(cen), count(num) {} + + int center; // The center of the cluster. + int count; // The number of entries within the cluster. +}; + +class SimpleClusterer { + public: + explicit SimpleClusterer(int max_cluster_width) + : max_cluster_width_(max_cluster_width) {} + void Add(int value) { values_.push_back(value); } + int size() const { return values_.size(); } + void GetClusters(GenericVector<Cluster> *clusters); + + private: + int max_cluster_width_; + GenericVector<int> values_; +}; + +// Return the index of the cluster closest to value. +static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) { + int best_index = 0; + for (int i = 0; i < clusters.size(); i++) { + if (abs(value - clusters[i].center) < + abs(value - clusters[best_index].center)) + best_index = i; + } + return best_index; +} + +void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) { + clusters->clear(); + values_.sort(); + for (int i = 0; i < values_.size();) { + int orig_i = i; + int lo = values_[i]; + int hi = lo; + while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) { + hi = values_[i]; + } + clusters->push_back(Cluster((hi + lo) / 2, i - orig_i)); + } +} + +// Calculate left- and right-indent tab stop values seen in +// rows[row_start, row_end) given a tolerance of tolerance. +static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, int tolerance, + GenericVector<Cluster> *left_tabs, + GenericVector<Cluster> *right_tabs) { + if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) + return; + // First pass: toss all left and right indents into clusterers. + SimpleClusterer initial_lefts(tolerance); + SimpleClusterer initial_rights(tolerance); + GenericVector<Cluster> initial_left_tabs; + GenericVector<Cluster> initial_right_tabs; + for (int i = row_start; i < row_end; i++) { + initial_lefts.Add((*rows)[i].lindent_); + initial_rights.Add((*rows)[i].rindent_); + } + initial_lefts.GetClusters(&initial_left_tabs); + initial_rights.GetClusters(&initial_right_tabs); + + // Second pass: cluster only lines that are not "stray" + // An example of a stray line is a page number -- a line whose start + // and end tab-stops are far outside the typical start and end tab-stops + // for the block. + // Put another way, we only cluster data from lines whose start or end + // tab stop is frequent. + SimpleClusterer lefts(tolerance); + SimpleClusterer rights(tolerance); + + // Outlier elimination. We might want to switch this to test outlier-ness + // based on how strange a position an outlier is in instead of or in addition + // to how rare it is. These outliers get re-added if we end up having too + // few tab stops, to work with, however. + int infrequent_enough_to_ignore = 0; + if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1; + if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2; + + for (int i = row_start; i < row_end; i++) { + int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); + int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); + if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore || + initial_right_tabs[ridx].count > infrequent_enough_to_ignore) { + lefts.Add((*rows)[i].lindent_); + rights.Add((*rows)[i].rindent_); + } + } + lefts.GetClusters(left_tabs); + rights.GetClusters(right_tabs); + + if ((left_tabs->size() == 1 && right_tabs->size() >= 4) || + (right_tabs->size() == 1 && left_tabs->size() >= 4)) { + // One side is really ragged, and the other only has one tab stop, + // so those "insignificant outliers" are probably important, actually. + // This often happens on a page of an index. Add back in the ones + // we omitted in the first pass. + for (int i = row_start; i < row_end; i++) { + int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); + int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); + if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore || + initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) { + lefts.Add((*rows)[i].lindent_); + rights.Add((*rows)[i].rindent_); + } + } + } + lefts.GetClusters(left_tabs); + rights.GetClusters(right_tabs); + + // If one side is almost a two-indent aligned side, and the other clearly + // isn't, try to prune out the least frequent tab stop from that side. + if (left_tabs->size() == 3 && right_tabs->size() >= 4) { + int to_prune = -1; + for (int i = left_tabs->size() - 1; i >= 0; i--) { + if (to_prune < 0 || + (*left_tabs)[i].count < (*left_tabs)[to_prune].count) { + to_prune = i; + } + } + if (to_prune >= 0 && + (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) { + left_tabs->remove(to_prune); + } + } + if (right_tabs->size() == 3 && left_tabs->size() >= 4) { + int to_prune = -1; + for (int i = right_tabs->size() - 1; i >= 0; i--) { + if (to_prune < 0 || + (*right_tabs)[i].count < (*right_tabs)[to_prune].count) { + to_prune = i; + } + } + if (to_prune >= 0 && + (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) { + right_tabs->remove(to_prune); + } + } +} + +// Given a paragraph model mark rows[row_start, row_end) as said model +// start or body lines. +// +// Case 1: model->first_indent_ != model->body_indent_ +// Differentiating the paragraph start lines from the paragraph body lines in +// this case is easy, we just see how far each line is indented. +// +// Case 2: model->first_indent_ == model->body_indent_ +// Here, we find end-of-paragraph lines by looking for "short lines." +// What constitutes a "short line" changes depending on whether the text +// ragged-right[left] or fully justified (aligned left and right). +// +// Case 2a: Ragged Right (or Left) text. (eop_threshold == 0) +// We have a new paragraph it the first word would have at the end +// of the previous line. +// +// Case 2b: Fully Justified. (eop_threshold > 0) +// We mark a line as short (end of paragraph) if the offside indent +// is greater than eop_threshold. +static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, + const ParagraphModel *model, + bool ltr, int eop_threshold) { + if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) + return; + for (int row = row_start; row < row_end; row++) { + bool valid_first = ValidFirstLine(rows, row, model); + bool valid_body = ValidBodyLine(rows, row, model); + if (valid_first && !valid_body) { + (*rows)[row].AddStartLine(model); + } else if (valid_body && !valid_first) { + (*rows)[row].AddBodyLine(model); + } else if (valid_body && valid_first) { + bool after_eop = (row == row_start); + if (row > row_start) { + if (eop_threshold > 0) { + if (model->justification() == JUSTIFICATION_LEFT) { + after_eop = (*rows)[row - 1].rindent_ > eop_threshold; + } else { + after_eop = (*rows)[row - 1].lindent_ > eop_threshold; + } + } else { + after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], + model->justification()); + } + } + if (after_eop) { + (*rows)[row].AddStartLine(model); + } else { + (*rows)[row].AddBodyLine(model); + } + } else { + // Do nothing. Stray row. + } + } +} + +// GeometricClassifierState holds all of the information we'll use while +// trying to determine a paragraph model for the text lines in a block of +// text: +// + the rows under consideration [row_start, row_end) +// + the common left- and right-indent tab stops +// + does the block start out left-to-right or right-to-left +// Further, this struct holds the data we amass for the (single) ParagraphModel +// we'll assign to the text lines (assuming we get that far). +struct GeometricClassifierState { + GeometricClassifierState(int dbg_level, + GenericVector<RowScratchRegisters> *r, + int r_start, int r_end) + : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) { + tolerance = InterwordSpace(*r, r_start, r_end); + CalculateTabStops(r, r_start, r_end, tolerance, + &left_tabs, &right_tabs); + if (debug_level >= 3) { + tprintf("Geometry: TabStop cluster tolerance = %d; " + "%d left tabs; %d right tabs\n", + tolerance, left_tabs.size(), right_tabs.size()); + } + ltr = (*r)[r_start].ri_->ltr; + } + + void AssumeLeftJustification() { + just = tesseract::JUSTIFICATION_LEFT; + margin = (*rows)[row_start].lmargin_; + } + + void AssumeRightJustification() { + just = tesseract::JUSTIFICATION_RIGHT; + margin = (*rows)[row_start].rmargin_; + } + + // Align tabs are the tab stops the text is aligned to. + const GenericVector<Cluster> &AlignTabs() const { + if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; + return left_tabs; + } + + // Offside tabs are the tab stops opposite the tabs used to align the text. + // + // Note that for a left-to-right text which is aligned to the right such as + // this function comment, the offside tabs are the horizontal tab stops + // marking the beginning of ("Note", "this" and "marking"). + const GenericVector<Cluster> &OffsideTabs() const { + if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; + return right_tabs; + } + + // Return whether the i'th row extends from the leftmost left tab stop + // to the right most right tab stop. + bool IsFullRow(int i) const { + return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 && + ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0; + } + + int AlignsideTabIndex(int row_idx) const { + return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just)); + } + + // Given what we know about the paragraph justification (just), would the + // first word of row_b have fit at the end of row_a? + bool FirstWordWouldHaveFit(int row_a, int row_b) { + return ::tesseract::FirstWordWouldHaveFit( + (*rows)[row_a], (*rows)[row_b], just); + } + + void PrintRows() const { PrintRowRange(*rows, row_start, row_end); } + + void Fail(int min_debug_level, const char *why) const { + if (debug_level < min_debug_level) return; + tprintf("# %s\n", why); + PrintRows(); + } + + ParagraphModel Model() const { + return ParagraphModel(just, margin, first_indent, body_indent, tolerance); + } + + // We print out messages with a debug level at least as great as debug_level. + int debug_level = 0; + + // The Geometric Classifier was asked to find a single paragraph model + // to fit the text rows (*rows)[row_start, row_end) + GenericVector<RowScratchRegisters> *rows; + int row_start = 0; + int row_end = 0; + + // The amount by which we expect the text edge can vary and still be aligned. + int tolerance = 0; + + // Is the script in this text block left-to-right? + // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve + bool ltr = false; + + // These left and right tab stops were determined to be the common tab + // stops for the given text. + GenericVector<Cluster> left_tabs; + GenericVector<Cluster> right_tabs; + + // These are parameters we must determine to create a ParagraphModel. + tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN; + int margin = 0; + int first_indent = 0; + int body_indent = 0; + + // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel() + int eop_threshold = 0; +}; + +// Given a section of text where strong textual clues did not help identifying +// paragraph breaks, and for which the left and right indents have exactly +// three tab stops between them, attempt to find the paragraph breaks based +// solely on the outline of the text and whether the script is left-to-right. +// +// Algorithm Detail: +// The selected rows are in the form of a rectangle except +// for some number of "short lines" of the same length: +// +// (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx +// xxxxxxxxxxx xxxxxxxxxx # A "short" line. +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// +// We have a slightly different situation if the only short +// line is at the end of the excerpt. +// +// (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxxxx xxxxxxxxxxxx +// xxxxxxxxxxx xxxxxxxxxx # A "short" line. +// +// We'll interpret these as follows based on the reasoning in the comment for +// GeometricClassify(): +// [script direction: first indent, body indent] +// (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 +// (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 +static void GeometricClassifyThreeTabStopTextBlock( + int debug_level, + GeometricClassifierState &s, + ParagraphTheory *theory) { + int num_rows = s.row_end - s.row_start; + int num_full_rows = 0; + int last_row_full = 0; + for (int i = s.row_start; i < s.row_end; i++) { + if (s.IsFullRow(i)) { + num_full_rows++; + if (i == s.row_end - 1) last_row_full++; + } + } + + if (num_full_rows < 0.7 * num_rows) { + s.Fail(1, "Not enough full lines to know which lines start paras."); + return; + } + + // eop_threshold gets set if we're fully justified; see MarkRowsWithModel() + s.eop_threshold = 0; + + if (s.ltr) { + s.AssumeLeftJustification(); + } else { + s.AssumeRightJustification(); + } + + if (debug_level > 0) { + tprintf("# Not enough variety for clear outline classification. " + "Guessing these are %s aligned based on script.\n", + s.ltr ? "left" : "right"); + s.PrintRows(); + } + + if (s.AlignTabs().size() == 2) { // case A1 or A2 + s.first_indent = s.AlignTabs()[1].center; + s.body_indent = s.AlignTabs()[0].center; + } else { // case B1 or B2 + if (num_rows - 1 == num_full_rows - last_row_full) { + // case B2 + const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight; + (*s.rows)[s.row_start].AddStartLine(model); + for (int i = s.row_start + 1; i < s.row_end; i++) { + (*s.rows)[i].AddBodyLine(model); + } + return; + } else { + // case B1 + s.first_indent = s.body_indent = s.AlignTabs()[0].center; + s.eop_threshold = (s.OffsideTabs()[0].center + + s.OffsideTabs()[1].center) / 2; + } + } + const ParagraphModel *model = theory->AddModel(s.Model()); + MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, + s.ltr, s.eop_threshold); + return; +} + +// This function is called if strong textual clues were not available, but +// the caller hopes that the paragraph breaks will be super obvious just +// by the outline of the text. +// +// The particularly difficult case is figuring out what's going on if you +// don't have enough short paragraph end lines to tell us what's going on. +// +// For instance, let's say you have the following outline: +// +// (A1) xxxxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxx +// +// Even if we know that the text is left-to-right and so will probably be +// left-aligned, both of the following are possible texts: +// +// (A1a) 1. Here our list item +// with two full lines. +// 2. Here a second item. +// 3. Here our third one. +// +// (A1b) so ends paragraph one. +// Here starts another +// paragraph we want to +// read. This continues +// +// These examples are obvious from the text and should have been caught +// by the StrongEvidenceClassify pass. However, for languages where we don't +// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), +// it's worth guessing that (A1b) is the correct interpretation if there are +// far more "full" lines than "short" lines. +static void GeometricClassify(int debug_level, + GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) + return; + if (debug_level > 1) { + tprintf("###############################################\n"); + tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", + row_start, row_end); + tprintf("###############################################\n"); + } + RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); + + GeometricClassifierState s(debug_level, rows, row_start, row_end); + if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { + s.Fail(2, "Too much variety for simple outline classification."); + return; + } + if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { + s.Fail(1, "Not enough variety for simple outline classification."); + return; + } + if (s.left_tabs.size() + s.right_tabs.size() == 3) { + GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory); + return; + } + + // At this point, we know that one side has at least two tab stops, and the + // other side has one or two tab stops. + // Left to determine: + // (1) Which is the body indent and which is the first line indent? + // (2) Is the text fully justified? + + // If one side happens to have three or more tab stops, assume that side + // is opposite of the aligned side. + if (s.right_tabs.size() > 2) { + s.AssumeLeftJustification(); + } else if (s.left_tabs.size() > 2) { + s.AssumeRightJustification(); + } else if (s.ltr) { // guess based on script direction + s.AssumeLeftJustification(); + } else { + s.AssumeRightJustification(); + } + + if (s.AlignTabs().size() == 2) { + // For each tab stop on the aligned side, how many of them appear + // to be paragraph start lines? [first lines] + int firsts[2] = {0, 0}; + // Count the first line as a likely paragraph start line. + firsts[s.AlignsideTabIndex(s.row_start)]++; + // For each line, if the first word would have fit on the previous + // line count it as a likely paragraph start line. + bool jam_packed = true; + for (int i = s.row_start + 1; i < s.row_end; i++) { + if (s.FirstWordWouldHaveFit(i - 1, i)) { + firsts[s.AlignsideTabIndex(i)]++; + jam_packed = false; + } + } + // Make an extra accounting for the last line of the paragraph just + // in case it's the only short line in the block. That is, take its + // first word as typical and see if this looks like the *last* line + // of a paragraph. If so, mark the *other* indent as probably a first. + if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { + firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++; + } + + int percent0firsts, percent1firsts; + percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count; + percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count; + + // TODO(eger): Tune these constants if necessary. + if ((percent0firsts < 20 && 30 < percent1firsts) || + percent0firsts + 30 < percent1firsts) { + s.first_indent = s.AlignTabs()[1].center; + s.body_indent = s.AlignTabs()[0].center; + } else if ((percent1firsts < 20 && 30 < percent0firsts) || + percent1firsts + 30 < percent0firsts) { + s.first_indent = s.AlignTabs()[0].center; + s.body_indent = s.AlignTabs()[1].center; + } else { + // Ambiguous! Probably lineated (poetry) + if (debug_level > 1) { + tprintf("# Cannot determine %s indent likely to start paragraphs.\n", + s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right"); + tprintf("# Indent of %d looks like a first line %d%% of the time.\n", + s.AlignTabs()[0].center, percent0firsts); + tprintf("# Indent of %d looks like a first line %d%% of the time.\n", + s.AlignTabs()[1].center, percent1firsts); + s.PrintRows(); + } + return; + } + } else { + // There's only one tab stop for the "aligned to" side. + s.first_indent = s.body_indent = s.AlignTabs()[0].center; + } + + // At this point, we have our model. + const ParagraphModel *model = theory->AddModel(s.Model()); + + // Now all we have to do is figure out if the text is fully justified or not. + // eop_threshold: default to fully justified unless we see evidence below. + // See description on MarkRowsWithModel() + s.eop_threshold = + (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; + // If the text is not fully justified, re-set the eop_threshold to 0. + if (s.AlignTabs().size() == 2) { + // Paragraphs with a paragraph-start indent. + for (int i = s.row_start; i < s.row_end - 1; i++) { + if (ValidFirstLine(s.rows, i + 1, model) && + !NearlyEqual(s.OffsideTabs()[0].center, + (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { + // We found a non-end-of-paragraph short line: not fully justified. + s.eop_threshold = 0; + break; + } + } + } else { + // Paragraphs with no paragraph-start indent. + for (int i = s.row_start; i < s.row_end - 1; i++) { + if (!s.FirstWordWouldHaveFit(i, i + 1) && + !NearlyEqual(s.OffsideTabs()[0].center, + (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { + // We found a non-end-of-paragraph short line: not fully justified. + s.eop_threshold = 0; + break; + } + } + } + MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold); +} + +// =============== Implementation of ParagraphTheory ===================== + +const ParagraphModel* ParagraphTheory::AddModel(const ParagraphModel &model) { + for (const auto& m : *models_) { + if (m->Comparable(model)) { + return m; + } + } + auto *m = new ParagraphModel(model); + models_->push_back(m); + models_we_added_.push_back_new(m); + return m; +} + +void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { + size_t w = 0; + for (size_t r = 0; r < models_->size(); r++) { + ParagraphModel* m = (*models_)[r]; + if (!used_models.contains(m) && models_we_added_.contains(m)) { + delete m; + } else { + if (r > w) { + (*models_)[w] = m; + } + w++; + } + } + models_->resize(w); +} + +// Examine rows[start, end) and try to determine if an existing non-centered +// paragraph model would fit them perfectly. If so, return a pointer to it. +// If not, return nullptr. +const ParagraphModel *ParagraphTheory::Fits( + const GenericVector<RowScratchRegisters> *rows, int start, int end) const { + for (const auto* model : *models_) { + if (model->justification() != JUSTIFICATION_CENTER && + RowsFitModel(rows, start, end, model)) + return model; + } + return nullptr; +} + +void ParagraphTheory::NonCenteredModels(SetOfModels *models) { + for (const auto* model : *models_) { + if (model->justification() != JUSTIFICATION_CENTER) + models->push_back_new(model); + } +} + +int ParagraphTheory::IndexOf(const ParagraphModel *model) const { + int i = 0; + for (const auto* m : *models_) { + if (m == model) + return i; + i++; + } + return -1; +} + +bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, + int row, const ParagraphModel *model) { + if (!StrongModel(model)) { + tprintf("ValidFirstLine() should only be called with strong models!\n"); + } + return StrongModel(model) && + model->ValidFirstLine( + (*rows)[row].lmargin_, (*rows)[row].lindent_, + (*rows)[row].rindent_, (*rows)[row].rmargin_); +} + +bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, + int row, const ParagraphModel *model) { + if (!StrongModel(model)) { + tprintf("ValidBodyLine() should only be called with strong models!\n"); + } + return StrongModel(model) && + model->ValidBodyLine( + (*rows)[row].lmargin_, (*rows)[row].lindent_, + (*rows)[row].rindent_, (*rows)[row].rmargin_); +} + +bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, + int a, int b, const ParagraphModel *model) { + if (model != kCrownRight && model != kCrownLeft) { + tprintf("CrownCompatible() should only be called with crown models!\n"); + return false; + } + auto &row_a = (*rows)[a]; + auto &row_b = (*rows)[b]; + if (model == kCrownRight) { + return NearlyEqual(row_a.rindent_ + row_a.rmargin_, + row_b.rindent_ + row_b.rmargin_, + Epsilon(row_a.ri_->average_interword_space)); + } + return NearlyEqual(row_a.lindent_ + row_a.lmargin_, + row_b.lindent_ + row_b.lmargin_, + Epsilon(row_a.ri_->average_interword_space)); +} + + +// =============== Implementation of ParagraphModelSmearer ==================== + +ParagraphModelSmearer::ParagraphModelSmearer( + GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, ParagraphTheory *theory) + : theory_(theory), rows_(rows), row_start_(row_start), + row_end_(row_end) { + if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { + row_start_ = 0; + row_end_ = 0; + return; + } + open_models_.resize(open_models_.size() + row_end - row_start + 2); +} + +// see paragraphs_internal.h +void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) { + SetOfModels no_models; + if (row_start < row_start_) row_start = row_start_; + if (row_end > row_end_) row_end = row_end_; + + for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; + row++) { + if ((*rows_)[row].ri_->num_words == 0) { + OpenModels(row + 1) = no_models; + } else { + SetOfModels &opened = OpenModels(row); + (*rows_)[row].StartHypotheses(&opened); + + // Which models survive the transition from row to row + 1? + SetOfModels still_open; + for (int m = 0; m < opened.size(); m++) { + if (ValidFirstLine(rows_, row, opened[m]) || + ValidBodyLine(rows_, row, opened[m])) { + // This is basic filtering; we check likely paragraph starty-ness down + // below in Smear() -- you know, whether the first word would have fit + // and such. + still_open.push_back_new(opened[m]); + } + } + OpenModels(row + 1) = still_open; + } + } +} + +// see paragraphs_internal.h +void ParagraphModelSmearer::Smear() { + CalculateOpenModels(row_start_, row_end_); + + // For each row which we're unsure about (that is, it is LT_UNKNOWN or + // we have multiple LT_START hypotheses), see if there's a model that + // was recently used (an "open" model) which might model it well. + for (int i = row_start_; i < row_end_; i++) { + RowScratchRegisters &row = (*rows_)[i]; + if (row.ri_->num_words == 0) + continue; + + // Step One: + // Figure out if there are "open" models which are left-alined or + // right-aligned. This is important for determining whether the + // "first" word in a row would fit at the "end" of the previous row. + bool left_align_open = false; + bool right_align_open = false; + for (int m = 0; m < OpenModels(i).size(); m++) { + switch (OpenModels(i)[m]->justification()) { + case JUSTIFICATION_LEFT: left_align_open = true; break; + case JUSTIFICATION_RIGHT: right_align_open = true; break; + default: left_align_open = right_align_open = true; + } + } + // Step Two: + // Use that knowledge to figure out if this row is likely to + // start a paragraph. + bool likely_start; + if (i == 0) { + likely_start = true; + } else { + if ((left_align_open && right_align_open) || + (!left_align_open && !right_align_open)) { + likely_start = LikelyParagraphStart((*rows_)[i - 1], row, + JUSTIFICATION_LEFT) || + LikelyParagraphStart((*rows_)[i - 1], row, + JUSTIFICATION_RIGHT); + } else if (left_align_open) { + likely_start = LikelyParagraphStart((*rows_)[i - 1], row, + JUSTIFICATION_LEFT); + } else { + likely_start = LikelyParagraphStart((*rows_)[i - 1], row, + JUSTIFICATION_RIGHT); + } + } + + // Step Three: + // If this text line seems like an obvious first line of an + // open model, or an obvious continuation of an existing + // modelled paragraph, mark it up. + if (likely_start) { + // Add Start Hypotheses for all Open models that fit. + for (int m = 0; m < OpenModels(i).size(); m++) { + if (ValidFirstLine(rows_, i, OpenModels(i)[m])) { + row.AddStartLine(OpenModels(i)[m]); + } + } + } else { + // Add relevant body line hypotheses. + SetOfModels last_line_models; + if (i > 0) { + (*rows_)[i - 1].StrongHypotheses(&last_line_models); + } else { + theory_->NonCenteredModels(&last_line_models); + } + for (int m = 0; m < last_line_models.size(); m++) { + const ParagraphModel *model = last_line_models[m]; + if (ValidBodyLine(rows_, i, model)) + row.AddBodyLine(model); + } + } + + // Step Four: + // If we're still quite unsure about this line, go through all + // models in our theory and see if this row could be the start + // of any of our models. + if (row.GetLineType() == LT_UNKNOWN || + (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) { + SetOfModels all_models; + theory_->NonCenteredModels(&all_models); + for (int m = 0; m < all_models.size(); m++) { + if (ValidFirstLine(rows_, i, all_models[m])) { + row.AddStartLine(all_models[m]); + } + } + } + // Step Five: + // Since we may have updated the hypotheses about this row, we need + // to recalculate the Open models for the rest of rows[i + 1, row_end) + if (row.GetLineType() != LT_UNKNOWN) { + CalculateOpenModels(i + 1, row_end_); + } + } +} + +// ================ Main Paragraph Detection Algorithm ======================= + +// Find out what ParagraphModels are actually used, and discard any +// that are not. +static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows, + ParagraphTheory *theory) { + SetOfModels used_models; + for (int i = 0; i < rows.size(); i++) { + rows[i].StrongHypotheses(&used_models); + } + theory->DiscardUnusedModels(used_models); +} + +// DowngradeWeakestToCrowns: +// Forget any flush-{left, right} models unless we see two or more +// of them in sequence. +// +// In pass 3, we start to classify even flush-left paragraphs (paragraphs +// where the first line and body indent are the same) as having proper Models. +// This is generally dangerous, since if you start imagining that flush-left +// is a typical paragraph model when it is not, it will lead you to chop normal +// indented paragraphs in the middle whenever a sentence happens to start on a +// new line (see "This" above). What to do? +// What we do is to take any paragraph which is flush left and is not +// preceded by another paragraph of the same model and convert it to a "Crown" +// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder +// for later. It means that the paragraph is flush, but it would be desirable +// to mark it as the same model as following text if it fits. This downgrade +// FlushLeft -> CrownLeft -> Model of following paragraph. Means that we +// avoid making flush left Paragraph Models whenever we see a top-of-the-page +// half-of-a-paragraph. and instead we mark it the same as normal body text. +// +// Implementation: +// +// Comb backwards through the row scratch registers, and turn any +// sequences of body lines of equivalent type abutted against the beginning +// or a body or start line of a different type into a crown paragraph. +static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, + GenericVector<RowScratchRegisters> *rows) { + int start; + for (int end = rows->size(); end > 0; end = start) { + // Search back for a body line of a unique type. + const ParagraphModel *model = nullptr; + while (end > 0 && + (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) { + end--; + } + if (end == 0) break; + start = end - 1; + while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) { + start--; // walk back to the first line that is not the same body type. + } + if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && + StrongModel(model) && + NearlyEqual(model->first_indent(), model->body_indent(), + model->tolerance())) { + start--; + } + start++; + // Now rows[start, end) is a sequence of unique body hypotheses of model. + if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) + continue; + if (!StrongModel(model)) { + while (start > 0 && + CrownCompatible(rows, start - 1, start, model)) + start--; + } + if (start == 0 || + (!StrongModel(model)) || + (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) { + // crownify rows[start, end) + const ParagraphModel *crown_model = model; + if (StrongModel(model)) { + if (model->justification() == JUSTIFICATION_LEFT) + crown_model = kCrownLeft; + else + crown_model = kCrownRight; + } + (*rows)[start].SetUnknown(); + (*rows)[start].AddStartLine(crown_model); + for (int row = start + 1; row < end; row++) { + (*rows)[row].SetUnknown(); + (*rows)[row].AddBodyLine(crown_model); + } + } + } + DiscardUnusedModels(*rows, theory); +} + + +// Clear all hypotheses about lines [start, end) and reset margins. +// +// The empty space between the left of a row and the block boundary (and +// similarly for the right) is split into two pieces: margin and indent. +// In initial processing, we assume the block is tight and the margin for +// all lines is set to zero. However, if our first pass does not yield +// models for everything, it may be due to an inset paragraph like a +// block-quote. In that case, we make a second pass over that unmarked +// section of the page and reset the "margin" portion of the empty space +// to the common amount of space at the ends of the lines under consid- +// eration. This would be equivalent to percentile set to 0. However, +// sometimes we have a single character sticking out in the right margin +// of a text block (like the 'r' in 'for' on line 3 above), and we can +// really just ignore it as an outlier. To express this, we allow the +// user to specify the percentile (0..100) of indent values to use as +// the common margin for each row in the run of rows[start, end). +void RecomputeMarginsAndClearHypotheses( + GenericVector<RowScratchRegisters> *rows, int start, int end, + int percentile) { + if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) + return; + + int lmin, lmax, rmin, rmax; + lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; + rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + sr.SetUnknown(); + if (sr.ri_->num_words == 0) + continue; + UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); + UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); + } + STATS lefts(lmin, lmax + 1); + STATS rights(rmin, rmax + 1); + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + if (sr.ri_->num_words == 0) + continue; + lefts.add(sr.lmargin_ + sr.lindent_, 1); + rights.add(sr.rmargin_ + sr.rindent_, 1); + } + int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); + int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); + for (int i = start; i < end; i++) { + RowScratchRegisters &sr = (*rows)[i]; + int ldelta = ignorable_left - sr.lmargin_; + sr.lmargin_ += ldelta; + sr.lindent_ -= ldelta; + int rdelta = ignorable_right - sr.rmargin_; + sr.rmargin_ += rdelta; + sr.rindent_ -= rdelta; + } +} + +// Return the median inter-word space in rows[row_start, row_end). +int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, + int row_start, int row_end) { + if (row_end < row_start + 1) return 1; + int word_height = (rows[row_start].ri_->lword_box.height() + + rows[row_end - 1].ri_->lword_box.height()) / 2; + int word_width = (rows[row_start].ri_->lword_box.width() + + rows[row_end - 1].ri_->lword_box.width()) / 2; + STATS spacing_widths(0, 5 + word_width); + for (int i = row_start; i < row_end; i++) { + if (rows[i].ri_->num_words > 1) { + spacing_widths.add(rows[i].ri_->average_interword_space, 1); + } + } + int minimum_reasonable_space = word_height / 3; + if (minimum_reasonable_space < 2) + minimum_reasonable_space = 2; + int median = spacing_widths.median(); + return (median > minimum_reasonable_space) + ? median : minimum_reasonable_space; +} + +// Return whether the first word on the after line can fit in the space at +// the end of the before line (knowing which way the text is aligned and read). +bool FirstWordWouldHaveFit(const RowScratchRegisters &before, + const RowScratchRegisters &after, + tesseract::ParagraphJustification justification) { + if (before.ri_->num_words == 0 || after.ri_->num_words == 0) + return true; + + if (justification == JUSTIFICATION_UNKNOWN) { + tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); + } + int available_space; + if (justification == JUSTIFICATION_CENTER) { + available_space = before.lindent_ + before.rindent_; + } else { + available_space = before.OffsideIndent(justification); + } + available_space -= before.ri_->average_interword_space; + + if (before.ri_->ltr) + return after.ri_->lword_box.width() < available_space; + return after.ri_->rword_box.width() < available_space; +} + +// Return whether the first word on the after line can fit in the space at +// the end of the before line (not knowing which way the text goes) in a left +// or right alignment. +bool FirstWordWouldHaveFit(const RowScratchRegisters &before, + const RowScratchRegisters &after) { + if (before.ri_->num_words == 0 || after.ri_->num_words == 0) + return true; + + int available_space = before.lindent_; + if (before.rindent_ > available_space) + available_space = before.rindent_; + available_space -= before.ri_->average_interword_space; + + if (before.ri_->ltr) + return after.ri_->lword_box.width() < available_space; + return after.ri_->rword_box.width() < available_space; +} + +static bool TextSupportsBreak(const RowScratchRegisters &before, + const RowScratchRegisters &after) { + if (before.ri_->ltr) { + return before.ri_->rword_likely_ends_idea && + after.ri_->lword_likely_starts_idea; + } else { + return before.ri_->lword_likely_ends_idea && + after.ri_->rword_likely_starts_idea; + } +} + +static bool LikelyParagraphStart(const RowScratchRegisters &before, + const RowScratchRegisters &after, + tesseract::ParagraphJustification j) { + return before.ri_->num_words == 0 || + (FirstWordWouldHaveFit(before, after, j) && + TextSupportsBreak(before, after)); +} + +// Examine rows[start, end) and try to determine what sort of ParagraphModel +// would fit them as a single paragraph. +// If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN. +// If the rows given could be a consistent start to a paragraph, set *consistent +// true. +static ParagraphModel InternalParagraphModelByOutline( + const GenericVector<RowScratchRegisters> *rows, + int start, int end, int tolerance, bool *consistent) { + int ltr_line_count = 0; + for (int i = start; i < end; i++) { + ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr); + } + bool ltr = (ltr_line_count >= (end - start) / 2); + + *consistent = true; + if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) + return ParagraphModel(); + + // Ensure the caller only passed us a region with a common rmargin and + // lmargin. + int lmargin = (*rows)[start].lmargin_; + int rmargin = (*rows)[start].rmargin_; + int lmin, lmax, rmin, rmax, cmin, cmax; + lmin = lmax = (*rows)[start + 1].lindent_; + rmin = rmax = (*rows)[start + 1].rindent_; + cmin = cmax = 0; + for (int i = start + 1; i < end; i++) { + if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { + tprintf("Margins don't match! Software error.\n"); + *consistent = false; + return ParagraphModel(); + } + UpdateRange((*rows)[i].lindent_, &lmin, &lmax); + UpdateRange((*rows)[i].rindent_, &rmin, &rmax); + UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); + } + int ldiff = lmax - lmin; + int rdiff = rmax - rmin; + int cdiff = cmax - cmin; + if (rdiff > tolerance && ldiff > tolerance) { + if (cdiff < tolerance * 2) { + if (end - start < 3) + return ParagraphModel(); + return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); + } + *consistent = false; + return ParagraphModel(); + } + if (end - start < 3) // Don't return a model for two line paras. + return ParagraphModel(); + + // These booleans keep us from saying something is aligned left when the body + // left variance is too large. + bool body_admits_left_alignment = ldiff < tolerance; + bool body_admits_right_alignment = rdiff < tolerance; + + ParagraphModel left_model = + ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, + (lmin + lmax) / 2, tolerance); + ParagraphModel right_model = + ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, + (rmin + rmax) / 2, tolerance); + + // These booleans keep us from having an indent on the "wrong side" for the + // first line. + bool text_admits_left_alignment = ltr || left_model.is_flush(); + bool text_admits_right_alignment = !ltr || right_model.is_flush(); + + // At least one of the edges is less than tolerance in variance. + // If the other is obviously ragged, it can't be the one aligned to. + // [Note the last line is included in this raggedness.] + if (tolerance < rdiff) { + if (body_admits_left_alignment && text_admits_left_alignment) + return left_model; + *consistent = false; + return ParagraphModel(); + } + if (tolerance < ldiff) { + if (body_admits_right_alignment && text_admits_right_alignment) + return right_model; + *consistent = false; + return ParagraphModel(); + } + + // At this point, we know the body text doesn't vary much on either side. + + // If the first line juts out oddly in one direction or the other, + // that likely indicates the side aligned to. + int first_left = (*rows)[start].lindent_; + int first_right = (*rows)[start].rindent_; + + if (ltr && body_admits_left_alignment && + (first_left < lmin || first_left > lmax)) + return left_model; + if (!ltr && body_admits_right_alignment && + (first_right < rmin || first_right > rmax)) + return right_model; + + *consistent = false; + return ParagraphModel(); +} + +// Examine rows[start, end) and try to determine what sort of ParagraphModel +// would fit them as a single paragraph. If nothing fits, +// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug +// output if we're debugging. +static ParagraphModel ParagraphModelByOutline( + int debug_level, + const GenericVector<RowScratchRegisters> *rows, + int start, int end, int tolerance) { + bool unused_consistent; + ParagraphModel retval = InternalParagraphModelByOutline( + rows, start, end, tolerance, &unused_consistent); + if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { + tprintf("Could not determine a model for this paragraph:\n"); + PrintRowRange(*rows, start, end); + } + return retval; +} + +// Do rows[start, end) form a single instance of the given paragraph model? +bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, + int start, int end, const ParagraphModel *model) { + if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) + return false; + if (!ValidFirstLine(rows, start, model)) return false; + for (int i = start + 1 ; i < end; i++) { + if (!ValidBodyLine(rows, i, model)) return false; + } + return true; +} + +// Examine rows[row_start, row_end) as an independent section of text, +// and mark rows that are exceptionally clear as start-of-paragraph +// and paragraph-body lines. +// +// We presume that any lines surrounding rows[row_start, row_end) may +// have wildly different paragraph models, so we don't key any data off +// of those lines. +// +// We only take the very strongest signals, as we don't want to get +// confused and marking up centered text, poetry, or source code as +// clearly part of a typical paragraph. +static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end) { + // Record patently obvious body text. + for (int i = row_start + 1; i < row_end; i++) { + const RowScratchRegisters &prev = (*rows)[i - 1]; + RowScratchRegisters &curr = (*rows)[i]; + tesseract::ParagraphJustification typical_justification = + prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (!curr.ri_->rword_likely_starts_idea && + !curr.ri_->lword_likely_starts_idea && + !FirstWordWouldHaveFit(prev, curr, typical_justification)) { + curr.SetBodyLine(); + } + } + + // Record patently obvious start paragraph lines. + // + // It's an extremely good signal of the start of a paragraph that + // the first word would have fit on the end of the previous line. + // However, applying just that signal would have us mark random + // start lines of lineated text (poetry and source code) and some + // centered headings as paragraph start lines. Therefore, we use + // a second qualification for a paragraph start: Not only should + // the first word of this line have fit on the previous line, + // but also, this line should go full to the right of the block, + // disallowing a subsequent word from having fit on this line. + + // First row: + { + RowScratchRegisters &curr = (*rows)[row_start]; + RowScratchRegisters &next = (*rows)[row_start + 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, next, j) && + (curr.ri_->lword_likely_starts_idea || + curr.ri_->rword_likely_starts_idea)) { + curr.SetStartLine(); + } + } + // Middle rows + for (int i = row_start + 1; i < row_end - 1; i++) { + RowScratchRegisters &prev = (*rows)[i - 1]; + RowScratchRegisters &curr = (*rows)[i]; + RowScratchRegisters &next = (*rows)[i + 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, next, j) && + LikelyParagraphStart(prev, curr, j)) { + curr.SetStartLine(); + } + } + // Last row + { // the short circuit at the top means we have at least two lines. + RowScratchRegisters &prev = (*rows)[row_end - 2]; + RowScratchRegisters &curr = (*rows)[row_end - 1]; + tesseract::ParagraphJustification j = + curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; + if (curr.GetLineType() == LT_UNKNOWN && + !FirstWordWouldHaveFit(curr, curr, j) && + LikelyParagraphStart(prev, curr, j)) { + curr.SetStartLine(); + } + } +} + +// Look for sequences of a start line followed by some body lines in +// rows[row_start, row_end) and create ParagraphModels for them if +// they seem coherent. +static void ModelStrongEvidence(int debug_level, + GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, + bool allow_flush_models, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) + return; + + int start = row_start; + while (start < row_end) { + while (start < row_end && (*rows)[start].GetLineType() != LT_START) + start++; + if (start >= row_end - 1) + break; + + int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space); + int end = start; + ParagraphModel last_model; + bool next_consistent; + do { + ++end; + // rows[row, end) was consistent. + // If rows[row, end + 1) is not consistent, + // just model rows[row, end) + if (end < row_end - 1) { + RowScratchRegisters &next = (*rows)[end]; + LineType lt = next.GetLineType(); + next_consistent = lt == LT_BODY || + (lt == LT_UNKNOWN && + !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end])); + } else { + next_consistent = false; + } + if (next_consistent) { + ParagraphModel next_model = InternalParagraphModelByOutline( + rows, start, end + 1, tolerance, &next_consistent); + if (((*rows)[start].ri_->ltr && + last_model.justification() == JUSTIFICATION_LEFT && + next_model.justification() != JUSTIFICATION_LEFT) || + (!(*rows)[start].ri_->ltr && + last_model.justification() == JUSTIFICATION_RIGHT && + next_model.justification() != JUSTIFICATION_RIGHT)) { + next_consistent = false; + } + last_model = next_model; + } else { + next_consistent = false; + } + } while (next_consistent && end < row_end); + // At this point, rows[start, end) looked like it could have been a + // single paragraph. If we can make a good ParagraphModel for it, + // do so and mark this sequence with that model. + if (end > start + 1) { + // emit a new paragraph if we have more than one line. + const ParagraphModel *model = nullptr; + ParagraphModel new_model = ParagraphModelByOutline( + debug_level, rows, start, end, + Epsilon(InterwordSpace(*rows, start, end))); + if (new_model.justification() == JUSTIFICATION_UNKNOWN) { + // couldn't create a good model, oh well. + } else if (new_model.is_flush()) { + if (end == start + 2) { + // It's very likely we just got two paragraph starts in a row. + end = start + 1; + } else if (start == row_start) { + // Mark this as a Crown. + if (new_model.justification() == JUSTIFICATION_LEFT) { + model = kCrownLeft; + } else { + model = kCrownRight; + } + } else if (allow_flush_models) { + model = theory->AddModel(new_model); + } + } else { + model = theory->AddModel(new_model); + } + if (model) { + (*rows)[start].AddStartLine(model); + for (int i = start + 1; i < end; i++) { + (*rows)[i].AddBodyLine(model); + } + } + } + start = end; + } +} + +// We examine rows[row_start, row_end) and do the following: +// (1) Clear all existing hypotheses for the rows being considered. +// (2) Mark up any rows as exceptionally likely to be paragraph starts +// or paragraph body lines as such using both geometric and textual +// clues. +// (3) Form models for any sequence of start + continuation lines. +// (4) Smear the paragraph models to cover surrounding text. +static void StrongEvidenceClassify(int debug_level, + GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, + ParagraphTheory *theory) { + if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) + return; + + if (debug_level > 1) { + tprintf("#############################################\n"); + tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); + tprintf("#############################################\n"); + } + + RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); + MarkStrongEvidence(rows, row_start, row_end); + + DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows); + + // Create paragraph models. + ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory); + + DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows); + + // At this point, some rows are marked up as paragraphs with model numbers, + // and some rows are marked up as either LT_START or LT_BODY. Now let's + // smear any good paragraph hypotheses forward and backward. + ParagraphModelSmearer smearer(rows, row_start, row_end, theory); + smearer.Smear(); +} + +static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, + int row_start, int row_end, + ParagraphTheory *theory) { + for (int i = row_start + 1; i < row_end - 1; i++) { + if ((*rows)[i - 1].ri_->has_leaders && + (*rows)[i].ri_->has_leaders && + (*rows)[i + 1].ri_->has_leaders) { + const ParagraphModel *model = theory->AddModel( + ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)); + (*rows)[i].AddStartLine(model); + } + } +} + +// Collect sequences of unique hypotheses in row registers and create proper +// paragraphs for them, referencing the paragraphs in row_owners. +static void ConvertHypothesizedModelRunsToParagraphs( + int debug_level, + GenericVector<RowScratchRegisters> &rows, + GenericVector<PARA *> *row_owners, + ParagraphTheory *theory) { + int end = rows.size(); + int start; + for (; end > 0; end = start) { + start = end - 1; + const ParagraphModel *model = nullptr; + // TODO(eger): Be smarter about dealing with multiple hypotheses. + bool single_line_paragraph = false; + SetOfModels models; + rows[start].NonNullHypotheses(&models); + if (!models.empty()) { + model = models[0]; + if (rows[start].GetLineType(model) != LT_BODY) + single_line_paragraph = true; + } + if (model && !single_line_paragraph) { + // walk back looking for more body lines and then a start line. + while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { + // do nothing + } + if (start < 0 || rows[start].GetLineType(model) != LT_START) { + model = nullptr; + } + } + if (model == nullptr) { + continue; + } + // rows[start, end) should be a paragraph. + PARA *p = new PARA(); + if (model == kCrownLeft || model == kCrownRight) { + p->is_very_first_or_continuation = true; + // Crown paragraph. + // If we can find an existing ParagraphModel that fits, use it, + // else create a new one. + for (int row = end; row < rows.size(); row++) { + if ((*row_owners)[row] && + (ValidBodyLine(&rows, start, (*row_owners)[row]->model) && + (start == 0 || + ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { + model = (*row_owners)[row]->model; + break; + } + } + if (model == kCrownLeft) { + // No subsequent model fits, so cons one up. + model = theory->AddModel(ParagraphModel( + JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_, + 0, 0, Epsilon(rows[start].ri_->average_interword_space))); + } else if (model == kCrownRight) { + // No subsequent model fits, so cons one up. + model = theory->AddModel(ParagraphModel( + JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_, + 0, 0, Epsilon(rows[start].ri_->average_interword_space))); + } + } + rows[start].SetUnknown(); + rows[start].AddStartLine(model); + for (int i = start + 1; i < end; i++) { + rows[i].SetUnknown(); + rows[i].AddBodyLine(model); + } + p->model = model; + p->has_drop_cap = rows[start].ri_->has_drop_cap; + p->is_list_item = + model->justification() == JUSTIFICATION_RIGHT + ? rows[start].ri_->rword_indicates_list_item + : rows[start].ri_->lword_indicates_list_item; + for (int row = start; row < end; row++) { + if ((*row_owners)[row] != nullptr) { + tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " + "more than once!\n"); + delete (*row_owners)[row]; + } + (*row_owners)[row] = p; + } + } +} + +struct Interval { + Interval() : begin(0), end(0) {} + Interval(int b, int e) : begin(b), end(e) {} + + int begin; + int end; +}; + +// Return whether rows[row] appears to be stranded, meaning that the evidence +// for this row is very weak due to context. For instance, two lines of source +// code may happen to be indented at the same tab vector as body text starts, +// leading us to think they are two start-of-paragraph lines. This is not +// optimal. However, we also don't want to mark a sequence of short dialog +// as "weak," so our heuristic is: +// (1) If a line is surrounded by lines of unknown type, it's weak. +// (2) If two lines in a row are start lines for a given paragraph type, but +// after that the same paragraph type does not continue, they're weak. +static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, + int row) { + SetOfModels row_models; + rows[row].StrongHypotheses(&row_models); + + for (int m = 0; m < row_models.size(); m++) { + bool all_starts = rows[row].GetLineType(); + int run_length = 1; + bool continues = true; + for (int i = row - 1; i >= 0 && continues; i--) { + SetOfModels models; + rows[i].NonNullHypotheses(&models); + switch (rows[i].GetLineType(row_models[m])) { + case LT_START: run_length++; break; + case LT_MULTIPLE: // explicit fall-through + case LT_BODY: run_length++; all_starts = false; break; + case LT_UNKNOWN: // explicit fall-through + default: continues = false; + } + } + continues = true; + for (int i = row + 1; i < rows.size() && continues; i++) { + SetOfModels models; + rows[i].NonNullHypotheses(&models); + switch (rows[i].GetLineType(row_models[m])) { + case LT_START: run_length++; break; + case LT_MULTIPLE: // explicit fall-through + case LT_BODY: run_length++; all_starts = false; break; + case LT_UNKNOWN: // explicit fall-through + default: continues = false; + } + } + if (run_length > 2 || (!all_starts && run_length > 1)) return false; + } + return true; +} + +// Go through rows[row_start, row_end) and gather up sequences that need better +// classification. +// + Sequences of non-empty rows without hypotheses. +// + Crown paragraphs not immediately followed by a strongly modeled line. +// + Single line paragraphs surrounded by text that doesn't match the +// model. +static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows, + GenericVector<Interval> *to_fix, + int row_start, int row_end) { + to_fix->clear(); + for (int i = row_start; i < row_end; i++) { + bool needs_fixing = false; + + SetOfModels models; + SetOfModels models_w_crowns; + rows[i].StrongHypotheses(&models); + rows[i].NonNullHypotheses(&models_w_crowns); + if (models.empty() && !models_w_crowns.empty()) { + // Crown paragraph. Is it followed by a modeled line? + for (int end = i + 1; end < rows.size(); end++) { + SetOfModels end_models; + SetOfModels strong_end_models; + rows[end].NonNullHypotheses(&end_models); + rows[end].StrongHypotheses(&strong_end_models); + if (end_models.empty()) { + needs_fixing = true; + break; + } else if (!strong_end_models.empty()) { + needs_fixing = false; + break; + } + } + } else if (models.empty() && rows[i].ri_->num_words > 0) { + // No models at all. + needs_fixing = true; + } + + if (!needs_fixing && !models.empty()) { + needs_fixing = RowIsStranded(rows, i); + } + + if (needs_fixing) { + if (!to_fix->empty() && to_fix->back().end == i - 1) + to_fix->back().end = i; + else + to_fix->push_back(Interval(i, i)); + } + } + // Convert inclusive intervals to half-open intervals. + for (int i = 0; i < to_fix->size(); i++) { + (*to_fix)[i].end = (*to_fix)[i].end + 1; + } +} + +// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), +// normalize each row_owner to point to an actual PARA, and output the +// paragraphs in order onto paragraphs. +void CanonicalizeDetectionResults( + GenericVector<PARA *> *row_owners, + PARA_LIST *paragraphs) { + GenericVector<PARA *> &rows = *row_owners; + paragraphs->clear(); + PARA_IT out(paragraphs); + PARA *formerly_null = nullptr; + for (int i = 0; i < rows.size(); i++) { + if (rows[i] == nullptr) { + if (i == 0 || rows[i - 1] != formerly_null) { + rows[i] = formerly_null = new PARA(); + } else { + rows[i] = formerly_null; + continue; + } + } else if (i > 0 && rows[i - 1] == rows[i]) { + continue; + } + out.add_after_then_move(rows[i]); + } +} + +// Main entry point for Paragraph Detection Algorithm. +// +// Given a set of equally spaced textlines (described by row_infos), +// Split them into paragraphs. +// +// Output: +// row_owners - one pointer for each row, to the paragraph it belongs to. +// paragraphs - this is the actual list of PARA objects. +// models - the list of paragraph models referenced by the PARA objects. +// caller is responsible for deleting the models. +void DetectParagraphs(int debug_level, + std::vector<RowInfo> *row_infos, + GenericVector<PARA *> *row_owners, + PARA_LIST *paragraphs, + std::vector<ParagraphModel *> *models) { + GenericVector<RowScratchRegisters> rows; + ParagraphTheory theory(models); + + // Initialize row_owners to be a bunch of nullptr pointers. + row_owners->init_to_size(row_infos->size(), nullptr); + + // Set up row scratch registers for the main algorithm. + rows.init_to_size(row_infos->size(), RowScratchRegisters()); + for (int i = 0; i < row_infos->size(); i++) { + rows[i].Init((*row_infos)[i]); + } + + // Pass 1: + // Detect sequences of lines that all contain leader dots (.....) + // These are likely Tables of Contents. If there are three text lines in + // a row with leader dots, it's pretty safe to say the middle one should + // be a paragraph of its own. + SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory); + + DebugDump(debug_level > 1, "End of Pass 1", theory, rows); + + GenericVector<Interval> leftovers; + LeftoverSegments(rows, &leftovers, 0, rows.size()); + for (int i = 0; i < leftovers.size(); i++) { + // Pass 2a: + // Find any strongly evidenced start-of-paragraph lines. If they're + // followed by two lines that look like body lines, make a paragraph + // model for that and see if that model applies throughout the text + // (that is, "smear" it). + StrongEvidenceClassify(debug_level, &rows, + leftovers[i].begin, leftovers[i].end, &theory); + + // Pass 2b: + // If we had any luck in pass 2a, we got part of the page and didn't + // know how to classify a few runs of rows. Take the segments that + // didn't find a model and reprocess them individually. + GenericVector<Interval> leftovers2; + LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end); + bool pass2a_was_useful = leftovers2.size() > 1 || + (leftovers2.size() == 1 && + (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size())); + if (pass2a_was_useful) { + for (int j = 0; j < leftovers2.size(); j++) { + StrongEvidenceClassify(debug_level, &rows, + leftovers2[j].begin, leftovers2[j].end, + &theory); + } + } + } + + DebugDump(debug_level > 1, "End of Pass 2", theory, rows); + + // Pass 3: + // These are the dregs for which we didn't have enough strong textual + // and geometric clues to form matching models for. Let's see if + // the geometric clues are simple enough that we could just use those. + LeftoverSegments(rows, &leftovers, 0, rows.size()); + for (int i = 0; i < leftovers.size(); i++) { + GeometricClassify(debug_level, &rows, + leftovers[i].begin, leftovers[i].end, &theory); + } + + // Undo any flush models for which there's little evidence. + DowngradeWeakestToCrowns(debug_level, &theory, &rows); + + DebugDump(debug_level > 1, "End of Pass 3", theory, rows); + + // Pass 4: + // Take everything that's still not marked up well and clear all markings. + LeftoverSegments(rows, &leftovers, 0, rows.size()); + for (int i = 0; i < leftovers.size(); i++) { + for (int j = leftovers[i].begin; j < leftovers[i].end; j++) { + rows[j].SetUnknown(); + } + } + + DebugDump(debug_level > 1, "End of Pass 4", theory, rows); + + // Convert all of the unique hypothesis runs to PARAs. + ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, + &theory); + + DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows); + + // Finally, clean up any dangling nullptr row paragraph parents. + CanonicalizeDetectionResults(row_owners, paragraphs); +} + +// ============ Code interfacing with the rest of Tesseract ================== + +static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, + RowInfo *info) { + // Set up text, lword_text, and rword_text (mostly for debug printing). + STRING fake_text; + PageIterator pit(static_cast<const PageIterator&>(it)); + bool first_word = true; + if (!pit.Empty(RIL_WORD)) { + do { + fake_text += "x"; + if (first_word) info->lword_text += "x"; + info->rword_text += "x"; + if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) && + !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) { + fake_text += " "; + info->rword_text = ""; + first_word = false; + } + } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && + pit.Next(RIL_SYMBOL)); + } + if (fake_text.size() == 0) return; + + int lspaces = info->pix_ldistance / info->average_interword_space; + for (int i = 0; i < lspaces; i++) { + info->text += ' '; + } + info->text += fake_text; + + // Set up lword_box, rword_box, and num_words. + PAGE_RES_IT page_res_it = *it.PageResIt(); + WERD_RES *word_res = page_res_it.restart_row(); + ROW_RES *this_row = page_res_it.row(); + + WERD_RES *lword = nullptr; + WERD_RES *rword = nullptr; + info->num_words = 0; + do { + if (word_res) { + if (!lword) lword = word_res; + if (rword != word_res) info->num_words++; + rword = word_res; + } + word_res = page_res_it.forward(); + } while (page_res_it.row() == this_row); + + if (lword) info->lword_box = lword->word->bounding_box(); + if (rword) info->rword_box = rword->word->bounding_box(); +} + + +// Given a Tesseract Iterator pointing to a text line, fill in the paragraph +// detector RowInfo with all relevant information from the row. +static void InitializeRowInfo(bool after_recognition, + const MutableIterator &it, RowInfo *info) { + if (it.PageResIt()->row() != nullptr) { + ROW *row = it.PageResIt()->row()->row; + info->pix_ldistance = row->lmargin(); + info->pix_rdistance = row->rmargin(); + info->average_interword_space = + row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1); + info->pix_xheight = row->x_height(); + info->has_leaders = false; + info->has_drop_cap = row->has_drop_cap(); + info->ltr = true; // set below depending on word scripts + } else { + info->pix_ldistance = info->pix_rdistance = 0; + info->average_interword_space = 1; + info->pix_xheight = 1.0; + info->has_leaders = false; + info->has_drop_cap = false; + info->ltr = true; + } + + info->num_words = 0; + info->lword_indicates_list_item = false; + info->lword_likely_starts_idea = false; + info->lword_likely_ends_idea = false; + info->rword_indicates_list_item = false; + info->rword_likely_starts_idea = false; + info->rword_likely_ends_idea = false; + info->has_leaders = false; + info->ltr = true; + + if (!after_recognition) { + InitializeTextAndBoxesPreRecognition(it, info); + return; + } + info->text = ""; + const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE)); + int trailing_ws_idx = strlen(text.get()); // strip trailing space + while (trailing_ws_idx > 0 && + // isspace() only takes ASCII + isascii(text[trailing_ws_idx - 1]) && + isspace(text[trailing_ws_idx - 1])) + trailing_ws_idx--; + if (trailing_ws_idx > 0) { + int lspaces = info->pix_ldistance / info->average_interword_space; + for (int i = 0; i < lspaces; i++) + info->text += ' '; + for (int i = 0; i < trailing_ws_idx; i++) + info->text += text[i]; + } + + if (info->text.size() == 0) { + return; + } + + PAGE_RES_IT page_res_it = *it.PageResIt(); + GenericVector<WERD_RES *> werds; + WERD_RES *word_res = page_res_it.restart_row(); + ROW_RES *this_row = page_res_it.row(); + int num_leaders = 0; + int ltr = 0; + int rtl = 0; + do { + if (word_res && word_res->best_choice->unichar_string().length() > 0) { + werds.push_back(word_res); + ltr += word_res->AnyLtrCharsInWord() ? 1 : 0; + rtl += word_res->AnyRtlCharsInWord() ? 1 : 0; + if (word_res->word->flag(W_REP_CHAR)) num_leaders++; + } + word_res = page_res_it.forward(); + } while (page_res_it.row() == this_row); + info->ltr = ltr >= rtl; + info->has_leaders = num_leaders > 3; + info->num_words = werds.size(); + if (!werds.empty()) { + WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1]; + info->lword_text = lword->best_choice->unichar_string().c_str(); + info->rword_text = rword->best_choice->unichar_string().c_str(); + info->lword_box = lword->word->bounding_box(); + info->rword_box = rword->word->bounding_box(); + LeftWordAttributes(lword->uch_set, lword->best_choice, + info->lword_text, + &info->lword_indicates_list_item, + &info->lword_likely_starts_idea, + &info->lword_likely_ends_idea); + RightWordAttributes(rword->uch_set, rword->best_choice, + info->rword_text, + &info->rword_indicates_list_item, + &info->rword_likely_starts_idea, + &info->rword_likely_ends_idea); + } +} + +// This is called after rows have been identified and words are recognized. +// Much of this could be implemented before word recognition, but text helps +// to identify bulleted lists and gives good signals for sentence boundaries. +void DetectParagraphs(int debug_level, + bool after_text_recognition, + const MutableIterator *block_start, + std::vector<ParagraphModel *> *models) { + // Clear out any preconceived notions. + if (block_start->Empty(RIL_TEXTLINE)) { + return; + } + BLOCK *block = block_start->PageResIt()->block()->block; + block->para_list()->clear(); + bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText(); + + // Convert the Tesseract structures to RowInfos + // for the paragraph detection algorithm. + MutableIterator row(*block_start); + if (row.Empty(RIL_TEXTLINE)) + return; // end of input already. + + std::vector<RowInfo> row_infos; + do { + if (!row.PageResIt()->row()) + continue; // empty row. + row.PageResIt()->row()->row->set_para(nullptr); + row_infos.push_back(RowInfo()); + RowInfo &ri = row_infos.back(); + InitializeRowInfo(after_text_recognition, row, &ri); + } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && + row.Next(RIL_TEXTLINE)); + + // If we're called before text recognition, we might not have + // tight block bounding boxes, so trim by the minimum on each side. + if (!row_infos.empty()) { + int min_lmargin = row_infos[0].pix_ldistance; + int min_rmargin = row_infos[0].pix_rdistance; + for (int i = 1; i < row_infos.size(); i++) { + if (row_infos[i].pix_ldistance < min_lmargin) + min_lmargin = row_infos[i].pix_ldistance; + if (row_infos[i].pix_rdistance < min_rmargin) + min_rmargin = row_infos[i].pix_rdistance; + } + if (min_lmargin > 0 || min_rmargin > 0) { + for (int i = 0; i < row_infos.size(); i++) { + row_infos[i].pix_ldistance -= min_lmargin; + row_infos[i].pix_rdistance -= min_rmargin; + } + } + } + + // Run the paragraph detection algorithm. + GenericVector<PARA *> row_owners; + GenericVector<PARA *> the_paragraphs; + if (!is_image_block) { + DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), + models); + } else { + row_owners.init_to_size(row_infos.size(), nullptr); + CanonicalizeDetectionResults(&row_owners, block->para_list()); + } + + // Now stitch in the row_owners into the rows. + row = *block_start; + for (int i = 0; i < row_owners.size(); i++) { + while (!row.PageResIt()->row()) + row.Next(RIL_TEXTLINE); + row.PageResIt()->row()->row->set_para(row_owners[i]); + row.Next(RIL_TEXTLINE); + } +} + +} // namespace |