summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/src/classify')
-rw-r--r--tesseract/src/classify/adaptive.cpp498
-rw-r--r--tesseract/src/classify/adaptive.h128
-rw-r--r--tesseract/src/classify/adaptmatch.cpp2317
-rw-r--r--tesseract/src/classify/blobclass.cpp110
-rw-r--r--tesseract/src/classify/blobclass.h39
-rw-r--r--tesseract/src/classify/classify.cpp230
-rw-r--r--tesseract/src/classify/classify.h583
-rw-r--r--tesseract/src/classify/cluster.cpp2425
-rw-r--r--tesseract/src/classify/cluster.h138
-rw-r--r--tesseract/src/classify/clusttool.cpp319
-rw-r--r--tesseract/src/classify/clusttool.h43
-rw-r--r--tesseract/src/classify/cutoffs.cpp73
-rw-r--r--tesseract/src/classify/featdefs.cpp280
-rw-r--r--tesseract/src/classify/featdefs.h87
-rw-r--r--tesseract/src/classify/float2int.cpp109
-rw-r--r--tesseract/src/classify/float2int.h30
-rw-r--r--tesseract/src/classify/fpoint.cpp54
-rw-r--r--tesseract/src/classify/fpoint.h53
-rw-r--r--tesseract/src/classify/intfeaturespace.cpp124
-rw-r--r--tesseract/src/classify/intfeaturespace.h104
-rw-r--r--tesseract/src/classify/intfx.cpp488
-rw-r--r--tesseract/src/classify/intfx.h68
-rw-r--r--tesseract/src/classify/intmatcher.cpp1226
-rw-r--r--tesseract/src/classify/intmatcher.h165
-rw-r--r--tesseract/src/classify/intproto.cpp1743
-rw-r--r--tesseract/src/classify/intproto.h265
-rw-r--r--tesseract/src/classify/kdtree.cpp541
-rw-r--r--tesseract/src/classify/kdtree.h98
-rw-r--r--tesseract/src/classify/mf.cpp82
-rw-r--r--tesseract/src/classify/mf.h40
-rw-r--r--tesseract/src/classify/mfdefs.cpp46
-rw-r--r--tesseract/src/classify/mfdefs.h61
-rw-r--r--tesseract/src/classify/mfoutline.cpp446
-rw-r--r--tesseract/src/classify/mfoutline.h135
-rw-r--r--tesseract/src/classify/mfx.cpp152
-rw-r--r--tesseract/src/classify/mfx.h46
-rw-r--r--tesseract/src/classify/normfeat.cpp73
-rw-r--r--tesseract/src/classify/normfeat.h40
-rw-r--r--tesseract/src/classify/normmatch.cpp231
-rw-r--r--tesseract/src/classify/normmatch.h34
-rw-r--r--tesseract/src/classify/ocrfeatures.cpp190
-rw-r--r--tesseract/src/classify/ocrfeatures.h122
-rw-r--r--tesseract/src/classify/outfeat.cpp168
-rw-r--r--tesseract/src/classify/outfeat.h49
-rw-r--r--tesseract/src/classify/picofeat.cpp264
-rw-r--r--tesseract/src/classify/picofeat.h65
-rw-r--r--tesseract/src/classify/protos.cpp178
-rw-r--r--tesseract/src/classify/protos.h107
-rw-r--r--tesseract/src/classify/shapeclassifier.cpp234
-rw-r--r--tesseract/src/classify/shapeclassifier.h121
-rw-r--r--tesseract/src/classify/shapetable.cpp727
-rw-r--r--tesseract/src/classify/shapetable.h379
-rw-r--r--tesseract/src/classify/tessclassifier.cpp84
-rw-r--r--tesseract/src/classify/tessclassifier.h72
-rw-r--r--tesseract/src/classify/trainingsample.cpp339
-rw-r--r--tesseract/src/classify/trainingsample.h252
56 files changed, 17075 insertions, 0 deletions
diff --git a/tesseract/src/classify/adaptive.cpp b/tesseract/src/classify/adaptive.cpp
new file mode 100644
index 00000000..92f0d3da
--- /dev/null
+++ b/tesseract/src/classify/adaptive.cpp
@@ -0,0 +1,498 @@
+/******************************************************************************
+ ** Filename: adaptive.c
+ ** Purpose: Adaptive matcher.
+ ** Author: Dan Johnson
+ ** History: Fri Mar 8 10:00:21 1991, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "adaptive.h"
+
+#include "classify.h"
+
+#include <cassert>
+#include <cstdio>
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/**
 * This routine adds a new adapted class to an existing
 * set of adapted templates.
 *
 * @param Templates set of templates to add new class to
 * @param Class new class to add to templates; Templates takes ownership
 * (freed later by free_adapted_class via free_adapted_templates)
 * @param ClassId class id to associate with new class
 *
 * @note Globals: none
 */
void AddAdaptedClass(ADAPT_TEMPLATES Templates,
                     ADAPT_CLASS Class,
                     CLASS_ID ClassId) {
  INT_CLASS IntClass;

  assert (Templates != nullptr);
  assert (Class != nullptr);
  assert (LegalClassId (ClassId));
  assert (UnusedClassIdIn (Templates->Templates, ClassId));
  assert (Class->NumPermConfigs == 0);

  // Register a fresh integer class (capacity: 1 proto, 1 config) under the
  // same id so the integer templates and adapted classes stay in lockstep.
  IntClass = NewIntClass (1, 1);
  AddIntClass (Templates->Templates, ClassId, IntClass);

  assert (Templates->Class[ClassId] == nullptr);
  Templates->Class[ClassId] = Class;

} /* AddAdaptedClass */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine frees all memory consumed by a temporary
 * configuration.
 *
 * @param Config config to be freed; must be non-null and malloc'd
 * (as by NewTempConfig / ReadTempConfig)
 *
 * @note Globals: none
 */
void FreeTempConfig(TEMP_CONFIG Config) {
  assert (Config != nullptr);
  FreeBitVector (Config->Protos);  // the config owns its proto bit vector
  free(Config);
} /* FreeTempConfig */
+
/*---------------------------------------------------------------------------*/
// Frees one temporary proto. Takes void* so it can serve as the
// destroy_nodes() callback for a class's TempProtos list.
void FreeTempProto(void *arg) {
  auto proto = static_cast<PROTO>(arg);

  free(proto);
}
+
// Frees a permanent configuration, including its ambiguity list.
static void FreePermConfig(PERM_CONFIG Config) {
  assert(Config != nullptr);
  delete [] Config->Ambigs;  // allocated with new[] in ReadPermConfig
  free(Config);
}
+
/*---------------------------------------------------------------------------*/
/**
 * This operation allocates and initializes a new adapted
 * class data structure and returns a ptr to it.
 *
 * @return Ptr to new class data structure (caller/templates own it;
 * freed with free_adapted_class).
 *
 * @note Globals: none
 */
ADAPT_CLASS NewAdaptedClass() {
  ADAPT_CLASS Class;

  Class = static_cast<ADAPT_CLASS>(malloc (sizeof (ADAPT_CLASS_STRUCT)));
  Class->NumPermConfigs = 0;
  Class->MaxNumTimesSeen = 0;
  Class->TempProtos = NIL_LIST;

  // Permanence of protos/configs is tracked in bit vectors, all
  // initially cleared.
  Class->PermProtos = NewBitVector (MAX_NUM_PROTOS);
  Class->PermConfigs = NewBitVector (MAX_NUM_CONFIGS);
  zero_all_bits (Class->PermProtos, WordsInVectorOfSize (MAX_NUM_PROTOS));
  zero_all_bits (Class->PermConfigs, WordsInVectorOfSize (MAX_NUM_CONFIGS));

  // No configs exist yet: every config slot starts as a null temporary.
  for (int i = 0; i < MAX_NUM_CONFIGS; i++)
    TempConfigFor (Class, i) = nullptr;

  return (Class);

} /* NewAdaptedClass */
+
+
/*-------------------------------------------------------------------------*/
// Frees an adapted class and everything it owns: each config (permanent
// or temporary), both permanence bit vectors, and the temp proto list.
void free_adapted_class(ADAPT_CLASS adapt_class) {
  for (int i = 0; i < MAX_NUM_CONFIGS; i++) {
    // The PermConfigs bit decides which member of the Temp/Perm union is
    // live, so it must be consulted before choosing how to free the slot.
    if (ConfigIsPermanent (adapt_class, i)
        && PermConfigFor (adapt_class, i) != nullptr)
      FreePermConfig (PermConfigFor (adapt_class, i));
    else if (!ConfigIsPermanent (adapt_class, i)
             && TempConfigFor (adapt_class, i) != nullptr)
      FreeTempConfig (TempConfigFor (adapt_class, i));
  }
  FreeBitVector (adapt_class->PermProtos);
  FreeBitVector (adapt_class->PermConfigs);
  destroy_nodes (adapt_class->TempProtos, FreeTempProto);
  free(adapt_class);
}
+
+
/*---------------------------------------------------------------------------*/
/**
 * Allocates memory for adapted templates.
 *
 * @param InitFromUnicharset if true, add an empty class for
 * each char in unicharset to the newly created templates
 * @return Ptr to new adapted templates.
 *
 * @note Globals: none
 */
ADAPT_TEMPLATES Classify::NewAdaptedTemplates(bool InitFromUnicharset) {
  ADAPT_TEMPLATES Templates;

  Templates = static_cast<ADAPT_TEMPLATES>(malloc (sizeof (ADAPT_TEMPLATES_STRUCT)));

  Templates->Templates = NewIntTemplates ();
  Templates->NumPermClasses = 0;
  Templates->NumNonEmptyClasses = 0;

  /* Insert an empty class for each unichar id in unicharset */
  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
    Templates->Class[i] = nullptr;
    if (InitFromUnicharset && i < unicharset.size()) {
      AddAdaptedClass(Templates, NewAdaptedClass(), i);
    }
  }

  return (Templates);

} /* NewAdaptedTemplates */
+
+// Returns FontinfoId of the given config of the given adapted class.
+int Classify::GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId) {
+ return (ConfigIsPermanent(Class, ConfigId) ?
+ PermConfigFor(Class, ConfigId)->FontinfoId :
+ TempConfigFor(Class, ConfigId)->FontinfoId);
+}
+
/*----------------------------------------------------------------------------*/
// Frees a set of adapted templates: the per-class adaptive structures,
// the underlying integer templates, and the container itself.
// Safe to call with nullptr.
void free_adapted_templates(ADAPT_TEMPLATES templates) {

  if (templates != nullptr) {
    // Only the first NumClasses entries of Class[] were ever populated.
    for (int i = 0; i < (templates->Templates)->NumClasses; i++)
      free_adapted_class (templates->Class[i]);
    free_int_templates (templates->Templates);
    free(templates);
  }
}
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine allocates and returns a new temporary config.
 *
 * @param MaxProtoId max id of any proto in new config
 * @param FontinfoId font information from pre-trained templates
 * @return Ptr to new temp config (free with FreeTempConfig).
 *
 * @note Globals: none
 */
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId) {
  int NumProtos = MaxProtoId + 1;

  auto Config = static_cast<TEMP_CONFIG>(malloc(sizeof(TEMP_CONFIG_STRUCT)));
  Config->Protos = NewBitVector (NumProtos);

  Config->NumTimesSeen = 1;  // a config is created on its first sighting
  Config->MaxProtoId = MaxProtoId;
  // ProtoVectorSize is in uint32_t words, not protos.
  Config->ProtoVectorSize = WordsInVectorOfSize (NumProtos);
  zero_all_bits (Config->Protos, Config->ProtoVectorSize);
  Config->FontinfoId = FontinfoId;

  return (Config);

} /* NewTempConfig */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine allocates and returns a new temporary proto.
 *
 * @return Ptr to new temporary proto. The memory is uninitialized;
 * the caller must fill in ProtoId and Proto.
 *
 * @note Globals: none
 */
TEMP_PROTO NewTempProto() {
  return static_cast<TEMP_PROTO>(malloc(sizeof(TEMP_PROTO_STRUCT)));
} /* NewTempProto */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine prints a summary of the adapted templates
 * in Templates to File.
 *
 * Column legend: Id = class id, NC = number of configs,
 * NPC = number of permanent configs, NP = number of protos,
 * NPP = number of permanent protos (total minus temporary ones).
 *
 * @param File open text file to print Templates to
 * @param Templates adapted templates to print to File
 *
 * @note Globals: none
 */
void Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates) {
  INT_CLASS IClass;
  ADAPT_CLASS AClass;

  fprintf (File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
  fprintf (File, "Num classes = %d; Num permanent classes = %d\n\n",
           Templates->NumNonEmptyClasses, Templates->NumPermClasses);
  fprintf (File, " Id NC NPC NP NPP\n");
  fprintf (File, "------------------------\n");

  // Only classes with some adapted data are listed.
  for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
    IClass = Templates->Templates->Class[i];
    AClass = Templates->Class[i];
    if (!IsEmptyAdaptedClass (AClass)) {
      fprintf (File, "%5d %s %3d %3d %3d %3d\n",
               i, unicharset.id_to_unichar(i),
               IClass->NumConfigs, AClass->NumPermConfigs,
               IClass->NumProtos,
               IClass->NumProtos - count (AClass->TempProtos));
    }
  }
  fprintf (File, "\n");

} /* PrintAdaptedTemplates */
+
+
/*---------------------------------------------------------------------------*/
/**
 * Read an adapted class description from file and return
 * a ptr to the adapted class.
 *
 * The on-disk layout must mirror WriteAdaptedClass: raw
 * ADAPT_CLASS_STRUCT, permanent proto/config bit vectors, temp proto
 * count + protos, then one perm or temp config per config slot.
 *
 * @param fp open file to read adapted class from
 * @return Ptr to new adapted class.
 *
 * @note Globals: none
 */
ADAPT_CLASS ReadAdaptedClass(TFile *fp) {
  int NumTempProtos;
  int NumConfigs;
  int i;
  ADAPT_CLASS Class;

  /* first read high level adapted class structure */
  // This raw read also clobbers the embedded pointers; they are replaced
  // with freshly allocated ones below.
  Class = static_cast<ADAPT_CLASS>(malloc (sizeof (ADAPT_CLASS_STRUCT)));
  fp->FRead(Class, sizeof(ADAPT_CLASS_STRUCT), 1);

  /* then read in the definitions of the permanent protos and configs */
  Class->PermProtos = NewBitVector (MAX_NUM_PROTOS);
  Class->PermConfigs = NewBitVector (MAX_NUM_CONFIGS);
  fp->FRead(Class->PermProtos, sizeof(uint32_t),
            WordsInVectorOfSize(MAX_NUM_PROTOS));
  fp->FRead(Class->PermConfigs, sizeof(uint32_t),
            WordsInVectorOfSize(MAX_NUM_CONFIGS));

  /* then read in the list of temporary protos */
  fp->FRead(&NumTempProtos, sizeof(int), 1);
  Class->TempProtos = NIL_LIST;
  for (i = 0; i < NumTempProtos; i++) {
    auto TempProto = static_cast<TEMP_PROTO>(malloc(sizeof(TEMP_PROTO_STRUCT)));
    fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1);
    Class->TempProtos = push_last (Class->TempProtos, TempProto);
  }

  /* then read in the adapted configs */
  // The PermConfigs bits read above decide which union member each
  // config slot uses.
  fp->FRead(&NumConfigs, sizeof(int), 1);
  for (i = 0; i < NumConfigs; i++)
    if (test_bit (Class->PermConfigs, i))
      Class->Config[i].Perm = ReadPermConfig(fp);
    else
      Class->Config[i].Temp = ReadTempConfig(fp);

  return (Class);

} /* ReadAdaptedClass */
+
+
/*---------------------------------------------------------------------------*/
/**
 * Read a set of adapted templates from file and return
 * a ptr to the templates. Counterpart of WriteAdaptedTemplates.
 *
 * @param fp open text file to read adapted templates from
 * @return Ptr to adapted templates read from file.
 *
 * @note Globals: none
 */
ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(TFile *fp) {
  ADAPT_TEMPLATES Templates;

  /* first read the high level adaptive template struct */
  // The raw read clobbers the embedded pointers; both are reassigned below.
  Templates = static_cast<ADAPT_TEMPLATES>(malloc (sizeof (ADAPT_TEMPLATES_STRUCT)));
  fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);

  /* then read in the basic integer templates */
  Templates->Templates = ReadIntTemplates(fp);

  /* then read in the adaptive info for each class */
  for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
    Templates->Class[i] = ReadAdaptedClass(fp);
  }
  return (Templates);

} /* ReadAdaptedTemplates */
+
/*---------------------------------------------------------------------------*/
/**
 * Read a permanent configuration description from file
 * and return a ptr to it.
 *
 * @param fp open file to read permanent config from
 * @return Ptr to new permanent configuration description
 * (free with delete[] on Ambigs + free, as in FreePermConfig).
 *
 * @note Globals: none
 */
PERM_CONFIG ReadPermConfig(TFile *fp) {
  auto Config = static_cast<PERM_CONFIG>(malloc(sizeof(PERM_CONFIG_STRUCT)));
  uint8_t NumAmbigs;
  fp->FRead(&NumAmbigs, sizeof(NumAmbigs), 1);
  Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1];
  fp->FRead(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs);
  Config->Ambigs[NumAmbigs] = -1;  // -1 terminates the ambig id list
  fp->FRead(&(Config->FontinfoId), sizeof(int), 1);

  return (Config);

} /* ReadPermConfig */
+
+
/*---------------------------------------------------------------------------*/
/**
 * Read a temporary configuration description from file
 * and return a ptr to it.
 *
 * @param fp open file to read temporary config from
 * @return Ptr to new temporary configuration description.
 *
 * @note Globals: none
 */
TEMP_CONFIG ReadTempConfig(TFile *fp) {
  auto Config = static_cast<TEMP_CONFIG>(malloc(sizeof(TEMP_CONFIG_STRUCT)));
  fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1);

  // The Protos pointer just read from disk is stale; allocate a fresh
  // bit vector and read the actual proto bits into it.
  Config->Protos = NewBitVector (Config->ProtoVectorSize * BITSINLONG);
  fp->FRead(Config->Protos, sizeof(uint32_t), Config->ProtoVectorSize);

  return (Config);

} /* ReadTempConfig */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine writes a binary representation of Class
 * to File. The layout written here must stay in sync with
 * ReadAdaptedClass.
 *
 * @param File open file to write Class to
 * @param Class adapted class to write to File
 * @param NumConfigs number of configs in Class
 *
 * @note Globals: none
 */
void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs) {
  int NumTempProtos;
  LIST TempProtos;
  int i;

  /* first write high level adapted class structure */
  // Raw struct dump; the embedded pointers are meaningless on disk and
  // are reconstructed by the reader.
  fwrite(Class, sizeof(ADAPT_CLASS_STRUCT), 1, File);

  /* then write out the definitions of the permanent protos and configs */
  fwrite(Class->PermProtos, sizeof(uint32_t),
         WordsInVectorOfSize(MAX_NUM_PROTOS), File);
  fwrite(Class->PermConfigs, sizeof(uint32_t),
         WordsInVectorOfSize(MAX_NUM_CONFIGS), File);

  /* then write out the list of temporary protos */
  NumTempProtos = count (Class->TempProtos);
  fwrite(&NumTempProtos, sizeof(int), 1, File);
  TempProtos = Class->TempProtos;
  iterate (TempProtos) {
    void* proto = first_node(TempProtos);
    fwrite(proto, sizeof(TEMP_PROTO_STRUCT), 1, File);
  }

  /* then write out the adapted configs */
  // PermConfigs bit selects which union member each slot holds.
  fwrite(&NumConfigs, sizeof(int), 1, File);
  for (i = 0; i < NumConfigs; i++)
    if (test_bit (Class->PermConfigs, i))
      WritePermConfig (File, Class->Config[i].Perm);
    else
      WriteTempConfig (File, Class->Config[i].Temp);

} /* WriteAdaptedClass */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine saves Templates to File in a binary format.
 * Counterpart of ReadAdaptedTemplates.
 *
 * @param File open text file to write Templates to
 * @param Templates set of adapted templates to write to File
 *
 * @note Globals: none
 */
void Classify::WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates) {
  int i;

  /* first write the high level adaptive template struct */
  fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);

  /* then write out the basic integer templates */
  WriteIntTemplates (File, Templates->Templates, unicharset);

  /* then write out the adaptive info for each class */
  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
    WriteAdaptedClass (File, Templates->Class[i],
                       Templates->Templates->Class[i]->NumConfigs);
  }
} /* WriteAdaptedTemplates */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine writes a binary representation of a
 * permanent configuration to File.
 *
 * @param File open file to write Config to
 * @param Config permanent config to write to File
 *
 * @note Globals: none
 */
void WritePermConfig(FILE *File, PERM_CONFIG Config) {
  uint8_t NumAmbigs = 0;

  assert (Config != nullptr);
  // Ambigs is terminated by a non-positive id (ReadPermConfig writes -1);
  // count only the real entries so the reader can size its array.
  while (Config->Ambigs[NumAmbigs] > 0) ++NumAmbigs;

  fwrite(&NumAmbigs, sizeof(uint8_t), 1, File);
  fwrite(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs, File);
  fwrite(&(Config->FontinfoId), sizeof(int), 1, File);
} /* WritePermConfig */
+
+
/*---------------------------------------------------------------------------*/
/**
 * This routine writes a binary representation of a
 * temporary configuration to File.
 *
 * @param File open file to write Config to
 * @param Config temporary config to write to File
 *
 * @note Globals: none
 */
void WriteTempConfig(FILE *File, TEMP_CONFIG Config) {
  assert (Config != nullptr);

  // The struct is dumped raw (its Protos pointer is ignored on read),
  // followed by the proto bit vector itself.
  fwrite(Config, sizeof (TEMP_CONFIG_STRUCT), 1, File);
  fwrite(Config->Protos, sizeof (uint32_t), Config->ProtoVectorSize, File);

} /* WriteTempConfig */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/adaptive.h b/tesseract/src/classify/adaptive.h
new file mode 100644
index 00000000..b1bf6a2e
--- /dev/null
+++ b/tesseract/src/classify/adaptive.h
@@ -0,0 +1,128 @@
+/******************************************************************************
+ ** Filename: adaptive.h
+ ** Purpose: Interface to adaptive matcher.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef ADAPTIVE_H
+#define ADAPTIVE_H
+
+#include "intproto.h"
+#include "oldlist.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
// A temporary proto: the id of an integer proto plus its floating-point
// description.
typedef struct {
  uint16_t ProtoId;
  PROTO_STRUCT Proto;
} TEMP_PROTO_STRUCT;
using TEMP_PROTO = TEMP_PROTO_STRUCT*;

// A not-yet-permanent adapted config: which protos it uses and how often
// it has been seen.
typedef struct {
  uint8_t NumTimesSeen;
  uint8_t ProtoVectorSize;  // size of Protos in uint32_t words
  PROTO_ID MaxProtoId;
  BIT_VECTOR Protos;        // bit per proto id used by this config
  int FontinfoId;  // font information inferred from pre-trained templates
} TEMP_CONFIG_STRUCT;
using TEMP_CONFIG = TEMP_CONFIG_STRUCT*;

// A permanent adapted config.
typedef struct {
  UNICHAR_ID* Ambigs;  // ambiguous unichar ids, terminated by a negative id
  int FontinfoId;  // font information inferred from pre-trained templates
} PERM_CONFIG_STRUCT;
using PERM_CONFIG = PERM_CONFIG_STRUCT*;

// A config slot is either temporary or permanent; which member is live is
// recorded in the owning class's PermConfigs bit vector.
typedef union {
  TEMP_CONFIG Temp;
  PERM_CONFIG Perm;
} ADAPTED_CONFIG;

// Adaptive data for one class (character).
typedef struct {
  uint8_t NumPermConfigs;
  uint8_t MaxNumTimesSeen; // maximum number of times any TEMP_CONFIG was seen
                           // (cut at matcher_min_examples_for_prototyping)
  BIT_VECTOR PermProtos;   // bit per proto id: is it permanent?
  BIT_VECTOR PermConfigs;  // bit per config id: is it permanent?
  LIST TempProtos;         // list of TEMP_PROTO
  ADAPTED_CONFIG Config[MAX_NUM_CONFIGS];
} ADAPT_CLASS_STRUCT;
using ADAPT_CLASS = ADAPT_CLASS_STRUCT*;

// A full set of adapted templates: the integer templates plus per-class
// adaptive bookkeeping.
typedef struct {
  INT_TEMPLATES Templates;
  int NumNonEmptyClasses;
  uint8_t NumPermClasses;
  ADAPT_CLASS Class[MAX_NUM_CLASSES];
} ADAPT_TEMPLATES_STRUCT;
using ADAPT_TEMPLATES = ADAPT_TEMPLATES_STRUCT*;
+
/*----------------------------------------------------------------------------
          Public Function Prototypes
----------------------------------------------------------------------------*/
// Number of classes in a template set that carry adapted data.
#define NumNonEmptyClassesIn(Template) ((Template)->NumNonEmptyClasses)

// True if the class has no permanent configs and no temporary protos.
#define IsEmptyAdaptedClass(Class) \
  ((Class)->NumPermConfigs == 0 && (Class)->TempProtos == NIL_LIST)

// Permanence bits: these decide which member of the ADAPTED_CONFIG union
// is live for a given config id.
#define ConfigIsPermanent(Class, ConfigId) \
  (test_bit((Class)->PermConfigs, ConfigId))

#define MakeConfigPermanent(Class, ConfigId) \
  (SET_BIT((Class)->PermConfigs, ConfigId))

#define MakeProtoPermanent(Class, ProtoId) \
  (SET_BIT((Class)->PermProtos, ProtoId))

// Accessors for the temporary/permanent views of a config slot.
#define TempConfigFor(Class, ConfigId) ((Class)->Config[ConfigId].Temp)

#define PermConfigFor(Class, ConfigId) ((Class)->Config[ConfigId].Perm)

// Bumps the seen-count of a temporary config.
#define IncreaseConfidence(TempConfig) ((TempConfig)->NumTimesSeen++)
+
// Free functions implemented in adaptive.cpp; see that file for the
// detailed contracts of each routine.
void AddAdaptedClass(ADAPT_TEMPLATES Templates, ADAPT_CLASS Class,
                     CLASS_ID ClassId);

void FreeTempProto(void* arg);

void FreeTempConfig(TEMP_CONFIG Config);

ADAPT_CLASS NewAdaptedClass();

void free_adapted_class(ADAPT_CLASS adapt_class);

void free_adapted_templates(ADAPT_TEMPLATES templates);

TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId);

TEMP_PROTO NewTempProto();

ADAPT_CLASS ReadAdaptedClass(tesseract::TFile* File);

PERM_CONFIG ReadPermConfig(tesseract::TFile* File);

TEMP_CONFIG ReadTempConfig(tesseract::TFile* File);

void WriteAdaptedClass(FILE* File, ADAPT_CLASS Class, int NumConfigs);

void WritePermConfig(FILE* File, PERM_CONFIG Config);

void WriteTempConfig(FILE* File, TEMP_CONFIG Config);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/adaptmatch.cpp b/tesseract/src/classify/adaptmatch.cpp
new file mode 100644
index 00000000..65254b8a
--- /dev/null
+++ b/tesseract/src/classify/adaptmatch.cpp
@@ -0,0 +1,2317 @@
+/******************************************************************************
+ ** Filename: adaptmatch.cpp
+ ** Purpose: High level adaptive matcher.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+/*-----------------------------------------------------------------------------
+ Include Files and Type Defines
+-----------------------------------------------------------------------------*/
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "adaptive.h" // for ADAPT_CLASS, free_adapted_templates
+#include "ambigs.h" // for UnicharIdVector, UnicharAmbigs
+#include "bitvec.h" // for FreeBitVector, NewBitVector, BIT_VECTOR
+#include "blobs.h" // for TBLOB, TWERD
+#include "classify.h" // for Classify, CST_FRAGMENT, CST_WHOLE
+#include "dict.h" // for Dict
+#include "errcode.h" // for ASSERT_HOST
+#include "featdefs.h" // for CharNormDesc
+#include "float2int.h" // for BASELINE_Y_SHIFT
+#include "fontinfo.h" // for ScoredFont, FontSet
+#include "intfx.h" // for BlobToTrainingSample, INT_FX_RESULT_S...
+#include "intmatcher.h" // for CP_RESULT_STRUCT, IntegerMatcher
+#include "intproto.h" // for INT_FEATURE_STRUCT, (anonymous), Clas...
+#include "matchdefs.h" // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO
+#include "mfoutline.h" // for baseline, character, MF_SCALE_FACTOR
+#include "normalis.h" // for DENORM, kBlnBaselineOffset, kBlnXHeight
+#include "normfeat.h" // for ActualOutlineLength, CharNormLength
+#include "ocrfeatures.h" // for FEATURE_STRUCT, FreeFeatureSet, FEATURE
+#include "oldlist.h" // for push, delete_d
+#include "outfeat.h" // for OutlineFeatDir, OutlineFeatLength
+#include "pageres.h" // for WERD_RES
+#include "params.h" // for IntParam, BoolParam, DoubleParam, Str...
+#include "picofeat.h" // for PicoFeatDir, PicoFeatX, PicoFeatY
+#include "protos.h" // for PROTO_STRUCT, FillABC, PROTO
+#include "ratngs.h" // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...
+#include "rect.h" // for TBOX
+#include "scrollview.h" // for ScrollView, ScrollView::BROWN, Scroll...
+#include "seam.h" // for SEAM
+#include "shapeclassifier.h" // for ShapeClassifier
+#include "shapetable.h" // for UnicharRating, ShapeTable, Shape, Uni...
+#include "tessclassifier.h" // for TessClassifier
+#include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP
+#include "tprintf.h" // for tprintf
+#include "trainingsample.h" // for TrainingSample
+#include "unicharset.h" // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE
+#include "unicity_table.h" // for UnicityTable
+
+#include "genericvector.h" // for GenericVector
+#include "serialis.h" // for TFile
+#include "strngs.h" // for STRING
+#include "helpers.h" // for IntCastRounded, ClipToRange
+#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
+
+#include <algorithm> // for max, min
+#include <cassert> // for assert
+#include <cmath> // for fabs
+#include <cstdint> // for INT32_MAX, UINT8_MAX
+#include <cstdio> // for fflush, fclose, fopen, stdout, FILE
+#include <cstdlib> // for malloc
+#include <cstring> // for strstr, memset, strcmp
+
+namespace tesseract {
+
+#define ADAPT_TEMPLATE_SUFFIX ".a"
+
+#define MAX_MATCHES 10
+#define UNLIKELY_NUM_FEAT 200
+#define NO_DEBUG 0
+#define MAX_ADAPTABLE_WERD_SIZE 40
+
+#define ADAPTABLE_WERD_ADJUSTMENT (0.05)
+
+#define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
+
+#define WORST_POSSIBLE_RATING (0.0f)
+
// Collected results of one adaptive-classifier pass over a blob.
struct ADAPT_RESULTS {
  int32_t BlobLength;
  bool HasNonfragment;
  UNICHAR_ID best_unichar_id;
  int best_match_index;
  float best_rating;
  std::vector<UnicharRating> match;
  std::vector<CP_RESULT_STRUCT> CPResults;

  /// Initializes data members to the default values. Sets the initial
  /// rating of each class to be the worst possible rating
  /// (WORST_POSSIBLE_RATING = 0.0; higher ratings are better here).
  inline void Initialize() {
    BlobLength = INT32_MAX;
    HasNonfragment = false;
    ComputeBest();
  }
  // Computes best_unichar_id, best_match_index and best_rating.
  // The best match is the one with the HIGHEST rating.
  void ComputeBest() {
    best_unichar_id = INVALID_UNICHAR_ID;
    best_match_index = -1;
    best_rating = WORST_POSSIBLE_RATING;
    for (int i = 0; i < match.size(); ++i) {
      if (match[i].rating > best_rating) {
        best_rating = match[i].rating;
        best_unichar_id = match[i].unichar_id;
        best_match_index = i;
      }
    }
  }
};
+
// Identifies one config of one class within a set of adapted templates.
struct PROTO_KEY {
  ADAPT_TEMPLATES Templates;
  CLASS_ID ClassId;
  int ConfigId;
};
+
+// Sort function to sort ratings appropriately by descending rating.
+static bool SortDescendingRating(const UnicharRating &a, const UnicharRating &b) {
+ if (a.rating != b.rating) {
+ return a.rating > b.rating;
+ } else {
+ return a.unichar_id < b.unichar_id;
+ }
+}
+
+/*-----------------------------------------------------------------------------
+ Private Macros
+-----------------------------------------------------------------------------*/
// Returns true if a match with the given confidence is only marginal,
// i.e. its rating (1 - confidence) is worse than the great threshold.
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
  const float rating = 1.0f - confidence;
  return rating > matcher_great_threshold;
}
+
+/*-----------------------------------------------------------------------------
+ Private Function Prototypes
+-----------------------------------------------------------------------------*/
+// Returns the index of the given id in results, if present, or the size of the
+// vector (index it will go at) if not present.
+static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
+ for (int i = 0; i < results.match.size(); i++) {
+ if (results.match[i].unichar_id == id)
+ return i;
+ }
+ return results.match.size();
+}
+
+// Returns the current rating for a unichar id if we have rated it, defaulting
+// to WORST_POSSIBLE_RATING.
+static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
+ int index = FindScoredUnichar(id, results);
+ if (index >= results.match.size()) return WORST_POSSIBLE_RATING;
+ return results.match[index].rating;
+}
+
+void InitMatcherRatings(float *Rating);
+
+int MakeTempProtoPerm(void *item1, void *item2);
+
+void SetAdaptiveThreshold(float Threshold);
+
+
+/*-----------------------------------------------------------------------------
+ Public Code
+-----------------------------------------------------------------------------*/
/**
 * This routine calls the adaptive matcher
 * which returns (in an array) the class id of each
 * class matched.
 *
 * It also returns the number of classes matched.
 * For each class matched it places the best rating
 * found for that class into the Ratings array.
 *
 * Bad matches are then removed so that they don't
 * need to be sorted. The remaining good matches are
 * then sorted and converted to choices.
 *
 * This routine also performs some simple speckle
 * filtering.
 *
 * @param Blob blob to be classified
 * @param[out] Choices List of choices found by adaptive matcher.
 * filled on return with the choices found by the
 * class pruner and the ratings therefrom. Also
 * contains the detailed results of the integer matcher.
 *
 */
void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
  assert(Choices != nullptr);
  auto *Results = new ADAPT_RESULTS;
  Results->Initialize();

  ASSERT_HOST(AdaptedTemplates != nullptr);

  DoAdaptiveMatch(Blob, Results);

  RemoveBadMatches(Results);
  std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
  RemoveExtraPuncs(Results);
  // Sorting/removal invalidated the cached best indices; recompute.
  Results->ComputeBest();
  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
                          Choices);

  // TODO(rays) Move to before ConvertMatchesToChoices!
  if (LargeSpeckle(*Blob) || Choices->length() == 0)
    AddLargeSpeckleTo(Results->BlobLength, Choices);

  if (matcher_debug_level >= 1) {
    tprintf("AD Matches = ");
    PrintAdaptiveMatchResults(*Results);
  }

#ifndef GRAPHICS_DISABLED
  if (classify_enable_adaptive_debugger)
    DebugAdaptiveClassifier(Blob, Results);
#endif

  delete Results;
} /* AdaptiveClassifier */
+
#ifndef GRAPHICS_DISABLED

// If *win is nullptr, sets it to a new ScrollView() object with title msg.
// Clears the window, draws the baseline and x-height reference lines, and
// zooms to the given word bounding box.
void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
                                  int y_offset, const TBOX &wbox) {
  const int kSampleSpaceWidth = 500;
  if (*win == nullptr) {
    *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
                          kSampleSpaceWidth * 2, 200, true);
  }
  (*win)->Clear();
  (*win)->Pen(64, 64, 64);  // gray reference lines
  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
               kSampleSpaceWidth, kBlnBaselineOffset);
  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
               kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
  (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
                          wbox.right(), wbox.bottom());
}

#endif  // !GRAPHICS_DISABLED
+
+// Learns the given word using its chopped_word, seam_array, denorm,
+// box_word, best_state, and correct_text to learn both correctly and
+// incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
+// is called and the data will be saved in an internal buffer.
+// Otherwise AdaptToBlob is called for adaption within a document.
+void Classify::LearnWord(const char* fontname, WERD_RES* word) {
+  int word_len = word->correct_text.size();
+  if (word_len == 0) return;
+
+  // Per-character adaption thresholds; allocated only in adaption mode
+  // (fontname == nullptr) and freed at the end of this routine.
+  float* thresholds = nullptr;
+  if (fontname == nullptr) {
+    // Adaption mode.
+    if (!EnableLearning || word->best_choice == nullptr)
+      return; // Can't or won't adapt.
+
+    if (classify_learning_debug_level >= 1)
+      tprintf("\n\nAdapting to word = %s\n",
+              word->best_choice->debug_string().c_str());
+    thresholds = new float[word_len];
+    word->ComputeAdaptionThresholds(certainty_scale,
+                                    matcher_perfect_threshold,
+                                    matcher_good_threshold,
+                                    matcher_rating_margin, thresholds);
+  }
+  // Index of the first chopped blob belonging to the current character.
+  int start_blob = 0;
+
+  #ifndef GRAPHICS_DISABLED
+  if (classify_debug_character_fragments) {
+    if (learn_fragmented_word_debug_win_ != nullptr) {
+      learn_fragmented_word_debug_win_->Wait();
+    }
+    RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
+                       word->chopped_word->bounding_box());
+    RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
+                       word->chopped_word->bounding_box());
+    word->chopped_word->plot(learn_fragmented_word_debug_win_);
+    ScrollView::Update();
+  }
+  #endif  // !GRAPHICS_DISABLED
+
+  // Walk the characters; character ch occupies best_state[ch] chopped blobs.
+  for (int ch = 0; ch < word_len; ++ch) {
+    if (classify_debug_character_fragments) {
+      tprintf("\nLearning %s\n", word->correct_text[ch].c_str());
+    }
+    if (word->correct_text[ch].length() > 0) {
+      float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
+
+      // Learn the character as a whole, across all of its blobs.
+      LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
+                  CST_WHOLE, word->correct_text[ch].c_str(), word);
+
+      if (word->best_state[ch] > 1 && !disable_character_fragments) {
+        // Check that the character breaks into meaningful fragments
+        // that each match a whole character with at least
+        // classify_character_fragments_garbage_certainty_threshold
+        bool garbage = false;
+        int frag;
+        for (frag = 0; frag < word->best_state[ch]; ++frag) {
+          TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
+          // A negative threshold means use the generic garbage detector.
+          if (classify_character_fragments_garbage_certainty_threshold < 0) {
+            garbage |= LooksLikeGarbage(frag_blob);
+          }
+        }
+        // Learn the fragments.
+        if (!garbage) {
+          bool pieces_all_natural = word->PiecesAllNatural(start_blob,
+              word->best_state[ch]);
+          if (pieces_all_natural || !prioritize_division) {
+            for (frag = 0; frag < word->best_state[ch]; ++frag) {
+              std::vector<STRING> tokens;
+              word->correct_text[ch].split(' ', &tokens);
+
+              // Rewrite the first token (the unichar) as its fragment
+              // form: piece frag of best_state[ch].
+              tokens[0] = CHAR_FRAGMENT::to_string(
+                  tokens[0].c_str(), frag, word->best_state[ch],
+                  pieces_all_natural);
+
+              // Re-join the tokens with single spaces.
+              STRING full_string;
+              for (int i = 0; i < tokens.size(); i++) {
+                full_string += tokens[i];
+                if (i != tokens.size() - 1)
+                  full_string += ' ';
+              }
+              LearnPieces(fontname, start_blob + frag, 1, threshold,
+                          CST_FRAGMENT, full_string.c_str(), word);
+            }
+          }
+        }
+      }
+
+      // TODO(rays): re-enable this part of the code when we switch to the
+      // new classifier that needs to see examples of garbage.
+      /*
+      if (word->best_state[ch] > 1) {
+        // If the next blob is good, make junk with the rightmost fragment.
+        if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
+          LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
+                      word->best_state[ch + 1] + 1,
+                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
+        }
+        // If the previous blob is good, make junk with the leftmost fragment.
+        if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
+          LearnPieces(fontname, start_blob - word->best_state[ch - 1],
+                      word->best_state[ch - 1] + 1,
+                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
+        }
+      }
+      // If the next blob is good, make a join with it.
+      if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
+        STRING joined_text = word->correct_text[ch];
+        joined_text += word->correct_text[ch + 1];
+        LearnPieces(fontname, start_blob,
+                    word->best_state[ch] + word->best_state[ch + 1],
+                    threshold, CST_NGRAM, joined_text.c_str(), word);
+      }
+      */
+    }
+    start_blob += word->best_state[ch];
+  }
+  // thresholds is nullptr in static-training mode; delete[] nullptr is a no-op.
+  delete [] thresholds;
+}  // LearnWord.
+
+// Builds a blob of length fragments, from the word, starting at start,
+// and then learns it, as having the given correct_text.
+// If fontname is not nullptr, then LearnBlob is called and the data will be
+// saved in an internal buffer for static training.
+// Otherwise AdaptToBlob is called for adaption within a document.
+// threshold is a magic number required by AdaptToChar and generated by
+// ComputeAdaptionThresholds.
+// Although it can be partly inferred from the string, segmentation is
+// provided to explicitly clarify the character segmentation.
+void Classify::LearnPieces(const char* fontname, int start, int length,
+                           float threshold, CharSegmentationType segmentation,
+                           const char* correct_text, WERD_RES* word) {
+  // TODO(daria) Remove/modify this if/when we want
+  // to train and/or adapt to n-grams.
+  if (segmentation != CST_WHOLE &&
+      (segmentation != CST_FRAGMENT || disable_character_fragments))
+    return;
+
+  // Temporarily join blobs [start, start + length) into a single blob;
+  // the join is undone by BreakPieces at the bottom of this routine.
+  if (length > 1) {
+    SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
+                     start + length - 1);
+  }
+  TBLOB* blob = word->chopped_word->blobs[start];
+  // Rotate the blob if needed for classification.
+  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
+  if (rotated_blob == nullptr)
+    rotated_blob = blob;
+  // If a rotated copy was made, this routine owns it and deletes it below.
+
+  #ifndef GRAPHICS_DISABLED
+  // Draw debug windows showing the blob that is being learned if needed.
+  if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) {
+    RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
+                       word->chopped_word->bounding_box());
+    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
+    learn_debug_win_->Update();
+    learn_debug_win_->Wait();
+  }
+  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
+    ASSERT_HOST(learn_fragments_debug_win_ != nullptr);  // set up in LearnWord
+    blob->plot(learn_fragments_debug_win_,
+               ScrollView::BLUE, ScrollView::BROWN);
+    learn_fragments_debug_win_->Update();
+  }
+  #endif  // !GRAPHICS_DISABLED
+
+  if (fontname != nullptr) {
+    // Static-training path: extract features and buffer them via LearnBlob.
+    classify_norm_method.set_value(character); // force char norm spc 30/11/93
+    tess_bn_matching.set_value(false);    // turn it off
+    tess_cn_matching.set_value(false);
+    DENORM bl_denorm, cn_denorm;
+    INT_FX_RESULT_STRUCT fx_info;
+    SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
+                     &bl_denorm, &cn_denorm, &fx_info);
+    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
+  } else if (unicharset.contains_unichar(correct_text)) {
+    // Adaption path: adapt the primary (and backup, if any) templates.
+    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
+    int font_id = word->fontinfo != nullptr
+                ? fontinfo_table_.get_id(*word->fontinfo)
+                : 0;
+    if (classify_learning_debug_level >= 1)
+      tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
+              unicharset.id_to_unichar(class_id), threshold, font_id);
+    // If filename is not nullptr we are doing recognition
+    // (as opposed to training), so we must have already set word fonts.
+    AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
+    if (BackupAdaptedTemplates != nullptr) {
+      // Adapt the backup templates too. They will be used if the primary gets
+      // too full.
+      AdaptToChar(rotated_blob, class_id, font_id, threshold,
+                  BackupAdaptedTemplates);
+    }
+  } else if (classify_debug_level >= 1) {
+    tprintf("Can't adapt to %s not in unicharset\n", correct_text);
+  }
+  if (rotated_blob != blob) {
+    delete rotated_blob;
+  }
+
+  // Undo the temporary join, restoring the original chopped blobs.
+  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
+                    start + length - 1);
+}  // LearnPieces.
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine performs cleanup operations
+ * on the adaptive classifier. It should be called
+ * before the program is terminated. Its main function
+ * is to save the adapted templates to a file.
+ *
+ * Globals:
+ * - #AdaptedTemplates current set of adapted templates
+ * - #classify_save_adapted_templates true if templates should be saved
+ * - #classify_enable_adaptive_matcher true if adaptive matcher is enabled
+ */
+void Classify::EndAdaptiveClassifier() {
+  STRING Filename;
+  FILE *File;
+
+  // Optionally persist the adapted templates for a later run.
+  if (AdaptedTemplates != nullptr &&
+      classify_enable_adaptive_matcher && classify_save_adapted_templates) {
+    Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
+    File = fopen (Filename.c_str(), "wb");
+    if (File == nullptr)
+      tprintf ("Unable to save adapted templates to %s!\n", Filename.c_str());
+    else {
+      tprintf ("\nSaving adapted templates to %s ...", Filename.c_str());
+      fflush(stdout);
+      WriteAdaptedTemplates(File, AdaptedTemplates);
+      tprintf ("\n");
+      fclose(File);
+    }
+  }
+
+  // Release all template data (adapted, backup, and pre-trained).
+  if (AdaptedTemplates != nullptr) {
+    free_adapted_templates(AdaptedTemplates);
+    AdaptedTemplates = nullptr;
+  }
+  if (BackupAdaptedTemplates != nullptr) {
+    free_adapted_templates(BackupAdaptedTemplates);
+    BackupAdaptedTemplates = nullptr;
+  }
+
+  if (PreTrainedTemplates != nullptr) {
+    free_int_templates(PreTrainedTemplates);
+    PreTrainedTemplates = nullptr;
+  }
+  getDict().EndDangerousAmbigs();
+  FreeNormProtos();
+  // The four bit-vector masks are allocated together in
+  // InitAdaptiveClassifier, so AllProtosOn stands in for all of them.
+  if (AllProtosOn != nullptr) {
+    FreeBitVector(AllProtosOn);
+    FreeBitVector(AllConfigsOn);
+    FreeBitVector(AllConfigsOff);
+    FreeBitVector(TempProtoMask);
+    AllProtosOn = nullptr;
+    AllConfigsOn = nullptr;
+    AllConfigsOff = nullptr;
+    TempProtoMask = nullptr;
+  }
+  delete shape_table_;
+  shape_table_ = nullptr;
+  delete static_classifier_;
+  static_classifier_ = nullptr;
+} /* EndAdaptiveClassifier */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine reads in the training
+ * information needed by the adaptive classifier
+ * and saves it into global variables.
+ * Parameters:
+ * mgr TessdataManager used to load the pre-trained
+ * templates (inttemp, normproto and pffmtable components).
+ * If it is nullptr, or there is no language_data_path_prefix,
+ * the classifier will be adaptive only.
+ * Globals:
+ * BuiltInTemplatesFile file to get built-in temps from
+ * BuiltInCutoffsFile file to get avg. feat per class from
+ * classify_use_pre_adapted_templates
+ * enables use of pre-adapted templates
+ */
+void Classify::InitAdaptiveClassifier(TessdataManager* mgr) {
+  if (!classify_enable_adaptive_matcher)
+    return;
+  if (AllProtosOn != nullptr)
+    EndAdaptiveClassifier();  // Don't leak with multiple inits.
+
+  // If there is no language_data_path_prefix, the classifier will be
+  // adaptive only.
+  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
+    TFile fp;
+    ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
+    PreTrainedTemplates = ReadIntTemplates(&fp);
+
+    // The shape table is optional; a corrupt one is discarded.
+    if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
+      shape_table_ = new ShapeTable(unicharset);
+      if (!shape_table_->DeSerialize(&fp)) {
+        tprintf("Error loading shape table!\n");
+        delete shape_table_;
+        shape_table_ = nullptr;
+      }
+    }
+
+    ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
+    ReadNewCutoffs(&fp, CharNormCutoffs);
+
+    ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
+    NormProtos = ReadNormProtos(&fp);
+    static_classifier_ = new TessClassifier(false, this);
+  }
+
+  InitIntegerFX();
+
+  // Shared bit-vector masks used throughout the adaptive matcher;
+  // freed together in EndAdaptiveClassifier.
+  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
+  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
+  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
+  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
+  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
+  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
+  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
+
+  // Clear the baseline cutoffs; they are set as classes are adapted or
+  // when pre-adapted templates are loaded below.
+  for (uint16_t& BaselineCutoff : BaselineCutoffs) {
+    BaselineCutoff = 0;
+  }
+
+  if (classify_use_pre_adapted_templates) {
+    TFile fp;
+    STRING Filename;
+
+    Filename = imagefile;
+    Filename += ADAPT_TEMPLATE_SUFFIX;
+    if (!fp.Open(Filename.c_str(), nullptr)) {
+      AdaptedTemplates = NewAdaptedTemplates(true);
+    } else {
+      tprintf("\nReading pre-adapted templates from %s ...\n",
+              Filename.c_str());
+      fflush(stdout);
+      AdaptedTemplates = ReadAdaptedTemplates(&fp);
+      tprintf("\n");
+      PrintAdaptedTemplates(stdout, AdaptedTemplates);
+
+      for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
+        BaselineCutoffs[i] = CharNormCutoffs[i];
+      }
+    }
+  } else {
+    if (AdaptedTemplates != nullptr)
+      free_adapted_templates(AdaptedTemplates);
+    AdaptedTemplates = NewAdaptedTemplates(true);
+  }
+} /* InitAdaptiveClassifier */
+
+// Discards all adapted templates -- backup as well as primary -- and
+// restarts with a fresh, empty primary set. Also clears the failure count.
+void Classify::ResetAdaptiveClassifierInternal() {
+  if (classify_learning_debug_level > 0) {
+    tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
+            NumAdaptationsFailed);
+  }
+  if (BackupAdaptedTemplates != nullptr) {
+    free_adapted_templates(BackupAdaptedTemplates);
+    BackupAdaptedTemplates = nullptr;
+  }
+  free_adapted_templates(AdaptedTemplates);
+  AdaptedTemplates = NewAdaptedTemplates(true);
+  NumAdaptationsFailed = 0;
+}
+
+// Promotes the backup adapted templates (if any) to be the active set.
+// When no backup exists, the main adaptive classifier is reset instead,
+// because it is full.
+void Classify::SwitchAdaptiveClassifier() {
+  if (BackupAdaptedTemplates != nullptr) {
+    if (classify_learning_debug_level > 0) {
+      tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
+              NumAdaptationsFailed);
+    }
+    free_adapted_templates(AdaptedTemplates);
+    AdaptedTemplates = BackupAdaptedTemplates;
+    BackupAdaptedTemplates = nullptr;
+    NumAdaptationsFailed = 0;
+  } else {
+    ResetAdaptiveClassifierInternal();
+  }
+}
+
+// Replaces any existing backup adapted templates with a fresh, empty set.
+void Classify::StartBackupAdaptiveClassifier() {
+  if (BackupAdaptedTemplates != nullptr) {
+    free_adapted_templates(BackupAdaptedTemplates);
+  }
+  BackupAdaptedTemplates = NewAdaptedTemplates(true);
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Prepares the adaptive matcher for the start of the first pass:
+ * learning is enabled unless it is disabled for the whole program.
+ *
+ * @note this is somewhat redundant: it simply copies the global
+ * classify_enable_learning setting into EnableLearning, making explicit
+ * that learning on the first pass is controlled directly by that global.
+ *
+ * Globals:
+ * - #EnableLearning
+ * set to true by this routine
+ */
+void Classify::SettupPass1() {
+  EnableLearning = classify_enable_learning;
+  getDict().SettupStopperPass1();
+} /* SettupPass1 */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Prepares the adaptive matcher for the start of the second pass:
+ * further learning is switched off.
+ *
+ * Globals:
+ * - #EnableLearning set to false by this routine
+ */
+void Classify::SettupPass2() {
+  EnableLearning = false;
+  getDict().SettupStopperPass2();
+} /* SettupPass2 */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine creates a new adapted
+ * class and uses Blob as the model for the first
+ * config in that class.
+ *
+ * @param Blob blob to model new class after
+ * @param ClassId id of the class to be initialized
+ * @param FontinfoId font information inferred from pre-trained templates
+ * @param Class adapted class to be initialized
+ * @param Templates adapted templates to add new class to
+ *
+ * Globals:
+ * - #AllProtosOn dummy mask with all 1's
+ * - BaselineCutoffs kludge needed to get cutoffs
+ * - #PreTrainedTemplates kludge needed to get cutoffs
+ */
+void Classify::InitAdaptedClass(TBLOB *Blob,
+                                CLASS_ID ClassId,
+                                int FontinfoId,
+                                ADAPT_CLASS Class,
+                                ADAPT_TEMPLATES Templates) {
+  FEATURE_SET Features;
+  int Fid, Pid;
+  FEATURE Feature;
+  int NumFeatures;
+  TEMP_PROTO TempProto;
+  PROTO Proto;
+  INT_CLASS IClass;
+  TEMP_CONFIG Config;
+
+  classify_norm_method.set_value(baseline);
+  Features = ExtractOutlineFeatures(Blob);
+  NumFeatures = Features->NumFeatures;
+  // Reject blobs with an implausible number of outline features.
+  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
+    FreeFeatureSet(Features);
+    return;
+  }
+
+  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
+  TempConfigFor(Class, 0) = Config;
+
+  /* this is a kludge to construct cutoffs for adapted templates */
+  if (Templates == AdaptedTemplates)
+    BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
+
+  IClass = ClassForClassId (Templates->Templates, ClassId);
+
+  // Make one temporary proto per outline feature of the model blob.
+  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
+    Pid = AddIntProto (IClass);
+    assert (Pid != NO_PROTO);
+
+    Feature = Features->Features[Fid];
+    TempProto = NewTempProto ();
+    Proto = &(TempProto->Proto);
+
+    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
+       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
+       instead of the -0.25 to 0.75 used in baseline normalization */
+    Proto->Angle = Feature->Params[OutlineFeatDir];
+    Proto->X = Feature->Params[OutlineFeatX];
+    Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
+    Proto->Length = Feature->Params[OutlineFeatLength];
+    FillABC(Proto);
+
+    TempProto->ProtoId = Pid;
+    SET_BIT (Config->Protos, Pid);
+
+    ConvertProto(Proto, Pid, IClass);
+    AddProtoToProtoPruner(Proto, Pid, IClass,
+                          classify_learning_debug_level >= 2);
+
+    Class->TempProtos = push (Class->TempProtos, TempProto);
+  }
+  FreeFeatureSet(Features);
+
+  // The single new config (config 0) enables all protos.
+  AddIntConfig(IClass);
+  ConvertConfig (AllProtosOn, 0, IClass);
+
+  if (classify_learning_debug_level >= 1) {
+    tprintf("Added new class '%s' with class id %d and %d protos.\n",
+            unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
+#ifndef GRAPHICS_DISABLED
+    if (classify_learning_debug_level > 1)
+      DisplayAdaptedChar(Blob, IClass);
+#endif
+  }
+
+  // NOTE(review): this increments NumNonEmptyClasses only while the class
+  // still tests as empty here -- presumably IsEmptyAdaptedClass reflects
+  // state not updated by this routine; confirm against adaptive.h.
+  if (IsEmptyAdaptedClass(Class))
+    (Templates->NumNonEmptyClasses)++;
+} /* InitAdaptedClass */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Extracts baseline-normalized pico-features from Blob.
+ *
+ * The extracted features are converted to integer form in IntFeatures,
+ * while the original floating point feature set is handed back to the
+ * caller through FloatFeatures. On failure (no features, or an
+ * implausibly large count) the feature set is freed and 0 is returned.
+ *
+ * Globals: none
+ * @param Blob blob to extract features from
+ * @param[out] IntFeatures array to fill with integer features
+ * @param[out] FloatFeatures place to return actual floating-pt features
+ *
+ * @return Number of pico-features returned (0 if an error occurred)
+ */
+int Classify::GetAdaptiveFeatures(TBLOB *Blob,
+                                  INT_FEATURE_ARRAY IntFeatures,
+                                  FEATURE_SET *FloatFeatures) {
+  classify_norm_method.set_value(baseline);
+  FEATURE_SET features = ExtractPicoFeatures(Blob);
+  const int num_features = features->NumFeatures;
+  if (num_features == 0 || num_features > UNLIKELY_NUM_FEAT) {
+    // Nothing usable: discard the feature set and report failure.
+    FreeFeatureSet(features);
+    return 0;
+  }
+  ComputeIntFeatures(features, IntFeatures);
+  *FloatFeatures = features;
+  return num_features;
+} /* GetAdaptiveFeatures */
+
+
+/*-----------------------------------------------------------------------------
+              Private Code
+-----------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/**
+ * Return true if the specified word is acceptable for adaptation.
+ *
+ * Globals: none
+ *
+ * @param word current word
+ *
+ * @return true or false
+ */
+bool Classify::AdaptableWord(WERD_RES* word) {
+  if (word->best_choice == nullptr) return false;
+  const int choice_length = word->best_choice->length();
+  // Structural rules first, as they are the simplest to compute: the best
+  // choice must be non-empty, no longer than the adaptable limit, and must
+  // map one-to-one onto the rebuilt blobs.
+  if (choice_length <= 0 ||
+      choice_length != word->rebuild_word->NumBlobs() ||
+      choice_length > MAX_ADAPTABLE_WERD_SIZE)
+    return false;
+  // The word must be at least a dictionary match (freq word, user word,
+  // system dawg word, etc.): any non-dictionary adjustment pushes the
+  // adjust factor above adaptable_score = 1.1 + 0.05 = 1.15. Given the
+  // other dictionary-word flags, this check can at times be redundant.
+  const float adaptable_score =
+      getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
+  if (word->best_choice->adjust_factor() > adaptable_score) return false;
+  // Finally, make sure that alternative choices are not dictionary words.
+  return word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Adapts the given templates to Blob as an example of class ClassId.
+ *
+ * @param Blob blob to add to templates for ClassId
+ * @param ClassId class to add blob to
+ * @param FontinfoId font information from pre-trained templates
+ * @param Threshold minimum match rating to existing template
+ * @param adaptive_templates current set of adapted templates
+ *
+ * Globals:
+ * - AllProtosOn dummy mask to match against all protos
+ * - AllConfigsOn dummy mask to match against all configs
+ */
+void Classify::AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
+                           float Threshold,
+                           ADAPT_TEMPLATES adaptive_templates) {
+  int NumFeatures;
+  INT_FEATURE_ARRAY IntFeatures;
+  UnicharRating int_result;
+  INT_CLASS IClass;
+  ADAPT_CLASS Class;
+  TEMP_CONFIG TempConfig;
+  FEATURE_SET FloatFeatures;
+  int NewTempConfigId;
+
+  if (!LegalClassId (ClassId))
+    return;
+
+  int_result.unichar_id = ClassId;
+  Class = adaptive_templates->Class[ClassId];
+  assert(Class != nullptr);
+  if (IsEmptyAdaptedClass(Class)) {
+    // First example of this class: build it directly from the blob.
+    InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
+  } else {
+    IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
+
+    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
+    if (NumFeatures <= 0) {
+      return;  // Features already freed by GetAdaptiveFeatures.
+    }
+
+    // Only match configs with the matching font.
+    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
+    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
+      if (GetFontinfoId(Class, cfg) == FontinfoId) {
+        SET_BIT(MatchingFontConfigs, cfg);
+      } else {
+        reset_bit(MatchingFontConfigs, cfg);
+      }
+    }
+    im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
+              NumFeatures, IntFeatures,
+              &int_result, classify_adapt_feature_threshold,
+              NO_DEBUG, matcher_debug_separate_windows);
+    FreeBitVector(MatchingFontConfigs);
+
+    SetAdaptiveThreshold(Threshold);
+
+    // rating is a similarity, so 1 - rating is the distance that is
+    // compared against the adaption threshold.
+    if (1.0f - int_result.rating <= Threshold) {
+      if (ConfigIsPermanent(Class, int_result.config)) {
+        // Good match to an already-permanent config: nothing to learn.
+        if (classify_learning_debug_level >= 1)
+          tprintf("Found good match to perm config %d = %4.1f%%.\n",
+                  int_result.config, int_result.rating * 100.0);
+        FreeFeatureSet(FloatFeatures);
+        return;
+      }
+
+      // Good match to a temporary config: increase its confidence and
+      // promote it to permanent once it is seen often enough.
+      TempConfig = TempConfigFor(Class, int_result.config);
+      IncreaseConfidence(TempConfig);
+      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
+        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
+      }
+      if (classify_learning_debug_level >= 1)
+        tprintf("Increasing reliability of temp config %d to %d.\n",
+                int_result.config, TempConfig->NumTimesSeen);
+
+      if (TempConfigReliable(ClassId, TempConfig)) {
+        MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
+        UpdateAmbigsGroup(ClassId, Blob);
+      }
+    } else {
+      // Poor match: start a new temporary config from this blob.
+      if (classify_learning_debug_level >= 1) {
+        tprintf("Found poor match to temp config %d = %4.1f%%.\n",
+                int_result.config, int_result.rating * 100.0);
+#ifndef GRAPHICS_DISABLED
+        if (classify_learning_debug_level > 2)
+          DisplayAdaptedChar(Blob, IClass);
+#endif
+      }
+      NewTempConfigId =
+          MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
+                                 NumFeatures, IntFeatures, FloatFeatures);
+      if (NewTempConfigId >= 0 &&
+          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
+        MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
+        UpdateAmbigsGroup(ClassId, Blob);
+      }
+
+#ifndef GRAPHICS_DISABLED
+      if (classify_learning_debug_level > 1) {
+        DisplayAdaptedChar(Blob, IClass);
+      }
+#endif
+    }
+    FreeFeatureSet(FloatFeatures);
+  }
+} /* AdaptToChar */
+
+#ifndef GRAPHICS_DISABLED
+
+// Debug helper: matches blob against int_class with all protos/configs
+// enabled, reports the best config's rating, and (at debug level >= 2)
+// rematches with only that config to drive the match display windows.
+void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) {
+  INT_FX_RESULT_STRUCT fx_info;
+  std::vector<INT_FEATURE_STRUCT> bl_features;
+  TrainingSample* sample =
+      BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info,
+                           &bl_features);
+  if (sample == nullptr) return;
+
+  UnicharRating int_result;
+  im_.Match(int_class, AllProtosOn, AllConfigsOn,
+            bl_features.size(), &bl_features[0],
+            &int_result, classify_adapt_feature_threshold,
+            NO_DEBUG, matcher_debug_separate_windows);
+  tprintf("Best match to temp config %d = %4.1f%%.\n",
+          int_result.config, int_result.rating * 100.0);
+  if (classify_learning_debug_level >= 2) {
+    uint32_t ConfigMask;
+    // A single 32-bit word acting as a one-word BIT_VECTOR with only the
+    // best config's bit set.
+    ConfigMask = 1 << int_result.config;
+    ShowMatchDisplay();
+    // NOTE(review): 6 | 0x19 is a magic combination of matcher debug
+    // flags -- see the IntegerMatcher debug flag definitions to decode.
+    im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask),
+              bl_features.size(), &bl_features[0],
+              &int_result, classify_adapt_feature_threshold,
+              6 | 0x19, matcher_debug_separate_windows);
+    UpdateMatchDisplay();
+  }
+
+  delete sample;
+}
+
+#endif
+
+/**
+ * This routine adds the result of a classification into
+ * Results. If the new rating is much worse than the current
+ * best rating, it is not entered into results because it
+ * would end up being stripped later anyway. If the new rating
+ * is better than the old rating for the class, it replaces the
+ * old rating. If this is the first rating for the class, the
+ * class is added to the list of matched classes in Results.
+ * If the new rating is better than the best so far, it
+ * becomes the best so far.
+ *
+ * Globals:
+ * - #matcher_bad_match_pad defines limits of an acceptable match
+ *
+ * @param new_result new result to add
+ * @param[out] results results to add new result to
+ */
+void Classify::AddNewResult(const UnicharRating& new_result,
+                            ADAPT_RESULTS *results) {
+  int old_match = FindScoredUnichar(new_result.unichar_id, *results);
+
+  // Skip if far below the current best, or no better than the existing
+  // entry for the same unichar.
+  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
+      (old_match < results->match.size() &&
+       new_result.rating <= results->match[old_match].rating))
+    return;  // New one not good enough.
+
+  if (!unicharset.get_fragment(new_result.unichar_id))
+    results->HasNonfragment = true;
+
+  if (old_match < results->match.size()) {
+    // Existing entry: only the rating is improved; its other fields stay.
+    results->match[old_match].rating = new_result.rating;
+  } else {
+    results->match.push_back(new_result);
+  }
+
+  if (new_result.rating > results->best_rating &&
+      // Ensure that fragments do not affect best rating, class and config.
+      // This is needed so that at least one non-fragmented character is
+      // always present in the results.
+      // TODO(daria): verify that this helps accuracy and does not
+      // hurt performance.
+      !unicharset.get_fragment(new_result.unichar_id)) {
+    results->best_match_index = old_match;
+    results->best_rating = new_result.rating;
+    results->best_unichar_id = new_result.unichar_id;
+  }
+} /* AddNewResult */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine is identical to CharNormClassifier()
+ * except that it does no class pruning. It simply matches
+ * the unknown blob against the classes listed in
+ * Ambiguities.
+ *
+ * Globals:
+ * - #AllProtosOn mask that enables all protos
+ * - #AllConfigsOn mask that enables all configs
+ *
+ * @param blob blob to be classified
+ * @param templates built-in templates to classify against
+ * @param classes adapted class templates
+ * @param ambiguities array of unichar id's to match against
+ * @param[out] results place to put match results
+ * @param int_features
+ * @param fx_info
+ */
+void Classify::AmbigClassifier(
+    const std::vector<INT_FEATURE_STRUCT>& int_features,
+    const INT_FX_RESULT_STRUCT& fx_info,
+    const TBLOB *blob,
+    INT_TEMPLATES templates,
+    ADAPT_CLASS *classes,
+    UNICHAR_ID *ambiguities,
+    ADAPT_RESULTS *results) {
+  if (int_features.empty()) return;
+  auto* CharNormArray = new uint8_t[unicharset.size()];
+  UnicharRating int_result;
+
+  results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr,
+                                           CharNormArray);
+  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
+  if (debug)
+    tprintf("AM Matches = ");
+
+  int top = blob->bounding_box().top();
+  int bottom = blob->bounding_box().bottom();
+  // The ambiguities array is terminated by a negative unichar id.
+  while (*ambiguities >= 0) {
+    CLASS_ID class_id = *ambiguities;
+
+    int_result.unichar_id = class_id;
+    im_.Match(ClassForClassId(templates, class_id),
+              AllProtosOn, AllConfigsOn,
+              int_features.size(), &int_features[0],
+              &int_result,
+              classify_adapt_feature_threshold, NO_DEBUG,
+              matcher_debug_separate_windows);
+
+    ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0,
+                                    results->BlobLength,
+                                    classify_integer_matcher_multiplier,
+                                    CharNormArray, &int_result, results);
+    ambiguities++;
+  }
+  delete [] CharNormArray;
+} /* AmbigClassifier */
+
+/*---------------------------------------------------------------------------*/
+/// Factored-out calls to IntegerMatcher based on class pruner results.
+/// Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.
+void Classify::MasterMatcher(INT_TEMPLATES templates,
+                             int16_t num_features,
+                             const INT_FEATURE_STRUCT* features,
+                             const uint8_t* norm_factors,
+                             ADAPT_CLASS* classes,
+                             int debug,
+                             int matcher_multiplier,
+                             const TBOX& blob_box,
+                             const std::vector<CP_RESULT_STRUCT>& results,
+                             ADAPT_RESULTS* final_results) {
+  int top = blob_box.top();
+  int bottom = blob_box.bottom();
+  UnicharRating int_result;
+  // Run the integer matcher on each class the pruner kept.
+  for (int c = 0; c < results.size(); c++) {
+    CLASS_ID class_id = results[c].Class;
+    // Adapted templates match only permanent protos/configs; pre-trained
+    // templates (classes == nullptr) match everything.
+    BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos
+                                           : AllProtosOn;
+    BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs
+                                            : AllConfigsOn;
+
+    int_result.unichar_id = class_id;
+    im_.Match(ClassForClassId(templates, class_id),
+              protos, configs,
+              num_features, features,
+              &int_result, classify_adapt_feature_threshold, debug,
+              matcher_debug_separate_windows);
+    bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
+    ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top,
+                                    results[c].Rating,
+                                    final_results->BlobLength,
+                                    matcher_multiplier, norm_factors,
+                                    &int_result, final_results);
+  }
+}
+
+// Converts configs to fonts, and if the result is not adapted, and a
+// shape_table_ is present, the shape is expanded to include all
+// unichar_ids represented, before applying a set of corrections to the
+// distance rating in int_result, (see ComputeCorrectedRating.)
+// The results are added to the final_results output.
+// classes: adapted classes, or nullptr for a pre-trained (static) result.
+// bottom/top: blob bounding-box y-extents used for the vertical-misfit
+// penalty inside ComputeCorrectedRating.
+// cn_factors: per-unichar char-norm factors indexed by unichar_id.
+void Classify::ExpandShapesAndApplyCorrections(
+    ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top,
+    float cp_rating, int blob_length, int matcher_multiplier,
+    const uint8_t* cn_factors,
+    UnicharRating* int_result, ADAPT_RESULTS* final_results) {
+  if (classes != nullptr) {
+    // Adapted result. Convert configs to fontinfo_ids.
+    int_result->adapted = true;
+    for (int f = 0; f < int_result->fonts.size(); ++f) {
+      int_result->fonts[f].fontinfo_id =
+          GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
+    }
+  } else {
+    // Pre-trained result. Map fonts using font_sets_.
+    int_result->adapted = false;
+    for (int f = 0; f < int_result->fonts.size(); ++f) {
+      int_result->fonts[f].fontinfo_id =
+          ClassAndConfigIDToFontOrShapeID(class_id,
+                                          int_result->fonts[f].fontinfo_id);
+    }
+    if (shape_table_ != nullptr) {
+      // Two possible cases:
+      // 1. Flat shapetable. All unichar-ids of the shapes referenced by
+      // int_result->fonts are the same. In this case build a new vector of
+      // mapped fonts and replace the fonts in int_result.
+      // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
+      // by int_result. In this case, build a vector of UnicharRating to
+      // gather together different font-ids for each unichar. Also covers case1.
+      GenericVector<UnicharRating> mapped_results;
+      for (int f = 0; f < int_result->fonts.size(); ++f) {
+        int shape_id = int_result->fonts[f].fontinfo_id;
+        const Shape& shape = shape_table_->GetShape(shape_id);
+        for (int c = 0; c < shape.size(); ++c) {
+          int unichar_id = shape[c].unichar_id;
+          if (!unicharset.get_enabled(unichar_id)) continue;
+          // Find the mapped_result for unichar_id.
+          // Linear scan: mapped_results is expected to stay small (one entry
+          // per distinct unichar in the matched shapes).
+          int r = 0;
+          for (r = 0; r < mapped_results.size() &&
+               mapped_results[r].unichar_id != unichar_id; ++r) {}
+          if (r == mapped_results.size()) {
+            // First sighting of this unichar: clone the rating, then reset
+            // the font list so only mapped font-ids accumulate below.
+            mapped_results.push_back(*int_result);
+            mapped_results[r].unichar_id = unichar_id;
+            mapped_results[r].fonts.clear();
+          }
+          for (int i = 0; i < shape[c].font_ids.size(); ++i) {
+            mapped_results[r].fonts.push_back(
+                ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
+          }
+        }
+      }
+      // Correct each per-unichar rating and emit it; the early return skips
+      // the single-unichar path below.
+      for (int m = 0; m < mapped_results.size(); ++m) {
+        mapped_results[m].rating =
+            ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
+                                   cp_rating, int_result->rating,
+                                   int_result->feature_misses, bottom, top,
+                                   blob_length, matcher_multiplier, cn_factors);
+        AddNewResult(mapped_results[m], final_results);
+      }
+      return;
+    }
+  }
+  // Single-unichar case (adapted result, or no shape table): correct the
+  // rating in place and add it if the class is enabled.
+  if (unicharset.get_enabled(class_id)) {
+    int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
+                                                int_result->rating,
+                                                int_result->feature_misses,
+                                                bottom, top, blob_length,
+                                                matcher_multiplier, cn_factors);
+    AddNewResult(*int_result, final_results);
+  }
+}
+
+// Applies a set of corrections to the confidence im_rating,
+// including the cn_correction, miss penalty and additional penalty
+// for non-alnums being vertical misfits. Returns the corrected confidence.
+// cp_rating is the class-pruner rating (used only for the debug print);
+// im_rating is the integer-matcher confidence being corrected.
+// The return value is clamped below at WORST_POSSIBLE_RATING.
+double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
+                                        double cp_rating, double im_rating,
+                                        int feature_misses,
+                                        int bottom, int top,
+                                        int blob_length, int matcher_multiplier,
+                                        const uint8_t* cn_factors) {
+  // Compute class feature corrections.
+  // Note: corrections operate on the distance (1 - confidence), which is
+  // converted back to a confidence at the end.
+  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
+                                              cn_factors[unichar_id],
+                                              matcher_multiplier);
+  double miss_penalty = tessedit_class_miss_scale * feature_misses;
+  double vertical_penalty = 0.0;
+  // Penalize non-alnums for being vertical misfits.
+  if (!unicharset.get_isalpha(unichar_id) &&
+      !unicharset.get_isdigit(unichar_id) &&
+      cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
+    int min_bottom, max_bottom, min_top, max_top;
+    unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
+                              &min_top, &max_top);
+    if (debug) {
+      tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
+              top, min_top, max_top, bottom, min_bottom, max_bottom);
+    }
+    // Outside the expected [min, max] band on either edge => misfit.
+    if (top < min_top || top > max_top ||
+        bottom < min_bottom || bottom > max_bottom) {
+      vertical_penalty = classify_misfit_junk_penalty;
+    }
+  }
+  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
+  if (result < WORST_POSSIBLE_RATING)
+    result = WORST_POSSIBLE_RATING;
+  if (debug) {
+    tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
+            unicharset.id_to_unichar(unichar_id),
+            result * 100.0,
+            cp_rating * 100.0,
+            (1.0 - im_rating) * 100.0,
+            (cn_corrected - (1.0 - im_rating)) * 100.0,
+            cn_factors[unichar_id],
+            miss_penalty * 100.0,
+            vertical_penalty * 100.0);
+  }
+  return result;
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine extracts baseline normalized features
+ * from the unknown character and matches them against the
+ * specified set of templates. The classes which match
+ * are added to Results.
+ *
+ * Globals:
+ * - BaselineCutoffs expected num features for each class
+ *
+ * @param Blob blob to be classified
+ * @param Templates current set of adapted templates
+ * @param Results place to put match results
+ * @param int_features baseline-normalized int features of Blob
+ * @param fx_info feature-extraction summary (Length used for BlobLength)
+ *
+ * @return Array of possible ambiguous chars that should be checked.
+ * NOTE(review): the returned pointer aliases the permanent config stored in
+ * Templates and must not be freed by the caller; it is nullptr when there is
+ * no valid best match.
+ */
+UNICHAR_ID *Classify::BaselineClassifier(
+    TBLOB *Blob, const std::vector<INT_FEATURE_STRUCT>& int_features,
+    const INT_FX_RESULT_STRUCT& fx_info,
+    ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) {
+  if (int_features.empty()) return nullptr;
+  auto* CharNormArray = new uint8_t[unicharset.size()];
+  // Baseline matching uses no char-norm correction, so the array is zeroed.
+  ClearCharNormArray(CharNormArray);
+
+  Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
+  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
+               CharNormArray, BaselineCutoffs, &Results->CPResults);
+
+  if (matcher_debug_level >= 2 || classify_debug_level > 1)
+    tprintf("BL Matches = ");
+
+  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
+                CharNormArray,
+                Templates->Class, matcher_debug_flags, 0,
+                Blob->bounding_box(), Results->CPResults, Results);
+
+  delete [] CharNormArray;
+  CLASS_ID ClassId = Results->best_unichar_id;
+  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
+    return nullptr;
+
+  // Return the ambiguity list of the best match's permanent config.
+  return Templates->Class[ClassId]->
+      Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
+} /* BaselineClassifier */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine extracts character normalized features
+ * from the unknown character and matches them against the
+ * specified set of templates. The classes which match
+ * are added to Results.
+ *
+ * @param blob blob to be classified
+ * @param sample templates to classify unknown against
+ * @param adapt_results place to put match results
+ *
+ * Globals:
+ * - CharNormCutoffs expected num features for each class
+ * - AllProtosOn mask that enables all protos
+ * - AllConfigsOn mask that enables all configs
+ *
+ * @return Number of features in sample (used by callers as the match length).
+ */
+int Classify::CharNormClassifier(TBLOB *blob,
+                                 const TrainingSample& sample,
+                                 ADAPT_RESULTS *adapt_results) {
+  // This is the length that is used for scaling ratings vs certainty.
+  adapt_results->BlobLength =
+      IntCastRounded(sample.outline_length() / kStandardFeatureLength);
+  std::vector<UnicharRating> unichar_results;
+  // Delegate the actual matching to the static (pre-trained) classifier.
+  // NOTE(review): callers must ensure static_classifier_ is non-null (see the
+  // guard in DoAdaptiveMatch).
+  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
+                                            -1, &unichar_results);
+  // Convert results to the format used internally by AdaptiveClassifier.
+  for (int r = 0; r < unichar_results.size(); ++r) {
+    AddNewResult(unichar_results[r], adapt_results);
+  }
+  return sample.num_features();
+} /* CharNormClassifier */
+
+// As CharNormClassifier, but operates on a TrainingSample and outputs to
+// a GenericVector of ShapeRating without conversion to classes.
+// pruner_only: if true, return raw class-pruner results instead of running
+// the full integer matcher.
+// keep_this: if >= 0, restrict pruner results to this single class id.
+// Returns the number of features in the sample.
+int Classify::CharNormTrainingSample(bool pruner_only,
+                                     int keep_this,
+                                     const TrainingSample& sample,
+                                     std::vector<UnicharRating>* results) {
+  results->clear();
+  auto* adapt_results = new ADAPT_RESULTS();
+  adapt_results->Initialize();
+  // Compute the bounding box of the features.
+  uint32_t num_features = sample.num_features();
+  // Only the top and bottom of the blob_box are used by MasterMatcher, so
+  // fabricate right and left using top and bottom.
+  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
+                sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
+  // Compute the char_norm_array from the saved cn_feature.
+  FEATURE norm_feature = sample.GetCNFeature();
+  auto* char_norm_array = new uint8_t[unicharset.size()];
+  // The pruner array must cover whichever is larger: unichars or classes.
+  int num_pruner_classes = std::max(unicharset.size(),
+                                    PreTrainedTemplates->NumClasses);
+  auto* pruner_norm_array = new uint8_t[num_pruner_classes];
+  adapt_results->BlobLength =
+      static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
+  // ComputeCharNormArrays also frees norm_feature.
+  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
+                        pruner_norm_array);
+
+  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
+               pruner_norm_array,
+               shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
+               &adapt_results->CPResults);
+  delete [] pruner_norm_array;
+  if (keep_this >= 0) {
+    adapt_results->CPResults[0].Class = keep_this;
+    adapt_results->CPResults.resize(1);
+  }
+  if (pruner_only) {
+    // Convert pruner results to output format.
+    // Pruner Rating is a distance, so confidence = 1 - rating.
+    for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
+      int class_id = adapt_results->CPResults[i].Class;
+      results->push_back(
+          UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
+    }
+  } else {
+    MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
+                  char_norm_array,
+                  nullptr, matcher_debug_flags,
+                  classify_integer_matcher_multiplier,
+                  blob_box, adapt_results->CPResults, adapt_results);
+    // Convert master matcher results to output format.
+    for (int i = 0; i < adapt_results->match.size(); i++) {
+      results->push_back(adapt_results->match[i]);
+    }
+    if (results->size() > 1) {
+      std::sort(results->begin(), results->end(), SortDescendingRating);
+    }
+  }
+  delete [] char_norm_array;
+  delete adapt_results;
+  return num_features;
+} /* CharNormTrainingSample */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine computes a rating which reflects the
+ * likelihood that the blob being classified is a noise
+ * blob. NOTE: assumes that the blob length has already been
+ * computed and placed into Results.
+ *
+ * @param results results to add noise classification to
+ *
+ * Globals:
+ * - matcher_avg_noise_size avg. length of a noise blob
+ */
+void Classify::ClassifyAsNoise(ADAPT_RESULTS *results) {
+  // Map the length ratio r into r^2/(1+r^2), which lies in [0, 1): small
+  // blobs (relative to average noise size) get a low rating, large ones
+  // approach 1, so the resulting confidence (1 - rating) favors small blobs.
+  float rating = results->BlobLength / matcher_avg_noise_size;
+  rating *= rating;
+  rating /= 1.0 + rating;
+
+  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
+} /* ClassifyAsNoise */
+
+/// The function converts the given match ratings to the list of blob
+/// choices with ratings and certainties (used by the context checkers).
+/// If character fragments are present in the results, this function also makes
+/// sure that there is at least one non-fragmented classification included.
+/// For each classification result check the unicharset for "definite"
+/// ambiguities and modify the resulting Choices accordingly.
+/// On return, Results->match is truncated to the entries actually emitted.
+void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
+                                       ADAPT_RESULTS *Results,
+                                       BLOB_CHOICE_LIST *Choices) {
+  assert(Choices != nullptr);
+  float Rating;
+  float Certainty;
+  BLOB_CHOICE_IT temp_it;
+  bool contains_nonfrag = false;
+  temp_it.set_to_list(Choices);
+  int choices_length = 0;
+  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
+  // number of returned results, but with a shape_table_ we want to have room
+  // for at least the biggest shape (which might contain hundreds of Indic
+  // grapheme fragments) and more, so use double the size of the biggest shape
+  // if that is more than the default.
+  int max_matches = MAX_MATCHES;
+  if (shape_table_ != nullptr) {
+    max_matches = shape_table_->MaxNumUnichars() * 2;
+    if (max_matches < MAX_MATCHES)
+      max_matches = MAX_MATCHES;
+  }
+
+  float best_certainty = -FLT_MAX;
+  for (int i = 0; i < Results->match.size(); i++) {
+    const UnicharRating& result = Results->match[i];
+    bool adapted = result.adapted;
+    bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
+    if (temp_it.length()+1 == max_matches &&
+        !contains_nonfrag && current_is_frag) {
+      continue;  // look for a non-fragmented character to fill the
+                 // last spot in Choices if only fragments are present
+    }
+    // BlobLength can never be legally 0, this means recognition failed.
+    // But we must return a classification result because some invoking
+    // functions (chopper/permuter) do not anticipate a null blob choice.
+    // So we need to assign a poor, but not infinitely bad score.
+    if (Results->BlobLength == 0) {
+      Certainty = -20;
+      Rating = 100;    // should be -certainty * real_blob_length
+    } else {
+      Rating = Certainty = (1.0f - result.rating);
+      Rating *= rating_scale * Results->BlobLength;
+      Certainty *= -(getDict().certainty_scale);
+    }
+    // Adapted results, by their very nature, should have good certainty.
+    // Those that don't are at best misleading, and often lead to errors,
+    // so don't accept adapted results that are too far behind the best result,
+    // whether adapted or static.
+    // TODO(rays) find some way of automatically tuning these constants.
+    if (Certainty > best_certainty) {
+      // Cap best_certainty so weak adapted leaders do not mask static results.
+      best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
+    } else if (adapted &&
+               Certainty / classify_adapted_pruning_factor < best_certainty) {
+      continue;  // Don't accept bad adapted results.
+    }
+
+    float min_xheight, max_xheight, yshift;
+    denorm.XHeightRange(result.unichar_id, unicharset, box,
+                        &min_xheight, &max_xheight, &yshift);
+    auto* choice =
+        new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
+                        unicharset.get_script(result.unichar_id),
+                        min_xheight, max_xheight, yshift,
+                        adapted ? BCC_ADAPTED_CLASSIFIER
+                                : BCC_STATIC_CLASSIFIER);
+    choice->set_fonts(result.fonts);
+    temp_it.add_to_end(choice);
+    contains_nonfrag |= !current_is_frag;  // update contains_nonfrag
+    choices_length++;
+    if (choices_length >= max_matches) break;
+  }
+  // Drop any matches that were skipped above so Results mirrors Choices.
+  Results->match.resize(choices_length);
+}  // ConvertMatchesToChoices
+
+
+/*---------------------------------------------------------------------------*/
+#ifndef GRAPHICS_DISABLED
+/**
+ * Displays the match debug window for the given blob via the static
+ * classifier's DebugDisplay.
+ *
+ * @param blob blob whose classification is being debugged
+ * @param Results results of match being debugged
+ *
+ * Globals: none
+ */
+void Classify::DebugAdaptiveClassifier(TBLOB *blob,
+                                       ADAPT_RESULTS *Results) {
+  if (static_classifier_ == nullptr) return;
+  INT_FX_RESULT_STRUCT fx_info;
+  std::vector<INT_FEATURE_STRUCT> bl_features;
+  TrainingSample* sample =
+      BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
+  if (sample == nullptr) return;
+  // NOTE(review): sample is never deleted here, unlike DoAdaptiveMatch and
+  // GetAmbiguities which delete it — looks like a memory leak; verify
+  // BlobToTrainingSample's ownership contract.
+  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
+                                   Results->best_unichar_id);
+} /* DebugAdaptiveClassifier */
+#endif
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine performs an adaptive classification.
+ * If we have not yet adapted to enough classes, a simple
+ * classification to the pre-trained templates is performed.
+ * Otherwise, we match the blob against the adapted templates.
+ * If the adapted templates do not match well, we try a
+ * match against the pre-trained templates. If an adapted
+ * template match is found, we do a match to any pre-trained
+ * templates which could be ambiguous. The results from all
+ * of these classifications are merged together into Results.
+ *
+ * @param Blob blob to be classified
+ * @param Results place to put match results
+ *
+ * Globals:
+ * - PreTrainedTemplates built-in training templates
+ * - AdaptedTemplates templates adapted for this page
+ * - matcher_reliable_adaptive_result rating limit for a great match
+ */
+void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) {
+  UNICHAR_ID *Ambiguities;
+
+  INT_FX_RESULT_STRUCT fx_info;
+  std::vector<INT_FEATURE_STRUCT> bl_features;
+  TrainingSample* sample =
+      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info,
+                           &bl_features);
+  if (sample == nullptr) return;
+
+  // TODO: With LSTM, static_classifier_ is nullptr.
+  // Return to avoid crash in CharNormClassifier.
+  if (static_classifier_ == nullptr) {
+    delete sample;
+    return;
+  }
+
+  // Too few permanent adapted classes (or forced CN matching): go straight
+  // to the pre-trained (char-norm) classifier.
+  if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min ||
+      tess_cn_matching) {
+    CharNormClassifier(Blob, *sample, Results);
+  } else {
+    Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
+                                     AdaptedTemplates, Results);
+    // Fall back to the static classifier when the adapted match is only
+    // marginal (or empty); otherwise check just the known ambiguities.
+    if ((!Results->match.empty() &&
+         MarginalMatch(Results->best_rating,
+                       matcher_reliable_adaptive_result) &&
+         !tess_bn_matching) ||
+        Results->match.empty()) {
+      CharNormClassifier(Blob, *sample, Results);
+    } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
+      AmbigClassifier(bl_features, fx_info, Blob,
+                      PreTrainedTemplates,
+                      AdaptedTemplates->Class,
+                      Ambiguities,
+                      Results);
+    }
+  }
+
+  // Force the blob to be classified as noise
+  // if the results contain only fragments.
+  // TODO(daria): verify that this is better than
+  // just adding a nullptr classification.
+  if (!Results->HasNonfragment || Results->match.empty())
+    ClassifyAsNoise(Results);
+  delete sample;
+} /* DoAdaptiveMatch */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine matches blob to the built-in templates
+ * to find out if there are any classes other than the correct
+ * class which are potential ambiguities.
+ *
+ * @param Blob blob to get classification ambiguities for
+ * @param CorrectClass correct class for Blob
+ *
+ * Globals:
+ * - CurrentRatings used by qsort compare routine
+ * - PreTrainedTemplates built-in templates
+ *
+ * @return String containing all possible ambiguous classes.
+ * The returned array is heap-allocated, terminated by -1, and owned by the
+ * caller; nullptr is returned only if feature extraction fails.
+ */
+UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob,
+                                     CLASS_ID CorrectClass) {
+  auto *Results = new ADAPT_RESULTS();
+  UNICHAR_ID *Ambiguities;
+  int i;
+
+  Results->Initialize();
+  INT_FX_RESULT_STRUCT fx_info;
+  std::vector<INT_FEATURE_STRUCT> bl_features;
+  TrainingSample* sample =
+      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info,
+                           &bl_features);
+  if (sample == nullptr) {
+    delete Results;
+    return nullptr;
+  }
+
+  CharNormClassifier(Blob, *sample, Results);
+  delete sample;
+  RemoveBadMatches(Results);
+  std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
+
+  /* copy the class id's into a string of ambiguities - don't copy if
+     the correct class is the only class id matched */
+  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
+  if (Results->match.size() > 1 ||
+      (Results->match.size() == 1 &&
+       Results->match[0].unichar_id != CorrectClass)) {
+    for (i = 0; i < Results->match.size(); i++)
+      Ambiguities[i] = Results->match[i].unichar_id;
+    Ambiguities[i] = -1;  // -1 terminates the list.
+  } else {
+    Ambiguities[0] = -1;  // Empty list: only the correct class matched.
+  }
+
+  delete Results;
+  return Ambiguities;
+} /* GetAmbiguities */
+
+// Returns true if the given blob looks too dissimilar to any character
+// present in the classifier templates.
+// The decision is based on the certainty of the best whole-character
+// (non-fragment) rating; if every rating is a fragment, the blob is
+// considered garbage.
+bool Classify::LooksLikeGarbage(TBLOB *blob) {
+  auto *ratings = new BLOB_CHOICE_LIST();
+  AdaptiveClassifier(blob, ratings);
+  BLOB_CHOICE_IT ratings_it(ratings);
+  const UNICHARSET &unicharset = getDict().getUnicharset();
+  if (classify_debug_character_fragments) {
+    print_ratings_list("======================\nLooksLikeGarbage() got ",
+                       ratings, unicharset);
+  }
+  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
+       ratings_it.forward()) {
+    // Skip fragment classifications; only whole characters decide.
+    if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
+      continue;
+    }
+    // First non-fragment entry decides the outcome.
+    float certainty = ratings_it.data()->certainty();
+    delete ratings;
+    return certainty <
+            classify_character_fragments_garbage_certainty_threshold;
+  }
+  delete ratings;
+  return true;  // no whole characters in ratings
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine calls the integer (Hardware) feature
+ * extractor if it has not been called before for this blob.
+ *
+ * The results from the feature extractor are placed into
+ * globals so that they can be used in other routines without
+ * re-extracting the features.
+ *
+ * It then copies the char norm features into the IntFeatures
+ * array provided by the caller.
+ *
+ * @param templates used to compute char norm adjustments
+ * @param pruner_norm_array Array of factors from blob normalization
+ *        process
+ * @param char_norm_array array to fill with dummy char norm adjustments
+ * @param fx_info feature-extraction summary supplying Ymean/Length/Rx/Ry
+ *
+ * Globals:
+ *
+ * @return Number of features extracted or 0 if an error occurred.
+ */
+int Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT& fx_info,
+                                 INT_TEMPLATES templates,
+                                 uint8_t* pruner_norm_array,
+                                 uint8_t* char_norm_array) {
+  // Build a transient CharNorm feature from the fx summary statistics.
+  FEATURE norm_feature = NewFeature(&CharNormDesc);
+  float baseline = kBlnBaselineOffset;
+  float scale = MF_SCALE_FACTOR;
+  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
+  norm_feature->Params[CharNormLength] =
+      fx_info.Length * scale / LENGTH_COMPRESSION;
+  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
+  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
+  // Deletes norm_feature.
+  ComputeCharNormArrays(norm_feature, templates, char_norm_array,
+                        pruner_norm_array);
+  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
+} /* GetCharNormFeature */
+
+// Computes the char_norm_array for the unicharset and, if not nullptr, the
+// pruner_array as appropriate according to the existence of the shape_table.
+// Takes ownership of norm_feature and frees it before returning.
+void Classify::ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
+                                     INT_TEMPLATES_STRUCT* templates,
+                                     uint8_t* char_norm_array,
+                                     uint8_t* pruner_array) {
+  ComputeIntCharNormArray(*norm_feature, char_norm_array);
+  if (pruner_array != nullptr) {
+    if (shape_table_ == nullptr) {
+      // Without a shape table, classes map 1:1 to unichars.
+      ComputeIntCharNormArray(*norm_feature, pruner_array);
+    } else {
+      // Start at the max so the MIN-reduction below can only lower entries.
+      memset(pruner_array, UINT8_MAX,
+             templates->NumClasses * sizeof(pruner_array[0]));
+      // Each entry in the pruner norm array is the MIN of all the entries of
+      // the corresponding unichars in the CharNormArray.
+      for (int id = 0; id < templates->NumClasses; ++id) {
+        int font_set_id = templates->Class[id]->font_set_id;
+        const FontSet &fs = fontset_table_.get(font_set_id);
+        for (int config = 0; config < fs.size; ++config) {
+          const Shape& shape = shape_table_->GetShape(fs.configs[config]);
+          for (int c = 0; c < shape.size(); ++c) {
+            if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
+              pruner_array[id] = char_norm_array[shape[c].unichar_id];
+          }
+        }
+      }
+    }
+  }
+  FreeFeature(norm_feature);
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Creates a new temporary config in the adapted class for ClassId, reusing
+ * existing protos that match well and creating new temp protos for the
+ * features that none of them cover.
+ *
+ * @param Templates adapted templates to add new config to
+ * @param ClassId class id to associate with new config
+ * @param FontinfoId font information inferred from pre-trained templates
+ * @param NumFeatures number of features in IntFeatures
+ * @param Features features describing model for new config
+ * @param FloatFeatures floating-pt representation of features
+ *
+ * @return The id of the new config created, a negative integer in
+ * case of error.
+ */
+int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
+                                     CLASS_ID ClassId,
+                                     int FontinfoId,
+                                     int NumFeatures,
+                                     INT_FEATURE_ARRAY Features,
+                                     FEATURE_SET FloatFeatures) {
+  INT_CLASS IClass;
+  ADAPT_CLASS Class;
+  PROTO_ID OldProtos[MAX_NUM_PROTOS];
+  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
+  int NumOldProtos;
+  int NumBadFeatures;
+  int MaxProtoId, OldMaxProtoId;
+  int MaskSize;
+  int ConfigId;
+  TEMP_CONFIG Config;
+  int i;
+  int debug_level = NO_DEBUG;
+
+  if (classify_learning_debug_level >= 3)
+    debug_level =
+        PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;
+
+  IClass = ClassForClassId(Templates->Templates, ClassId);
+  Class = Templates->Class[ClassId];
+
+  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
+    ++NumAdaptationsFailed;
+    if (classify_learning_debug_level >= 1)
+      tprintf("Cannot make new temporary config: maximum number exceeded.\n");
+    return -1;
+  }
+
+  OldMaxProtoId = IClass->NumProtos - 1;
+
+  // Find existing protos that already match the features well.
+  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
+                                    NumFeatures, Features,
+                                    OldProtos, classify_adapt_proto_threshold,
+                                    debug_level);
+
+  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
+  zero_all_bits(TempProtoMask, MaskSize);
+  for (i = 0; i < NumOldProtos; i++)
+    SET_BIT(TempProtoMask, OldProtos[i]);
+
+  // Features not covered by the good protos need new protos of their own.
+  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
+                                       NumFeatures, Features,
+                                       BadFeatures,
+                                       classify_adapt_feature_threshold,
+                                       debug_level);
+
+  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
+                                 IClass, Class, TempProtoMask);
+  if (MaxProtoId == NO_PROTO) {
+    ++NumAdaptationsFailed;
+    if (classify_learning_debug_level >= 1)
+      tprintf("Cannot make new temp protos: maximum number exceeded.\n");
+    return -1;
+  }
+
+  ConfigId = AddIntConfig(IClass);
+  ConvertConfig(TempProtoMask, ConfigId, IClass);
+  Config = NewTempConfig(MaxProtoId, FontinfoId);
+  TempConfigFor(Class, ConfigId) = Config;
+  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
+
+  if (classify_learning_debug_level >= 1)
+    tprintf("Making new temp config %d fontinfo id %d"
+            " using %d old and %d new protos.\n",
+            ConfigId, Config->FontinfoId,
+            NumOldProtos, MaxProtoId - OldMaxProtoId);
+
+  return ConfigId;
+} /* MakeNewTemporaryConfig */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine finds sets of sequential bad features
+ * that all have the same angle and converts each set into
+ * a new temporary proto. The temp proto is added to the
+ * proto pruner for IClass, pushed onto the list of temp
+ * protos in Class, and added to TempProtoMask.
+ *
+ * @param Features floating-pt features describing new character
+ * @param NumBadFeat number of bad features to turn into protos
+ * @param BadFeat feature id's of bad features
+ * @param IClass integer class templates to add new protos to
+ * @param Class adapted class templates to add new protos to
+ * @param TempProtoMask proto mask to add new protos to
+ *
+ * Globals: none
+ *
+ * @return Max proto id in class after all protos have been added.
+ */
+PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features,
+                                     int NumBadFeat,
+                                     FEATURE_ID BadFeat[],
+                                     INT_CLASS IClass,
+                                     ADAPT_CLASS Class,
+                                     BIT_VECTOR TempProtoMask) {
+  FEATURE_ID *ProtoStart;
+  FEATURE_ID *ProtoEnd;
+  FEATURE_ID *LastBad;
+  TEMP_PROTO TempProto;
+  PROTO Proto;
+  FEATURE F1, F2;
+  float X1, X2, Y1, Y2;
+  float A1, A2, AngleDelta;
+  float SegmentLength;
+  PROTO_ID Pid;
+
+  // Outer loop: each iteration consumes one run [ProtoStart, ProtoEnd) of
+  // consecutive bad features that form a single straight segment.
+  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
+       ProtoStart < LastBad; ProtoStart = ProtoEnd) {
+    F1 = Features->Features[*ProtoStart];
+    X1 = F1->Params[PicoFeatX];
+    Y1 = F1->Params[PicoFeatY];
+    A1 = F1->Params[PicoFeatDir];
+
+    // Extend the run while direction stays similar and the next feature is
+    // within the growing segment's length of the start point.
+    for (ProtoEnd = ProtoStart + 1,
+         SegmentLength = GetPicoFeatureLength();
+         ProtoEnd < LastBad;
+         ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
+      F2 = Features->Features[*ProtoEnd];
+      X2 = F2->Params[PicoFeatX];
+      Y2 = F2->Params[PicoFeatY];
+      A2 = F2->Params[PicoFeatDir];
+
+      // Angles are circular in [0, 1); wrap the difference to [0, 0.5].
+      AngleDelta = fabs(A1 - A2);
+      if (AngleDelta > 0.5)
+        AngleDelta = 1.0 - AngleDelta;
+
+      if (AngleDelta > matcher_clustering_max_angle_delta ||
+          fabs(X1 - X2) > SegmentLength ||
+          fabs(Y1 - Y2) > SegmentLength)
+        break;
+    }
+
+    F2 = Features->Features[*(ProtoEnd - 1)];
+    X2 = F2->Params[PicoFeatX];
+    Y2 = F2->Params[PicoFeatY];
+    A2 = F2->Params[PicoFeatDir];
+
+    Pid = AddIntProto(IClass);
+    if (Pid == NO_PROTO)
+      return (NO_PROTO);
+
+    TempProto = NewTempProto();
+    Proto = &(TempProto->Proto);
+
+    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
+       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
+       instead of the -0.25 to 0.75 used in baseline normalization */
+    Proto->Length = SegmentLength;
+    Proto->Angle = A1;
+    Proto->X = (X1 + X2) / 2.0;
+    Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
+    FillABC(Proto);
+
+    TempProto->ProtoId = Pid;
+    SET_BIT(TempProtoMask, Pid);
+
+    ConvertProto(Proto, Pid, IClass);
+    AddProtoToProtoPruner(Proto, Pid, IClass,
+                          classify_learning_debug_level >= 2);
+
+    Class->TempProtos = push(Class->TempProtos, TempProto);
+  }
+  return IClass->NumProtos - 1;
+} /* MakeNewTempProtos */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Promotes a temporary config (and the temp protos it uses) to permanent
+ * status, recording the ambiguities found for Blob.
+ *
+ * @param Templates current set of adaptive templates
+ * @param ClassId class containing config to be made permanent
+ * @param ConfigId config to be made permanent
+ * @param Blob current blob being adapted to
+ *
+ * Globals: none
+ */
+void Classify::MakePermanent(ADAPT_TEMPLATES Templates,
+                             CLASS_ID ClassId,
+                             int ConfigId,
+                             TBLOB *Blob) {
+  UNICHAR_ID *Ambigs;
+  TEMP_CONFIG Config;
+  ADAPT_CLASS Class;
+  PROTO_KEY ProtoKey;
+
+  Class = Templates->Class[ClassId];
+  Config = TempConfigFor(Class, ConfigId);
+
+  MakeConfigPermanent(Class, ConfigId);
+  if (Class->NumPermConfigs == 0)
+    Templates->NumPermClasses++;
+  Class->NumPermConfigs++;
+
+  // Initialize permanent config.
+  Ambigs = GetAmbiguities(Blob, ClassId);
+  // NOTE(review): malloc result is not null-checked before dereference;
+  // PERM_CONFIG_STRUCT is C-style allocated because ADAPTED_CONFIG is a union.
+  auto Perm = static_cast<PERM_CONFIG>(malloc(sizeof(PERM_CONFIG_STRUCT)));
+  Perm->Ambigs = Ambigs;
+  Perm->FontinfoId = Config->FontinfoId;
+
+  // Free memory associated with temporary config (since ADAPTED_CONFIG
+  // is a union we need to clean up before we record permanent config).
+  ProtoKey.Templates = Templates;
+  ProtoKey.ClassId = ClassId;
+  ProtoKey.ConfigId = ConfigId;
+  // delete_d makes permanent (and removes) every temp proto used by Config.
+  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
+  FreeTempConfig(Config);
+
+  // Record permanent config.
+  PermConfigFor(Class, ConfigId) = Perm;
+
+  if (classify_learning_debug_level >= 1) {
+    tprintf("Making config %d for %s (ClassId %d) permanent:"
+            " fontinfo id %d, ambiguities '",
+            ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(),
+            ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
+    for (UNICHAR_ID *AmbigsPointer = Ambigs;
+         *AmbigsPointer >= 0; ++AmbigsPointer)
+      tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
+    tprintf("'.\n");
+  }
+} /* MakePermanent */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine converts TempProto to be permanent if
+ * its proto id is used by the configuration specified in
+ * ProtoKey.
+ *
+ * Used as a delete_d callback from Classify::MakePermanent: returning true
+ * also frees the temp proto and removes it from the list.
+ *
+ * @param item1 (TEMP_PROTO) temporary proto to compare to key
+ * @param item2 (PROTO_KEY) defines which protos to make permanent
+ *
+ * Globals: none
+ *
+ * @return true if TempProto is converted, false otherwise
+ */
+int MakeTempProtoPerm(void *item1, void *item2) {
+  ADAPT_CLASS Class;
+  TEMP_CONFIG Config;
+  TEMP_PROTO TempProto;
+  PROTO_KEY *ProtoKey;
+
+  TempProto = static_cast<TEMP_PROTO>(item1);
+  ProtoKey = static_cast<PROTO_KEY *>(item2);
+
+  Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
+  Config = TempConfigFor(Class, ProtoKey->ConfigId);
+
+  // Skip protos that the target config does not reference.
+  if (TempProto->ProtoId > Config->MaxProtoId ||
+      !test_bit (Config->Protos, TempProto->ProtoId))
+    return false;
+
+  MakeProtoPermanent(Class, TempProto->ProtoId);
+  AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId,
+                        ProtoKey->Templates->Templates);
+  FreeTempProto(TempProto);
+
+  return true;
+} /* MakeTempProtoPerm */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine writes the matches in Results to File.
+ *
+ * Debug-only helper: prints each match's unichar debug string followed by
+ * the UnicharRating's own Print() output.
+ *
+ * @param results match results to write to File
+ *
+ * Globals: none
+ */
+void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) {
+  for (int i = 0; i < results.match.size(); ++i) {
+    tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).c_str());
+    results.match[i].Print();
+  }
+} /* PrintAdaptiveMatchResults */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine steps through each matching class in Results
+ * and removes it from the match list if its rating
+ * is worse than the BestRating plus a pad. In other words,
+ * all good matches get moved to the front of the classes
+ * array.
+ *
+ * In numeric mode, letters that commonly confuse with digits ('l' -> '1',
+ * 'O' -> '0') are remapped when the digit itself did not score well, and
+ * other alphabetic matches (except roman-numeral letters) are dropped.
+ *
+ * @param Results contains matches to be filtered
+ *
+ * Globals:
+ * - matcher_bad_match_pad defines a "bad match"
+ */
+void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
+  int Next, NextGood;
+  float BadMatchThreshold;
+  static const char* romans = "i v x I V X";
+  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
+
+  if (classify_bln_numeric_mode) {
+    UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
+        unicharset.unichar_to_id("1") : -1;
+    UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
+        unicharset.unichar_to_id("0") : -1;
+    float scored_one = ScoredUnichar(unichar_id_one, *Results);
+    float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
+
+    // In-place compaction: NextGood trails Next over the surviving entries.
+    for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
+      const UnicharRating& match = Results->match[Next];
+      if (match.rating >= BadMatchThreshold) {
+        if (!unicharset.get_isalpha(match.unichar_id) ||
+            strstr(romans,
+                   unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
+          // Deliberately empty: non-alpha and roman-numeral letters are
+          // kept unchanged.
+        } else if (unicharset.eq(match.unichar_id, "l") &&
+                   scored_one < BadMatchThreshold) {
+          Results->match[Next].unichar_id = unichar_id_one;
+        } else if (unicharset.eq(match.unichar_id, "O") &&
+                   scored_zero < BadMatchThreshold) {
+          Results->match[Next].unichar_id = unichar_id_zero;
+        } else {
+          Results->match[Next].unichar_id = INVALID_UNICHAR_ID;  // Don't copy.
+        }
+        if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
+          if (NextGood == Next) {
+            ++NextGood;
+          } else {
+            Results->match[NextGood++] = Results->match[Next];
+          }
+        }
+      }
+    }
+  } else {
+    // Non-numeric mode: keep everything at or above the threshold.
+    for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
+      if (Results->match[Next].rating >= BadMatchThreshold) {
+        if (NextGood == Next) {
+          ++NextGood;
+        } else {
+          Results->match[NextGood++] = Results->match[Next];
+        }
+      }
+    }
+  }
+  Results->match.resize(NextGood);
+} /* RemoveBadMatches */
+
+/*----------------------------------------------------------------------------*/
+/**
+ * This routine discards extra digits or punctuation from the results.
+ * We keep only the top 2 punctuation answers and the top 1 digit answer if
+ * present.
+ *
+ * @param Results contains matches to be filtered
+ */
+void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
+ int Next, NextGood;
+ int punc_count; /*no of garbage characters */
+ int digit_count;
+ /*garbage characters */
+ // Space-separated lists searched with strstr; assumes single-byte
+ // punctuation/digit unichars so a substring hit means membership.
+ static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
+ static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
+
+ punc_count = 0;
+ digit_count = 0;
+ // Keep at most the first 2 punctuation matches and the first digit match;
+ // survivors are compacted to the front of the array (NextGood trails Next).
+ for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
+ const UnicharRating& match = Results->match[Next];
+ bool keep = true;
+ if (strstr(punc_chars,
+ unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
+ if (punc_count >= 2)
+ keep = false;
+ punc_count++;
+ } else {
+ if (strstr(digit_chars,
+ unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
+ if (digit_count >= 1)
+ keep = false;
+ digit_count++;
+ }
+ }
+ if (keep) {
+ if (NextGood == Next) {
+ ++NextGood;
+ } else {
+ Results->match[NextGood++] = match;
+ }
+ }
+ }
+ Results->match.resize(NextGood);
+} /* RemoveExtraPuncs */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine resets the internal thresholds inside
+ * the integer matcher to correspond to the specified
+ * threshold.
+ *
+ * @param Threshold threshold for creating new templates
+ *
+ * Globals:
+ * - matcher_good_threshold default good match rating
+ */
+void Classify::SetAdaptiveThreshold(float Threshold) {
+ // Map a rating threshold (0=perfect) into the 0-255 integer-matcher scale.
+ // NOTE(review): exact float equality against matcher_good_threshold is
+ // intentional here (the default gets a fixed 0.9), but only works when the
+ // caller passes the parameter value through unmodified — confirm.
+ Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
+ classify_adapt_proto_threshold.set_value(
+ ClipToRange<int>(255 * Threshold, 0, 255));
+ classify_adapt_feature_threshold.set_value(
+ ClipToRange<int>(255 * Threshold, 0, 255));
+} /* SetAdaptiveThreshold */
+
+#ifndef GRAPHICS_DISABLED
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine displays debug information for the best config
+ * of the given shape_id for the given set of features.
+ *
+ * @param shape_id classifier id to work with
+ * @param features features of the unknown character
+ * @param num_features Number of features in the features array.
+ */
+
+void Classify::ShowBestMatchFor(int shape_id,
+ const INT_FEATURE_STRUCT* features,
+ int num_features) {
+ uint32_t config_mask;
+ // Nothing to show if the shape has no built-in templates.
+ if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
+ tprintf("No built-in templates for class/shape %d\n", shape_id);
+ return;
+ }
+ if (num_features <= 0) {
+ tprintf("Illegal blob (char norm features)!\n");
+ return;
+ }
+ UnicharRating cn_result;
+ classify_norm_method.set_value(character);
+ // First pass: match against all protos/configs to find the best config.
+ im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
+ AllProtosOn, AllConfigsOn,
+ num_features, features, &cn_result,
+ classify_adapt_feature_threshold, NO_DEBUG,
+ matcher_debug_separate_windows);
+ tprintf("\n");
+ // Restrict to only the winning config for the debug display.
+ config_mask = 1 << cn_result.config;
+
+ tprintf("Static Shape ID: %d\n", shape_id);
+ ShowMatchDisplay();
+ // Second pass: re-match the best config with debug flags on so the match
+ // is drawn in the debug window.
+ im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn,
+ &config_mask, num_features, features, &cn_result,
+ classify_adapt_feature_threshold, matcher_debug_flags,
+ matcher_debug_separate_windows);
+ UpdateMatchDisplay();
+} /* ShowBestMatchFor */
+
+#endif // !GRAPHICS_DISABLED
+
+// Returns a string for the classifier class_id: either the corresponding
+// unicharset debug_str or the shape_table_ debug str.
+STRING Classify::ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
+ int class_id, int config_id) const {
+ STRING class_string;
+ // Pre-trained templates with a shape table describe shapes, not unichars,
+ // so the shape table provides the more informative debug string.
+ if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
+ int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
+ class_string = shape_table_->DebugStr(shape_id);
+ } else {
+ class_string = unicharset.debug_str(class_id);
+ }
+ return class_string;
+}
+
+// Converts a classifier class_id index to a shape_table_ index
+int Classify::ClassAndConfigIDToFontOrShapeID(int class_id,
+ int int_result_config) const {
+ int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
+ // Older inttemps have no font_ids.
+ if (font_set_id < 0)
+ return kBlankFontinfoId;
+ // Each config within the class maps to one entry of the class's font set.
+ const FontSet &fs = fontset_table_.get(font_set_id);
+ ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
+ return fs.configs[int_result_config];
+}
+
+// Converts a shape_table_ index to a classifier class_id index (not a
+// unichar-id!). Uses a search, so not fast.
+int Classify::ShapeIDToClassID(int shape_id) const {
+ // Linear scan over every class and every config in its font set until the
+ // shape id is found; O(classes * configs), acceptable for debug use.
+ for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
+ int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
+ ASSERT_HOST(font_set_id >= 0);
+ const FontSet &fs = fontset_table_.get(font_set_id);
+ for (int config = 0; config < fs.size; ++config) {
+ if (fs.configs[config] == shape_id)
+ return id;
+ }
+ }
+ // Not found: report and return a sentinel.
+ tprintf("Shape %d not found\n", shape_id);
+ return -1;
+}
+
+// Returns true if the given TEMP_CONFIG is good enough to make it
+// a permanent config.
+bool Classify::TempConfigReliable(CLASS_ID class_id,
+ const TEMP_CONFIG &config) {
+ if (classify_learning_debug_level >= 1) {
+ tprintf("NumTimesSeen for config of %s is %d\n",
+ getDict().getUnicharset().debug_str(class_id).c_str(),
+ config->NumTimesSeen);
+ }
+ // Seen plenty of times: reliable regardless of ambigs.
+ if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
+ return true;
+ } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
+ // Not seen enough times even for the minimum: unreliable.
+ return false;
+ } else if (use_ambigs_for_adaption) {
+ // Go through the ambigs vector and see whether we have already seen
+ // enough times all the characters represented by the ambigs vector.
+ const UnicharIdVector *ambigs =
+ getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
+ int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
+ for (int ambig = 0; ambig < ambigs_size; ++ambig) {
+ ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
+ assert(ambig_class != nullptr);
+ // An ambig with neither a permanent config nor enough sightings
+ // blocks promotion of this config.
+ if (ambig_class->NumPermConfigs == 0 &&
+ ambig_class->MaxNumTimesSeen <
+ matcher_min_examples_for_prototyping) {
+ if (classify_learning_debug_level >= 1) {
+ tprintf("Ambig %s has not been seen enough times,"
+ " not making config for %s permanent\n",
+ getDict().getUnicharset().debug_str(
+ (*ambigs)[ambig]).c_str(),
+ getDict().getUnicharset().debug_str(class_id).c_str());
+ }
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) {
+ // Re-check every class that lists class_id as a reverse ambig: seeing
+ // class_id may have made some of their temp configs reliable.
+ const UnicharIdVector *ambigs =
+ getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
+ int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
+ if (classify_learning_debug_level >= 1) {
+ tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
+ getDict().getUnicharset().debug_str(class_id).c_str(), class_id);
+ }
+ for (int ambig = 0; ambig < ambigs_size; ++ambig) {
+ CLASS_ID ambig_class_id = (*ambigs)[ambig];
+ const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
+ for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
+ // Only temporary configs can be promoted.
+ if (ConfigIsPermanent(ambigs_class, cfg)) continue;
+ const TEMP_CONFIG config =
+ TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
+ if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
+ if (classify_learning_debug_level >= 1) {
+ tprintf("Making config %d of %s permanent\n", cfg,
+ getDict().getUnicharset().debug_str(
+ ambig_class_id).c_str());
+ }
+ MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
+ }
+ }
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/blobclass.cpp b/tesseract/src/classify/blobclass.cpp
new file mode 100644
index 00000000..497ad045
--- /dev/null
+++ b/tesseract/src/classify/blobclass.cpp
@@ -0,0 +1,110 @@
+/******************************************************************************
+ ** Filename: blobclass.c
+ ** Purpose: High level blob classification and training routines.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "blobclass.h"
+
+#include <cstdio>
+
+#include "classify.h"
+#include "featdefs.h"
+#include "mf.h"
+#include "normfeat.h"
+
+namespace tesseract {
+
+static const char kUnknownFontName[] = "UnknownFont";
+
+static STRING_VAR(classify_font_name, kUnknownFontName,
+ "Default font name to be used in training");
+
+/**----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------**/
+
+// Finds the name of the training font and returns it in fontname, by cutting
+// it out based on the expectation that the filename is of the form:
+// /path/to/dir/[lang].[fontname].exp[num]
+// The [lang], [fontname] and [num] fields should not have '.' characters.
+// If the global parameter classify_font_name is set, its value is used instead.
+void ExtractFontName(const char* filename, STRING* fontname) {
+ // The classify_font_name parameter, when set, overrides filename parsing.
+ *fontname = classify_font_name;
+ if (*fontname == kUnknownFontName) {
+ // filename is expected to be of the form [lang].[fontname].exp[num]
+ // The [lang], [fontname] and [num] fields should not have '.' characters.
+ const char *basename = strrchr(filename, '/');
+ // Search for the first dot within the basename only, so dots in the
+ // directory path do not confuse the parse.
+ const char *firstdot = strchr(basename ? basename : filename, '.');
+ const char *lastdot = strrchr(filename, '.');
+ // Need at least two distinct dots to delimit the fontname field.
+ if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
+ ++firstdot;
+ *fontname = firstdot;
+ fontname->truncate_at(lastdot - firstdot);
+ }
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+// Extracts features from the given blob and saves them in the tr_file_data_
+// member variable.
+// fontname: Name of font that this blob was printed in.
+// cn_denorm: Character normalization transformation to apply to the blob.
+// fx_info: Character normalization parameters computed with cn_denorm.
+// blob_text: Ground truth text for the blob.
+void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
+ const DENORM& cn_denorm,
+ const INT_FX_RESULT_STRUCT& fx_info,
+ const char* blob_text) {
+ // Extract all four feature sets in the order defined by feature_defs_:
+ // micro-features, char-norm, int-CN and int-geo features.
+ CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
+ CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
+ CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
+ CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
+ CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
+
+ if (ValidCharDescription(feature_defs_, CharDesc)) {
+ // Label the features with a class name and font name.
+ tr_file_data_ += "\n";
+ tr_file_data_ += fontname;
+ tr_file_data_ += " ";
+ tr_file_data_ += blob_text;
+ tr_file_data_ += "\n";
+
+ // write micro-features to file and clean up
+ WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
+ } else {
+ tprintf("Blob learned was invalid!\n");
+ }
+ FreeCharDescription(CharDesc);
+} // LearnBlob
+
+// Writes stored training data to a .tr file based on the given filename.
+// Returns false on error.
+bool Classify::WriteTRFile(const char* filename) {
+ bool result = false;
+ // Output path is the given filename with a ".tr" suffix appended.
+ std::string tr_filename = filename;
+ tr_filename += ".tr";
+ FILE* fp = fopen(tr_filename.c_str(), "wb");
+ if (fp) {
+ result =
+ tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
+ fclose(fp);
+ }
+ // The buffer is cleared even on failure, so data is written at most once.
+ tr_file_data_.truncate_at(0);
+ return result;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/blobclass.h b/tesseract/src/classify/blobclass.h
new file mode 100644
index 00000000..94532fc9
--- /dev/null
+++ b/tesseract/src/classify/blobclass.h
@@ -0,0 +1,39 @@
+/******************************************************************************
+ ** Filename: blobclass.h
+ ** Purpose: Interface to high level classification and training.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef BLOBCLASS_H
+#define BLOBCLASS_H
+
+/**----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------**/
+#include "strngs.h"
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+namespace tesseract {
+// Finds the name of the training font and returns it in fontname, by cutting
+// it out based on the expectation that the filename is of the form:
+// /path/to/dir/[lang].[fontname].exp[num]
+// The [lang], [fontname] and [num] fields should not have '.' characters.
+// If the global parameter classify_font_name is set, its value is used instead.
+void ExtractFontName(const char* filename, STRING* fontname);
+
+} // namespace tesseract.
+
+#endif
diff --git a/tesseract/src/classify/classify.cpp b/tesseract/src/classify/classify.cpp
new file mode 100644
index 00000000..939036d0
--- /dev/null
+++ b/tesseract/src/classify/classify.cpp
@@ -0,0 +1,230 @@
+///////////////////////////////////////////////////////////////////////
+// File: classify.cpp
+// Description: classify class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "classify.h"
+
+#ifdef DISABLED_LEGACY_ENGINE
+
+#include <string.h>
+
+namespace tesseract {
+
+// Minimal constructor used when the legacy engine is compiled out
+// (DISABLED_LEGACY_ENGINE): registers only the handful of parameters the
+// LSTM-only build still reads, and constructs the dictionary.
+Classify::Classify()
+ :
+ INT_MEMBER(classify_debug_level, 0, "Classify debug level",
+ this->params()),
+
+ BOOL_MEMBER(classify_bln_numeric_mode, 0,
+"Assume the input is numbers [0-9].", this->params()),
+
+ double_MEMBER(classify_max_rating_ratio, 1.5,
+ "Veto ratio between classifier ratings", this->params()),
+
+ double_MEMBER(classify_max_certainty_margin, 5.5,
+ "Veto difference between classifier certainties",
+ this->params()),
+
+ dict_(this) {}
+
+// Trivial: the minimal (DISABLED_LEGACY_ENGINE) build owns no extra resources.
+Classify::~Classify() {}
+
+} // namespace tesseract
+
+#else // DISABLED_LEGACY_ENGINE not defined
+
+#include "fontinfo.h"
+#include "intproto.h"
+#include "mfoutline.h"
+#include "scrollview.h"
+#include "shapeclassifier.h"
+#include "shapetable.h"
+#include "unicity_table.h"
+#include <cstring>
+
+namespace tesseract {
+// Full constructor: registers every classifier tuning parameter with the
+// params() registry (name, default, description), then wires up the integer
+// matcher, font-table cleanup callbacks, and feature definitions.
+Classify::Classify()
+ : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
+ this->params()),
+ BOOL_MEMBER(prioritize_division, false,
+ "Prioritize blob division over chopping", this->params()),
+ BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
+ this->params()),
+ INT_MEMBER(classify_debug_level, 0, "Classify debug level",
+ this->params()),
+ INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
+ this->params()),
+ double_MEMBER(classify_char_norm_range, 0.2,
+ "Character Normalization Range ...", this->params()),
+ double_MEMBER(classify_max_rating_ratio, 1.5,
+ "Veto ratio between classifier ratings", this->params()),
+ double_MEMBER(classify_max_certainty_margin, 5.5,
+ "Veto difference between classifier certainties",
+ this->params()),
+ BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
+ this->params()),
+ BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
+ this->params()),
+ BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
+ "Enable adaptive classifier", this->params()),
+ BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
+ "Use pre-adapted classifier templates", this->params()),
+ BOOL_MEMBER(classify_save_adapted_templates, 0,
+ "Save adapted templates to a file", this->params()),
+ BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
+ this->params()),
+ BOOL_MEMBER(classify_nonlinear_norm, 0,
+ "Non-linear stroke-density normalization", this->params()),
+ INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
+ INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
+ INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
+ this->params()),
+ double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
+ this->params()),
+ double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
+ this->params()),
+ double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
+ this->params()),
+ double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
+ this->params()),
+ double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
+ this->params()),
+ double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
+ this->params()),
+ INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
+ this->params()),
+ INT_MEMBER(matcher_min_examples_for_prototyping, 3,
+ "Reliable Config Threshold", this->params()),
+ INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
+ "Enable adaption even if the ambiguities have not been seen",
+ this->params()),
+ double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
+ "Maximum angle delta for prototype clustering",
+ this->params()),
+ double_MEMBER(classify_misfit_junk_penalty, 0.0,
+ "Penalty to apply when a non-alnum is vertically out of "
+ "its expected textline position",
+ this->params()),
+ double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
+ double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
+ this->params()),
+ double_MEMBER(tessedit_class_miss_scale, 0.00390625,
+ "Scale factor for features not used", this->params()),
+ double_MEMBER(
+ classify_adapted_pruning_factor, 2.5,
+ "Prune poor adapted results this much worse than best result",
+ this->params()),
+ double_MEMBER(classify_adapted_pruning_threshold, -1.0,
+ "Threshold at which classify_adapted_pruning_factor starts",
+ this->params()),
+ INT_MEMBER(classify_adapt_proto_threshold, 230,
+ "Threshold for good protos during adaptive 0-255",
+ this->params()),
+ INT_MEMBER(classify_adapt_feature_threshold, 230,
+ "Threshold for good features during adaptive 0-255",
+ this->params()),
+ BOOL_MEMBER(disable_character_fragments, true,
+ "Do not include character fragments in the"
+ " results of the classifier",
+ this->params()),
+ double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
+ -3.0,
+ "Exclude fragments that do not look like whole"
+ " characters from training and adaption",
+ this->params()),
+ BOOL_MEMBER(classify_debug_character_fragments, false,
+ "Bring up graphical debugging windows for fragments training",
+ this->params()),
+ BOOL_MEMBER(matcher_debug_separate_windows, false,
+ "Use two different windows for debugging the matching: "
+ "One for the protos and one for the features.",
+ this->params()),
+ STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
+ this->params()),
+ INT_MEMBER(classify_class_pruner_threshold, 229,
+ "Class Pruner Threshold 0-255", this->params()),
+ INT_MEMBER(classify_class_pruner_multiplier, 15,
+ "Class Pruner Multiplier 0-255: ", this->params()),
+ INT_MEMBER(classify_cp_cutoff_strength, 7,
+ "Class Pruner CutoffStrength: ", this->params()),
+ INT_MEMBER(classify_integer_matcher_multiplier, 10,
+ "Integer Matcher Multiplier 0-255: ", this->params()),
+ BOOL_MEMBER(classify_bln_numeric_mode, 0,
+ "Assume the input is numbers [0-9].", this->params()),
+ double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
+ this->params()),
+ double_MEMBER(speckle_rating_penalty, 10.0,
+ "Penalty to add to worst rating for noise", this->params()),
+ im_(&classify_debug_level),
+ dict_(this) {
+ using namespace std::placeholders; // for _1, _2
+ // Ensure font tables free their element data when cleared.
+ fontinfo_table_.set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
+ fontset_table_.set_clear_callback(std::bind(FontSetDeleteCallback, _1));
+
+ InitFeatureDefs(&feature_defs_);
+}
+
+Classify::~Classify() {
+ // Release adaptive-classifier state first, then the debug windows
+ // (deleting a null window pointer is harmless).
+ EndAdaptiveClassifier();
+ delete learn_debug_win_;
+ delete learn_fragmented_word_debug_win_;
+ delete learn_fragments_debug_win_;
+}
+
+
+// Takes ownership of the given classifier, and uses it for future calls
+// to CharNormClassifier.
+void Classify::SetStaticClassifier(ShapeClassifier* static_classifier) {
+ // Replace (and free) any previously installed classifier; ownership of the
+ // new one transfers to this Classify instance.
+ delete static_classifier_;
+ static_classifier_ = static_classifier;
+}
+
+// Moved from speckle.cpp
+// Adds a noise classification result that is a bit worse than the worst
+// current result, or the worst possible result if no current results.
+void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
+ BLOB_CHOICE_IT bc_it(choices);
+ // If there is no classifier result, we will use the worst possible certainty
+ // and corresponding rating.
+ float certainty = -getDict().certainty_scale;
+ float rating = rating_scale * blob_length;
+ if (!choices->empty() && blob_length > 0) {
+ bc_it.move_to_last();
+ BLOB_CHOICE* worst_choice = bc_it.data();
+ // Add speckle_rating_penalty to worst rating, matching old value.
+ rating = worst_choice->rating() + speckle_rating_penalty;
+ // Compute the rating to correspond to the certainty. (Used to be kept
+ // the same, but that messes up the language model search.)
+ certainty = -rating * getDict().certainty_scale /
+ (rating_scale * blob_length);
+ }
+ // Append the speckle choice as a space, flagged as coming from the
+ // speckle classifier, at the end (worst position) of the list.
+ auto* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
+ -1, 0.0f, FLT_MAX, 0,
+ BCC_SPECKLE_CLASSIFIER);
+ bc_it.add_to_end(blob_choice);
+}
+
+// Returns true if the blob is small enough to be a large speckle.
+bool Classify::LargeSpeckle(const TBLOB &blob) {
+ // Speckle limit is a fraction of the baseline-normalized x-height;
+ // both dimensions must be under it.
+ double speckle_size = kBlnXHeight * speckle_large_max_size;
+ TBOX bbox = blob.bounding_box();
+ return bbox.width() < speckle_size && bbox.height() < speckle_size;
+}
+
+} // namespace tesseract
+
+#endif // def DISABLED_LEGACY_ENGINE
diff --git a/tesseract/src/classify/classify.h b/tesseract/src/classify/classify.h
new file mode 100644
index 00000000..44e0a77b
--- /dev/null
+++ b/tesseract/src/classify/classify.h
@@ -0,0 +1,583 @@
+///////////////////////////////////////////////////////////////////////
+// File: classify.h
+// Description: classify class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CLASSIFY_CLASSIFY_H_
+#define TESSERACT_CLASSIFY_CLASSIFY_H_
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#ifdef DISABLED_LEGACY_ENGINE
+
+#include "ccstruct.h"
+#include "dict.h"
+
+namespace tesseract {
+
+// Minimal stand-in for the full Classify class, used when the legacy
+// classifier is compiled out (DISABLED_LEGACY_ENGINE): it keeps only the
+// dictionary and the few parameters the rest of the code still references.
+class Classify : public CCStruct {
+ public:
+ Classify();
+ virtual ~Classify();
+ // Accessor for the dictionary owned by this instance.
+ virtual Dict& getDict() {
+ return dict_;
+ }
+
+ // Member variables.
+
+ INT_VAR_H(classify_debug_level, 0, "Classify debug level");
+
+ BOOL_VAR_H(classify_bln_numeric_mode, 0,
+ "Assume the input is numbers [0-9].");
+
+ double_VAR_H(classify_max_rating_ratio, 1.5,
+ "Veto ratio between classifier ratings");
+
+ double_VAR_H(classify_max_certainty_margin, 5.5,
+ "Veto difference between classifier certainties");
+
+ private:
+ Dict dict_;
+};
+
+} // namespace tesseract
+
+
+#else // DISABLED_LEGACY_ENGINE not defined
+
+#include "adaptive.h"
+#include "ccstruct.h"
+#include "dict.h"
+#include "featdefs.h"
+#include "fontinfo.h"
+#include "imagedata.h"
+#include "intfx.h"
+#include "intmatcher.h"
+#include "normalis.h"
+#include "ratngs.h"
+#include "ocrfeatures.h"
+#include "unicity_table.h"
+
+namespace tesseract {
+
+class ScrollView;
+class WERD_CHOICE;
+class WERD_RES;
+struct ADAPT_RESULTS;
+struct NORM_PROTOS;
+
+static const int kUnknownFontinfoId = -1;
+static const int kBlankFontinfoId = -2;
+
+class ShapeClassifier;
+struct ShapeRating;
+class ShapeTable;
+struct UnicharRating;
+
+// How segmented is a blob. In this enum, character refers to a classifiable
+// unit, but that is too long and character is usually easier to understand.
+enum CharSegmentationType {
+ CST_FRAGMENT, // A partial character.
+ CST_WHOLE, // A correctly segmented character.
+ CST_IMPROPER, // Improperly segmented: spans more than one character but
+ // less than two, i.e. parts of two adjacent characters.
+ CST_NGRAM // Multiple characters.
+};
+
+class TESS_API Classify : public CCStruct {
+ public:
+ Classify();
+ ~Classify() override;
+ virtual Dict& getDict() {
+ return dict_;
+ }
+
+ const ShapeTable* shape_table() const {
+ return shape_table_;
+ }
+
+ // Takes ownership of the given classifier, and uses it for future calls
+ // to CharNormClassifier.
+ void SetStaticClassifier(ShapeClassifier* static_classifier);
+
+ // Adds a noise classification result that is a bit worse than the worst
+ // current result, or the worst possible result if no current results.
+ void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);
+
+ // Returns true if the blob is small enough to be a large speckle.
+ bool LargeSpeckle(const TBLOB &blob);
+
+ /* adaptive.cpp ************************************************************/
+ ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
+ int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId);
+ // Runs the class pruner from int_templates on the given features, returning
+ // the number of classes output in results.
+ // int_templates Class pruner tables
+ // num_features Number of features in blob
+ // features Array of features
+ // normalization_factors (input) Array of int_templates->NumClasses fudge
+ // factors from blob normalization process.
+ // (Indexed by CLASS_INDEX)
+ // expected_num_features (input) Array of int_templates->NumClasses
+ // expected number of features for each class.
+ // (Indexed by CLASS_INDEX)
+ // results (output) Sorted Array of pruned classes.
+ // Array must be sized to take the maximum possible
+ // number of outputs : int_templates->NumClasses.
+ int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates, int num_features,
+ int keep_this, const INT_FEATURE_STRUCT* features,
+ const uint8_t* normalization_factors,
+ const uint16_t* expected_num_features,
+ std::vector<CP_RESULT_STRUCT>* results);
+ void ReadNewCutoffs(TFile* fp, uint16_t* Cutoffs);
+ void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
+ void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
+ ADAPT_TEMPLATES ReadAdaptedTemplates(TFile* File);
+ /* normmatch.cpp ************************************************************/
+ float ComputeNormMatch(CLASS_ID ClassId,
+ const FEATURE_STRUCT& feature, bool DebugMatch);
+ void FreeNormProtos();
+ NORM_PROTOS* ReadNormProtos(TFile* fp);
+ /* protos.cpp ***************************************************************/
+ void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
+ INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
+ const UNICHARSET& target_unicharset);
+ /* adaptmatch.cpp ***********************************************************/
+
+ // Learns the given word using its chopped_word, seam_array, denorm,
+ // box_word, best_state, and correct_text to learn both correctly and
+ // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
+ // is called and the data will be saved in an internal buffer.
+ // Otherwise AdaptToBlob is called for adaption within a document.
+ void LearnWord(const char* fontname, WERD_RES* word);
+
+ // Builds a blob of length fragments, from the word, starting at start,
+ // and then learns it, as having the given correct_text.
+ // If fontname is not nullptr, then LearnBlob is called and the data will be
+ // saved in an internal buffer for static training.
+ // Otherwise AdaptToBlob is called for adaption within a document.
+ // threshold is a magic number required by AdaptToChar and generated by
+ // ComputeAdaptionThresholds.
+ // Although it can be partly inferred from the string, segmentation is
+ // provided to explicitly clarify the character segmentation.
+ void LearnPieces(const char* fontname, int start, int length, float threshold,
+ CharSegmentationType segmentation, const char* correct_text,
+ WERD_RES* word);
+ void InitAdaptiveClassifier(TessdataManager* mgr);
+ void InitAdaptedClass(TBLOB *Blob,
+ CLASS_ID ClassId,
+ int FontinfoId,
+ ADAPT_CLASS Class,
+ ADAPT_TEMPLATES Templates);
+ void AmbigClassifier(const std::vector<INT_FEATURE_STRUCT>& int_features,
+ const INT_FX_RESULT_STRUCT& fx_info,
+ const TBLOB *blob,
+ INT_TEMPLATES templates,
+ ADAPT_CLASS *classes,
+ UNICHAR_ID *ambiguities,
+ ADAPT_RESULTS *results);
+ void MasterMatcher(INT_TEMPLATES templates,
+ int16_t num_features,
+ const INT_FEATURE_STRUCT* features,
+ const uint8_t* norm_factors,
+ ADAPT_CLASS* classes,
+ int debug,
+ int matcher_multiplier,
+ const TBOX& blob_box,
+ const std::vector<CP_RESULT_STRUCT>& results,
+ ADAPT_RESULTS* final_results);
+ // Converts configs to fonts, and if the result is not adapted, and a
+ // shape_table_ is present, the shape is expanded to include all
+ // unichar_ids represented, before applying a set of corrections to the
+ // distance rating in int_result, (see ComputeCorrectedRating.)
+ // The results are added to the final_results output.
+ void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes,
+ bool debug,
+ int class_id,
+ int bottom, int top,
+ float cp_rating,
+ int blob_length,
+ int matcher_multiplier,
+ const uint8_t* cn_factors,
+ UnicharRating* int_result,
+ ADAPT_RESULTS* final_results);
+ // Applies a set of corrections to the distance im_rating,
+ // including the cn_correction, miss penalty and additional penalty
+ // for non-alnums being vertical misfits. Returns the corrected distance.
+ double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
+ double im_rating, int feature_misses,
+ int bottom, int top,
+ int blob_length, int matcher_multiplier,
+ const uint8_t* cn_factors);
+ void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
+ ADAPT_RESULTS *Results,
+ BLOB_CHOICE_LIST *Choices);
+ void AddNewResult(const UnicharRating& new_result, ADAPT_RESULTS *results);
+ int GetAdaptiveFeatures(TBLOB *Blob,
+ INT_FEATURE_ARRAY IntFeatures,
+ FEATURE_SET *FloatFeatures);
+
+#ifndef GRAPHICS_DISABLED
+ void DebugAdaptiveClassifier(TBLOB *Blob,
+ ADAPT_RESULTS *Results);
+#endif
+ PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
+ int NumBadFeat,
+ FEATURE_ID BadFeat[],
+ INT_CLASS IClass,
+ ADAPT_CLASS Class,
+ BIT_VECTOR TempProtoMask);
+ int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
+ CLASS_ID ClassId,
+ int FontinfoId,
+ int NumFeatures,
+ INT_FEATURE_ARRAY Features,
+ FEATURE_SET FloatFeatures);
+ void MakePermanent(ADAPT_TEMPLATES Templates,
+ CLASS_ID ClassId,
+ int ConfigId,
+ TBLOB *Blob);
+ void PrintAdaptiveMatchResults(const ADAPT_RESULTS& results);
+ void RemoveExtraPuncs(ADAPT_RESULTS *Results);
+ void RemoveBadMatches(ADAPT_RESULTS *Results);
+ void SetAdaptiveThreshold(float Threshold);
+ void ShowBestMatchFor(int shape_id,
+ const INT_FEATURE_STRUCT* features,
+ int num_features);
+ // Returns a string for the classifier class_id: either the corresponding
+ // unicharset debug_str or the shape_table_ debug str.
+ STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
+ int class_id, int config_id) const;
+ // Converts a classifier class_id index with a config ID to:
+ // shape_table_ present: a shape_table_ index OR
+ // No shape_table_: a font ID.
+ // Without shape training, each class_id, config pair represents a single
+ // unichar id/font combination, so this function looks up the corresponding
+ // font id.
+ // With shape training, each class_id, config pair represents a single
+ // shape table index, so the fontset_table stores the shape table index,
+ // and the shape_table_ must be consulted to obtain the actual unichar_id/
+ // font combinations that the shape represents.
+ int ClassAndConfigIDToFontOrShapeID(int class_id,
+ int int_result_config) const;
+ // Converts a shape_table_ index to a classifier class_id index (not a
+ // unichar-id!). Uses a search, so not fast.
+ int ShapeIDToClassID(int shape_id) const;
+ UNICHAR_ID *BaselineClassifier(
+ TBLOB *Blob, const std::vector<INT_FEATURE_STRUCT>& int_features,
+ const INT_FX_RESULT_STRUCT& fx_info,
+ ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results);
+ int CharNormClassifier(TBLOB *blob,
+ const TrainingSample& sample,
+ ADAPT_RESULTS *adapt_results);
+
+ // As CharNormClassifier, but operates on a TrainingSample and outputs to
+ // a GenericVector of ShapeRating without conversion to classes.
+ int CharNormTrainingSample(bool pruner_only, int keep_this,
+ const TrainingSample& sample,
+ std::vector<UnicharRating>* results);
+ UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);
+ void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);
+ void AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
+ float Threshold, ADAPT_TEMPLATES adaptive_templates);
+ void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
+ bool AdaptableWord(WERD_RES* word);
+ void EndAdaptiveClassifier();
+ void SettupPass1();
+ void SettupPass2();
+ void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices);
+ void ClassifyAsNoise(ADAPT_RESULTS *Results);
+ void ResetAdaptiveClassifierInternal();
+ void SwitchAdaptiveClassifier();
+ void StartBackupAdaptiveClassifier();
+
+ int GetCharNormFeature(const INT_FX_RESULT_STRUCT& fx_info,
+ INT_TEMPLATES templates,
+ uint8_t* pruner_norm_array,
+ uint8_t* char_norm_array);
+ // Computes the char_norm_array for the unicharset and, if not nullptr, the
+ // pruner_array as appropriate according to the existence of the shape_table.
+ // The norm_feature is deleted as it is almost certainly no longer needed.
+ void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
+ INT_TEMPLATES_STRUCT* templates,
+ uint8_t* char_norm_array,
+ uint8_t* pruner_array);
+
+ bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
+ void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
+
+ bool AdaptiveClassifierIsFull() const { return NumAdaptationsFailed > 0; }
+ bool AdaptiveClassifierIsEmpty() const {
+ return AdaptedTemplates->NumPermClasses == 0;
+ }
+ bool LooksLikeGarbage(TBLOB *blob);
+ void RefreshDebugWindow(ScrollView **win, const char *msg,
+ int y_offset, const TBOX &wbox);
+ // intfx.cpp
+ // Computes the DENORMS for bl(baseline) and cn(character) normalization
+ // during feature extraction. The input denorm describes the current state
+ // of the blob, which is usually a baseline-normalized word.
+ // The Transforms setup are as follows:
+ // Baseline Normalized (bl) Output:
+ // We center the grapheme by aligning the x-coordinate of its centroid with
+ // x=128 and leaving the already-baseline-normalized y as-is.
+ //
+ // Character Normalized (cn) Output:
+ // We align the grapheme's centroid at the origin and scale it
+ // asymmetrically in x and y so that the 2nd moments are a standard value
+ // (51.2) i.e. the result is vaguely square.
+ // If classify_nonlinear_norm is true:
+ // A non-linear normalization is setup that attempts to evenly distribute
+ // edges across x and y.
+ //
+ // Some of the fields of fx_info are also setup:
+ // Length: Total length of outline.
+ // Rx: Rounded y second moment. (Reversed by convention.)
+ // Ry: rounded x second moment.
+ // Xmean: Rounded x center of mass of the blob.
+ // Ymean: Rounded y center of mass of the blob.
+ static void SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm,
+ DENORM* bl_denorm, DENORM* cn_denorm,
+ INT_FX_RESULT_STRUCT* fx_info);
+
+ // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
+ // (x,y) position and angle as measured counterclockwise from the vector
+ // <-1, 0>, from blob using two normalizations defined by bl_denorm and
+ // cn_denorm. See SetupBLCNDenorms for definitions.
+ // If outline_cn_counts is not nullptr, on return it contains the cumulative
+ // number of cn features generated for each outline in the blob (in order).
+ // Thus after the first outline, there were (*outline_cn_counts)[0] features,
+ // after the second outline, there were (*outline_cn_counts)[1] features etc.
+ static void ExtractFeatures(const TBLOB& blob,
+ bool nonlinear_norm,
+ std::vector<INT_FEATURE_STRUCT>* bl_features,
+ std::vector<INT_FEATURE_STRUCT>* cn_features,
+ INT_FX_RESULT_STRUCT* results,
+ GenericVector<int>* outline_cn_counts);
+ /* float2int.cpp ************************************************************/
+ void ClearCharNormArray(uint8_t* char_norm_array);
+ void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
+ uint8_t* char_norm_array);
+ void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
+ /* intproto.cpp *************************************************************/
+ INT_TEMPLATES ReadIntTemplates(TFile* fp);
+ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
+ const UNICHARSET& target_unicharset);
+ CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
+ bool* pretrained_on, int* shape_id);
+ void ShowMatchDisplay();
+ /* font detection ***********************************************************/
+ UnicityTable<FontInfo>& get_fontinfo_table() {
+ return fontinfo_table_;
+ }
+ const UnicityTable<FontInfo>& get_fontinfo_table() const {
+ return fontinfo_table_;
+ }
+ UnicityTable<FontSet>& get_fontset_table() {
+ return fontset_table_;
+ }
+ /* mfoutline.cpp ***********************************************************/
+ void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale);
+ /* outfeat.cpp ***********************************************************/
+ FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
+ /* picofeat.cpp ***********************************************************/
+ FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
+ FEATURE_SET ExtractIntCNFeatures(const TBLOB& blob,
+ const INT_FX_RESULT_STRUCT& fx_info);
+ FEATURE_SET ExtractIntGeoFeatures(const TBLOB& blob,
+ const INT_FX_RESULT_STRUCT& fx_info);
+ /* blobclass.cpp ***********************************************************/
+ // Extracts features from the given blob and saves them in the tr_file_data_
+ // member variable.
+ // fontname: Name of font that this blob was printed in.
+ // cn_denorm: Character normalization transformation to apply to the blob.
+ // fx_info: Character normalization parameters computed with cn_denorm.
+ // blob_text: Ground truth text for the blob.
+ void LearnBlob(const STRING& fontname, TBLOB* Blob, const DENORM& cn_denorm,
+ const INT_FX_RESULT_STRUCT& fx_info, const char* blob_text);
+ // Writes stored training data to a .tr file based on the given filename.
+ // Returns false on error.
+ bool WriteTRFile(const char* filename);
+
+ // Member variables.
+
+ // Parameters.
+ // Set during training (in lang.config) to indicate whether the divisible
+ // blobs chopper should be used (true for latin script.)
+ BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping");
+ // Set during training (in lang.config) to indicate whether the divisible
+ // blobs chopper should be used in preference to chopping. Set to true for
+ // southern Indic scripts.
+ BOOL_VAR_H(prioritize_division, false,
+ "Prioritize blob division over chopping");
+ BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
+ INT_VAR_H(classify_debug_level, 0, "Classify debug level");
+
+ /* mfoutline.cpp ***********************************************************/
+ /* control knobs used to control normalization of outlines */
+ INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
+ double_VAR_H(classify_char_norm_range, 0.2,
+ "Character Normalization Range ...");
+ double_VAR_H(classify_max_rating_ratio, 1.5,
+ "Veto ratio between classifier ratings");
+ double_VAR_H(classify_max_certainty_margin, 5.5,
+ "Veto difference between classifier certainties");
+
+ /* adaptmatch.cpp ***********************************************************/
+ BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
+ BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
+ BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
+ BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
+ "Use pre-adapted classifier templates");
+ BOOL_VAR_H(classify_save_adapted_templates, 0,
+ "Save adapted templates to a file");
+ BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
+ BOOL_VAR_H(classify_nonlinear_norm, 0,
+ "Non-linear stroke-density normalization");
+ INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
+ INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
+ INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
+ double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
+ double_VAR_H(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)");
+ double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
+ double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
+ double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
+ double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
+ INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
+ INT_VAR_H(matcher_min_examples_for_prototyping, 3,
+ "Reliable Config Threshold");
+ INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
+ "Enable adaption even if the ambiguities have not been seen");
+ double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
+ "Maximum angle delta for prototype clustering");
+ double_VAR_H(classify_misfit_junk_penalty, 0.0,
+ "Penalty to apply when a non-alnum is vertically out of "
+ "its expected textline position");
+ double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
+ double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
+ double_VAR_H(tessedit_class_miss_scale, 0.00390625,
+ "Scale factor for features not used");
+ double_VAR_H(classify_adapted_pruning_factor, 2.5,
+ "Prune poor adapted results this much worse than best result");
+ double_VAR_H(classify_adapted_pruning_threshold, -1.0,
+ "Threshold at which classify_adapted_pruning_factor starts");
+ INT_VAR_H(classify_adapt_proto_threshold, 230,
+ "Threshold for good protos during adaptive 0-255");
+ INT_VAR_H(classify_adapt_feature_threshold, 230,
+ "Threshold for good features during adaptive 0-255");
+ BOOL_VAR_H(disable_character_fragments, true,
+ "Do not include character fragments in the"
+ " results of the classifier");
+ double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0,
+ "Exclude fragments that do not match any whole character"
+ " with at least this certainty");
+ BOOL_VAR_H(classify_debug_character_fragments, false,
+ "Bring up graphical debugging windows for fragments training");
+ BOOL_VAR_H(matcher_debug_separate_windows, false,
+ "Use two different windows for debugging the matching: "
+ "One for the protos and one for the features.");
+ STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
+
+ /* intmatcher.cpp **********************************************************/
+ INT_VAR_H(classify_class_pruner_threshold, 229,
+ "Class Pruner Threshold 0-255");
+ INT_VAR_H(classify_class_pruner_multiplier, 15,
+ "Class Pruner Multiplier 0-255: ");
+ INT_VAR_H(classify_cp_cutoff_strength, 7,
+ "Class Pruner CutoffStrength: ");
+ INT_VAR_H(classify_integer_matcher_multiplier, 10,
+ "Integer Matcher Multiplier 0-255: ");
+
+ BOOL_VAR_H(classify_bln_numeric_mode, 0,
+ "Assume the input is numbers [0-9].");
+ double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size");
+ double_VAR_H(speckle_rating_penalty, 10.0,
+ "Penalty to add to worst rating for noise");
+
+ // Use class variables to hold onto built-in templates and adapted templates.
+ INT_TEMPLATES PreTrainedTemplates = nullptr;
+ ADAPT_TEMPLATES AdaptedTemplates = nullptr;
+ // The backup adapted templates are created from the previous page (only)
+ // so they are always ready and reasonably well trained if the primary
+ // adapted templates become full.
+ ADAPT_TEMPLATES BackupAdaptedTemplates = nullptr;
+
+ // Create dummy proto and config masks for use with the built-in templates.
+ BIT_VECTOR AllProtosOn = nullptr;
+ BIT_VECTOR AllConfigsOn = nullptr;
+ BIT_VECTOR AllConfigsOff = nullptr;
+ BIT_VECTOR TempProtoMask = nullptr;
+ /* normmatch.cpp */
+ NORM_PROTOS* NormProtos = nullptr;
+ /* font detection ***********************************************************/
+ UnicityTable<FontInfo> fontinfo_table_;
+ // Without shape training, each class_id, config pair represents a single
+ // unichar id/font combination, so each fontset_table_ entry holds font ids
+ // for each config in the class.
+ // With shape training, each class_id, config pair represents a single
+ // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
+ // and the shape_table_ must be consulted to obtain the actual unichar_id/
+ // font combinations that the shape represents.
+ UnicityTable<FontSet> fontset_table_;
+
+ protected:
+ IntegerMatcher im_;
+ FEATURE_DEFS_STRUCT feature_defs_;
+ // If a shape_table_ is present, it is used to remap classifier output in
+ // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
+ // mean an index to the shape_table_ and the choices returned are *all* the
+ // shape_table_ entries at that index.
+ ShapeTable* shape_table_ = nullptr;
+
+ private:
+ // The currently active static classifier.
+ ShapeClassifier* static_classifier_ = nullptr;
+ ScrollView* learn_debug_win_ = nullptr;
+ ScrollView* learn_fragmented_word_debug_win_ = nullptr;
+ ScrollView* learn_fragments_debug_win_ = nullptr;
+
+ // Training data gathered here for all the images in a document.
+ STRING tr_file_data_;
+
+ Dict dict_;
+
+ GenericVector<uint16_t> shapetable_cutoffs_;
+
+ /* variables used to hold performance statistics */
+ int NumAdaptationsFailed = 0;
+
+ // Expected number of features in the class pruner, used to penalize
+ // unknowns that have too few features (like a c being classified as e) so
+ // it doesn't recognize everything as '@' or '#'.
+ // CharNormCutoffs is for the static classifier (with no shapetable).
+ // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
+ // value in the adaptive classifier. Both are indexed by unichar_id.
+ // shapetable_cutoffs_ provides a similar value for each shape in the
+ // shape_table_
+ uint16_t CharNormCutoffs[MAX_NUM_CLASSES];
+ uint16_t BaselineCutoffs[MAX_NUM_CLASSES];
+
+ public:
+ bool EnableLearning = true;
+};
+
+} // namespace tesseract
+
+#endif // DISABLED_LEGACY_ENGINE
+
+#endif // TESSERACT_CLASSIFY_CLASSIFY_H_
diff --git a/tesseract/src/classify/cluster.cpp b/tesseract/src/classify/cluster.cpp
new file mode 100644
index 00000000..25b2776d
--- /dev/null
+++ b/tesseract/src/classify/cluster.cpp
@@ -0,0 +1,2425 @@
+/******************************************************************************
+ ** Filename: cluster.cpp
+ ** Purpose: Routines for clustering points in N-D space
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#define _USE_MATH_DEFINES // for M_PI
+
+#include "cluster.h"
+
+#include "genericheap.h"
+#include "kdpair.h"
+#include "matrix.h"
+#include "tprintf.h"
+
+#include "helpers.h"
+
+#include <cfloat> // for FLT_MAX
+#include <cmath> // for M_PI
+#include <vector> // for std::vector
+
+namespace tesseract {
+
+#define HOTELLING 1 // If true use Hotelling's test to decide where to split.
+#define FTABLE_X 10 // Size of FTable.
+#define FTABLE_Y 100 // Size of FTable.
+
+// Table of values approximating the cumulative F-distribution for a confidence of 1%.
+const double FTable[FTABLE_Y][FTABLE_X] = {
+ {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
+ {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
+ {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
+ {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
+ {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
+ {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
+ {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
+ {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
+ {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
+ {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
+ { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
+ { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
+ { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
+ { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
+ { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
+ { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
+ { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
+ { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
+ { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
+ { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
+ { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
+ { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
+ { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
+ { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
+ { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
+ { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
+ { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
+ { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
+ { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
+ { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
+ { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
+ { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
+ { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
+ { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
+ { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
+ { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
+ { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
+ { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
+ { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
+ { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
+ { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
+ { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
+ { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
+ { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
+ { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
+ { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
+ { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
+ { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
+ { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
+ { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
+ { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
+ { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
+ { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
+ { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
+ { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
+ { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
+ { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
+ { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
+ { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
+ { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
+ { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
+ { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
+ { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
+ { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
+ { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
+ { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
+ { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
+ { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
+ { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
+ { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
+ { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
+ { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
+ { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
+ { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
+ { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
+ { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
+ { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
+ { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
+ { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
+ { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
+ { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
+ { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
+ { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
+ { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
+ { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
+ { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
+ { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
+ { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
+ { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
+ { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
+ { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
+ { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
+ { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
+ { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
+ { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
+ { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
+ { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
+ { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
+ { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
+ { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
+};
+
+/** define the variance which will be used as a minimum variance for any
+ dimension of any feature. Since most features are calculated from numbers
+ with a precision no better than 1 in 128, the variance should never be
+ less than the square of this number for parameters whose range is 1. */
+#define MINVARIANCE 0.0004
+
+/** define the absolute minimum number of samples which must be present in
+ order to accurately test hypotheses about underlying probability
+ distributions. Define separately the minimum samples that are needed
+ before a statistical analysis is attempted; this number should be
+ equal to MINSAMPLES but can be set to a lower number for early testing
+ when very few samples are available. */
+#define MINSAMPLESPERBUCKET 5
+#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
+#define MINSAMPLESNEEDED 1
+
+/** define the size of the table which maps normalized samples to
+ histogram buckets. Also define the number of standard deviations
+ in a normal distribution which are considered to be significant.
+ The mapping table will be defined in such a way that it covers
+ the specified number of standard deviations on either side of
+ the mean. BUCKETTABLESIZE should always be even. */
+#define BUCKETTABLESIZE 1024
+#define NORMALEXTENT 3.0
+
+struct TEMPCLUSTER {
+  CLUSTER *Cluster;   // candidate cluster considered for merging (held in ClusterHeap, best on top)
+  CLUSTER *Neighbor;  // presumably its nearest neighbor, i.e. the would-be merge partner — confirm in CreateClusterTree
+};
+
+using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER*>;
+using ClusterHeap = tesseract::GenericHeap<ClusterPair>;
+
+struct STATISTICS {
+  float AvgVariance;  // average variance over the dimensions — assumed from name; filled by ComputeStatistics (body not visible here)
+  float *CoVariance;  // covariance matrix of the cluster's samples — TODO(review): confirm size/layout in ComputeStatistics
+  float *Min; // largest negative distance from the mean
+  float *Max; // largest positive distance from the mean
+};
+
+struct BUCKETS {
+ DISTRIBUTION Distribution; // distribution being tested for
+ uint32_t SampleCount; // # of samples in histogram
+ double Confidence; // confidence level of test
+ double ChiSquared; // test threshold
+ uint16_t NumberOfBuckets; // number of cells in histogram
+ uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
+ uint32_t *Count; // frequency of occurrence histogram
+ float *ExpectedCount; // expected histogram
+};
+
+struct CHISTRUCT{
+  uint16_t DegreesOfFreedom;  // degrees of freedom of the chi-squared distribution
+  double Alpha;               // NOTE(review): looks like the significance level (tail probability) being solved for — confirm at SOLVEFUNC call sites
+  double ChiSquared;          // chi-squared value associated with Alpha at this DegreesOfFreedom
+};
+
+// For use with KDWalk / MakePotentialClusters
+struct ClusteringContext {
+ ClusterHeap *heap; // heap used to hold temp clusters, "best" on top
+ TEMPCLUSTER *candidates; // array of potential clusters
+ KDTREE *tree; // kd-tree to be searched for neighbors
+ int32_t next; // next candidate to be used
+};
+
+using DENSITYFUNC = double (*)(int32_t);
+using SOLVEFUNC = double (*)(CHISTRUCT*, double);
+
+#define Odd(N) ((N)%2)                     // nonzero iff N is odd
+#define Mirror(N,R) ((R) - (N) - 1)        // index N reflected within a range of size R
+#define Abs(N) (((N) < 0) ? (-(N)) : (N))  // absolute value (beware double-evaluation of N)
+
+//--------------Global Data Definitions and Declarations----------------------
+/** the following variables describe a discrete normal distribution
+ which is used by NormalDensity() and NormalBucket(). The
+ constant NORMALEXTENT determines how many standard
+ deviations of the distribution are mapped onto the fixed
+ discrete range of x. x=0 is mapped to -NORMALEXTENT standard
+ deviations and x=BUCKETTABLESIZE is mapped to
+ +NORMALEXTENT standard deviations. */
+#define SqrtOf2Pi 2.506628275
+static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
+static const double kNormalVariance =
+ (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
+static const double kNormalMagnitude =
+ (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
+static const double kNormalMean = BUCKETTABLESIZE / 2;
+
+/** define lookup tables used to compute the number of histogram buckets
+ that should be used for a given number of samples. */
+#define LOOKUPTABLESIZE 8
+#define MAXDEGREESOFFREEDOM MAXBUCKETS
+
+static const uint32_t kCountTable[LOOKUPTABLESIZE] = {
+ MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
+}; // number of samples
+
+static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
+ MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
+}; // number of buckets
+
/*-------------------------------------------------------------------------
          Private Function Prototypes
--------------------------------------------------------------------------*/
// --- Cluster-tree construction -------------------------------------------
static void CreateClusterTree(CLUSTERER* Clusterer);

static void MakePotentialClusters(ClusteringContext* context, CLUSTER* Cluster,
                                  int32_t Level);

static CLUSTER* FindNearestNeighbor(KDTREE*Tree, CLUSTER* Cluster,
                                    float* Distance);

static CLUSTER* MakeNewCluster(CLUSTERER* Clusterer, TEMPCLUSTER* TempCluster);

// --- Prototype generation from the cluster tree --------------------------
static void ComputePrototypes(CLUSTERER* Clusterer, CLUSTERCONFIG* Config);

static PROTOTYPE* MakePrototype(CLUSTERER* Clusterer, CLUSTERCONFIG* Config,
                                CLUSTER* Cluster);

static PROTOTYPE* MakeDegenerateProto(uint16_t N,
                                      CLUSTER* Cluster, STATISTICS* Statistics,
                                      PROTOSTYLE Style, int32_t MinSamples);

static PROTOTYPE* TestEllipticalProto(CLUSTERER* Clusterer,
                                      CLUSTERCONFIG* Config, CLUSTER* Cluster,
                                      STATISTICS* Statistics);

static PROTOTYPE* MakeSphericalProto(CLUSTERER* Clusterer,
                                     CLUSTER* Cluster, STATISTICS* Statistics,
                                     BUCKETS* Buckets);

static PROTOTYPE* MakeEllipticalProto(CLUSTERER* Clusterer,
                                      CLUSTER* Cluster, STATISTICS* Statistics,
                                      BUCKETS* Buckets);

static PROTOTYPE* MakeMixedProto(CLUSTERER* Clusterer,
                                 CLUSTER* Cluster, STATISTICS* Statistics,
                                 BUCKETS* NormalBuckets, double Confidence);

static void MakeDimRandom(uint16_t i, PROTOTYPE* Proto, PARAM_DESC* ParamDesc);

static void MakeDimUniform(uint16_t i, PROTOTYPE* Proto, STATISTICS* Statistics);

// --- Statistics and prototype allocation ---------------------------------
static STATISTICS* ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[],
                                     CLUSTER* Cluster);

static PROTOTYPE* NewSphericalProto(uint16_t N, CLUSTER* Cluster,
                                    STATISTICS* Statistics);

static PROTOTYPE* NewEllipticalProto(int16_t N, CLUSTER* Cluster,
                                     STATISTICS* Statistics);

static PROTOTYPE* NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE* NewSimpleProto(int16_t N, CLUSTER *Cluster);

static bool Independent(PARAM_DESC* ParamDesc,
                        int16_t N, float* CoVariance, float Independence);

// --- Histogram buckets used for goodness-of-fit testing ------------------
static BUCKETS *GetBuckets(CLUSTERER* clusterer,
                           DISTRIBUTION Distribution,
                           uint32_t SampleCount,
                           double Confidence);

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
                            uint32_t SampleCount,
                            double Confidence);

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);

static double NormalDensity(int32_t x);

static double UniformDensity(int32_t x);

static double Integral(double f1, double f2, double Dx);

static void FillBuckets(BUCKETS *Buckets,
                        CLUSTER *Cluster,
                        uint16_t Dim,
                        PARAM_DESC *ParamDesc,
                        float Mean,
                        float StdDev);

static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
                             float x,
                             float Mean,
                             float StdDev);

static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
                              float x,
                              float Mean,
                              float StdDev);

static bool DistributionOK(BUCKETS* Buckets);

// --- Memory management and misc helpers ----------------------------------
static void FreeStatistics(STATISTICS *Statistics);

static void FreeBuckets(BUCKETS *Buckets);

static void FreeCluster(CLUSTER *Cluster);

static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);

static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);

static void InitBuckets(BUCKETS *Buckets);

static int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,
                      void *arg2);  // CHISTRUCT *SearchKey);

static CHISTRUCT *NewChiStruct(uint16_t DegreesOfFreedom, double Alpha);

// --- Numerical utilities --------------------------------------------------
static double Solve(SOLVEFUNC Function,
                    void *FunctionParams,
                    double InitialGuess,
                    double Accuracy);

static double ChiArea(CHISTRUCT *ChiParams, double x);

static bool MultipleCharSamples(CLUSTERER* Clusterer,
                                CLUSTER* Cluster,
                                float MaxIllegal);

static double InvertMatrix(const float* input, int size, float* inv);
+
+//--------------------------Public Code--------------------------------------
+/**
+ * This routine creates a new clusterer data structure,
+ * initializes it, and returns a pointer to it.
+ *
+ * @param SampleSize number of dimensions in feature space
+ * @param ParamDesc description of each dimension
+ * @return pointer to the new clusterer data structure
+ */
+CLUSTERER *
+MakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
+ CLUSTERER *Clusterer;
+ int i;
+
+ // allocate main clusterer data structure and init simple fields
+ Clusterer = static_cast<CLUSTERER *>(malloc (sizeof (CLUSTERER)));
+ Clusterer->SampleSize = SampleSize;
+ Clusterer->NumberOfSamples = 0;
+ Clusterer->NumChar = 0;
+
+ // init fields which will not be used initially
+ Clusterer->Root = nullptr;
+ Clusterer->ProtoList = NIL_LIST;
+
+ // maintain a copy of param descriptors in the clusterer data structure
+ Clusterer->ParamDesc =
+ static_cast<PARAM_DESC *>(malloc (SampleSize * sizeof (PARAM_DESC)));
+ for (i = 0; i < SampleSize; i++) {
+ Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
+ Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
+ Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
+ Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
+ Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
+ Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
+ Clusterer->ParamDesc[i].MidRange =
+ (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
+ }
+
+ // allocate a kd tree to hold the samples
+ Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
+
+ // Initialize cache of histogram buckets to minimize recomputing them.
+ for (auto & d : Clusterer->bucket_cache) {
+ for (auto & c : d)
+ c = nullptr;
+ }
+
+ return Clusterer;
+} // MakeClusterer
+
+/**
+ * This routine creates a new sample data structure to hold
+ * the specified feature. This sample is added to the clusterer
+ * data structure (so that it knows which samples are to be
+ * clustered later), and a pointer to the sample is returned to
+ * the caller.
+ *
+ * @param Clusterer clusterer data structure to add sample to
+ * @param Feature feature to be added to clusterer
+ * @param CharID unique ident. of char that sample came from
+ *
+ * @return Pointer to the new sample data structure
+ */
+SAMPLE* MakeSample(CLUSTERER * Clusterer, const float* Feature,
+ int32_t CharID) {
+ SAMPLE *Sample;
+ int i;
+
+ // see if the samples have already been clustered - if so trap an error
+ // Can't add samples after they have been clustered.
+ ASSERT_HOST(Clusterer->Root == nullptr);
+
+ // allocate the new sample and initialize it
+ Sample = static_cast<SAMPLE *>(malloc (sizeof (SAMPLE) +
+ (Clusterer->SampleSize -
+ 1) * sizeof (float)));
+ Sample->Clustered = false;
+ Sample->Prototype = false;
+ Sample->SampleCount = 1;
+ Sample->Left = nullptr;
+ Sample->Right = nullptr;
+ Sample->CharID = CharID;
+
+ for (i = 0; i < Clusterer->SampleSize; i++)
+ Sample->Mean[i] = Feature[i];
+
+ // add the sample to the KD tree - keep track of the total # of samples
+ Clusterer->NumberOfSamples++;
+ KDStore(Clusterer->KDTree, Sample->Mean, Sample);
+ if (CharID >= Clusterer->NumChar)
+ Clusterer->NumChar = CharID + 1;
+
+ // execute hook for monitoring clustering operation
+ // (*SampleCreationHook)(Sample);
+
+ return (Sample);
+} // MakeSample
+
+/**
+ * This routine first checks to see if the samples in this
+ * clusterer have already been clustered before; if so, it does
+ * not bother to recreate the cluster tree. It simply recomputes
+ * the prototypes based on the new Config info.
+ *
+ * If the samples have not been clustered before, the
+ * samples in the KD tree are formed into a cluster tree and then
+ * the prototypes are computed from the cluster tree.
+ *
+ * In either case this routine returns a pointer to a
+ * list of prototypes that best represent the samples given
+ * the constraints specified in Config.
+ *
+ * @param Clusterer data struct containing samples to be clustered
+ * @param Config parameters which control clustering process
+ *
+ * @return Pointer to a list of prototypes
+ */
+LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
+ //only create cluster tree if samples have never been clustered before
+ if (Clusterer->Root == nullptr)
+ CreateClusterTree(Clusterer);
+
+ //deallocate the old prototype list if one exists
+ FreeProtoList (&Clusterer->ProtoList);
+ Clusterer->ProtoList = NIL_LIST;
+
+ //compute prototypes starting at the root node in the tree
+ ComputePrototypes(Clusterer, Config);
+ // We don't need the cluster pointers in the protos any more, so null them
+ // out, which makes it safe to delete the clusterer.
+ LIST proto_list = Clusterer->ProtoList;
+ iterate(proto_list) {
+ auto *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
+ proto->Cluster = nullptr;
+ }
+ return Clusterer->ProtoList;
+} // ClusterSamples
+
+/**
+ * This routine frees all of the memory allocated to the
+ * specified data structure. It will not, however, free
+ * the memory used by the prototype list. The pointers to
+ * the clusters for each prototype in the list will be set
+ * to nullptr to indicate that the cluster data structures no
+ * longer exist. Any sample lists that have been obtained
+ * via calls to GetSamples are no longer valid.
+ * @param Clusterer pointer to data structure to be freed
+ */
+void FreeClusterer(CLUSTERER *Clusterer) {
+ if (Clusterer != nullptr) {
+ free(Clusterer->ParamDesc);
+ if (Clusterer->KDTree != nullptr)
+ FreeKDTree (Clusterer->KDTree);
+ if (Clusterer->Root != nullptr)
+ FreeCluster (Clusterer->Root);
+ // Free up all used buckets structures.
+ for (auto & d : Clusterer->bucket_cache) {
+ for (auto & c : d)
+ if (c != nullptr)
+ FreeBuckets(c);
+ }
+
+ free(Clusterer);
+ }
+} // FreeClusterer
+
+/**
+ * This routine frees all of the memory allocated to the
+ * specified list of prototypes. The clusters which are
+ * pointed to by the prototypes are not freed.
+ * @param ProtoList pointer to list of prototypes to be freed
+ */
+void FreeProtoList(LIST *ProtoList) {
+ destroy_nodes(*ProtoList, FreePrototype);
+} // FreeProtoList
+
+/**
+ * This routine deallocates the memory consumed by the specified
+ * prototype and modifies the corresponding cluster so that it
+ * is no longer marked as a prototype. The cluster is NOT
+ * deallocated by this routine.
+ * @param arg prototype data structure to be deallocated
+ */
+void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
+ auto *Prototype = static_cast<PROTOTYPE *>(arg);
+
+ // unmark the corresponding cluster (if there is one
+ if (Prototype->Cluster != nullptr)
+ Prototype->Cluster->Prototype = false;
+
+ // deallocate the prototype statistics and then the prototype itself
+ free(Prototype->Distrib);
+ free(Prototype->Mean);
+ if (Prototype->Style != spherical) {
+ free(Prototype->Variance.Elliptical);
+ free(Prototype->Magnitude.Elliptical);
+ free(Prototype->Weight.Elliptical);
+ }
+ free(Prototype);
+} // FreePrototype
+
+/**
+ * This routine is used to find all of the samples which
+ * belong to a cluster. It starts by removing the top
+ * cluster on the cluster list (SearchState). If this cluster is
+ * a leaf it is returned. Otherwise, the right subcluster
+ * is pushed on the list and we continue the search in the
+ * left subcluster. This continues until a leaf is found.
+ * If all samples have been found, nullptr is returned.
+ * InitSampleSearch() must be called
+ * before NextSample() to initialize the search.
+ * @param SearchState ptr to list containing clusters to be searched
+ * @return Pointer to the next leaf cluster (sample) or nullptr.
+ */
+CLUSTER *NextSample(LIST *SearchState) {
+ CLUSTER *Cluster;
+
+ if (*SearchState == NIL_LIST)
+ return (nullptr);
+ Cluster = reinterpret_cast<CLUSTER *>first_node (*SearchState);
+ *SearchState = pop (*SearchState);
+ for (;;) {
+ if (Cluster->Left == nullptr)
+ return (Cluster);
+ *SearchState = push (*SearchState, Cluster->Right);
+ Cluster = Cluster->Left;
+ }
+} // NextSample
+
+/**
+ * This routine returns the mean of the specified
+ * prototype in the indicated dimension.
+ * @param Proto prototype to return mean of
+ * @param Dimension dimension whose mean is to be returned
+ * @return Mean of Prototype in Dimension
+ */
+float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
+ return (Proto->Mean[Dimension]);
+} // Mean
+
+/**
+ * This routine returns the standard deviation of the
+ * prototype in the indicated dimension.
+ * @param Proto prototype to return standard deviation of
+ * @param Dimension dimension whose stddev is to be returned
+ * @return Standard deviation of Prototype in Dimension
+ */
+float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
+ switch (Proto->Style) {
+ case spherical:
+ return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Spherical))));
+ case elliptical:
+ return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
+ case mixed:
+ switch (Proto->Distrib[Dimension]) {
+ case normal:
+ return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
+ case uniform:
+ case D_random:
+ return (Proto->Variance.Elliptical[Dimension]);
+ case DISTRIBUTION_COUNT:
+ ASSERT_HOST(!"Distribution count not allowed!");
+ }
+ }
+ return 0.0f;
+} // StandardDeviation
+
+
+/*---------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
/**
 * This routine performs a bottoms-up clustering on the samples
 * held in the kd-tree of the Clusterer data structure. The
 * result is a cluster tree. Each node in the tree represents
 * a cluster which conceptually contains a subset of the samples.
 * More precisely, the cluster contains all of the samples which
 * are contained in its two sub-clusters. The leaves of the
 * tree are the individual samples themselves; they have no
 * sub-clusters. The root node of the tree conceptually contains
 * all of the samples.
 * The Clusterer data structure is changed.
 * @param Clusterer data structure holdings samples to be clustered
 */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;
  TEMPCLUSTER *PotentialCluster;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  // one candidate slot per sample is sufficient: a candidate is only
  // consumed when its sample gets a neighbor (see MakePotentialClusters)
  context.candidates = static_cast<TEMPCLUSTER *>(malloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER)));
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, reinterpret_cast<void_proc>(MakePotentialClusters), &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      // HeapEntry.key() receives the new distance, so the re-pushed
      // entry is ordered by the distance to the replacement neighbor
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                              &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster =
          MakeNewCluster(Clusterer, PotentialCluster);
      // the merged cluster itself becomes a candidate for further merging
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                              &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *>RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  FreeKDTree(context.tree);
  Clusterer->KDTree = nullptr;
  delete context.heap;
  free(context.candidates);
} // CreateClusterTree
+
+/**
+ * This routine is designed to be used in concert with the
+ * KDWalk routine. It will create a potential cluster for
+ * each sample in the kd-tree that is being walked. This
+ * potential cluster will then be pushed on the heap.
+ * @param context ClusteringContext (see definition above)
+ * @param Cluster current cluster being visited in kd-tree walk
+ * @param Level level of this cluster in the kd-tree
+ */
+static void MakePotentialClusters(ClusteringContext* context,
+ CLUSTER* Cluster, int32_t /*Level*/) {
+ ClusterPair HeapEntry;
+ int next = context->next;
+ context->candidates[next].Cluster = Cluster;
+ HeapEntry.data() = &(context->candidates[next]);
+ context->candidates[next].Neighbor =
+ FindNearestNeighbor(context->tree,
+ context->candidates[next].Cluster,
+ &HeapEntry.key());
+ if (context->candidates[next].Neighbor != nullptr) {
+ context->heap->Push(&HeapEntry);
+ context->next++;
+ }
+} // MakePotentialClusters
+
/**
 * This routine searches the specified kd-tree for the nearest
 * neighbor of the specified cluster. It actually uses the
 * kd routines to find the 2 nearest neighbors since one of them
 * will be the original cluster. A pointer to the nearest
 * neighbor is returned, if it can be found, otherwise nullptr is
 * returned. The distance between the 2 nodes is placed
 * in the specified variable.
 * @param Tree kd-tree to search in for nearest neighbor
 * @param Cluster cluster whose nearest neighbor is to be found
 * @param Distance ptr to variable to report distance found
 * @return Pointer to the nearest neighbor of Cluster, or nullptr
 */
static CLUSTER*
FindNearestNeighbor(KDTREE* Tree, CLUSTER* Cluster, float* Distance)
#define MAXNEIGHBORS 2
#define MAXDISTANCE FLT_MAX
{
  CLUSTER *Neighbor[MAXNEIGHBORS];
  float Dist[MAXNEIGHBORS];
  int NumberOfNeighbors;
  int32_t i;
  CLUSTER *BestNeighbor;

  // find the 2 nearest neighbors of the cluster
  KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE,
                          &NumberOfNeighbors, reinterpret_cast<void **>(Neighbor), Dist);

  // search for the nearest neighbor that is not the cluster itself
  // (the search is centered on Cluster's own mean, so Cluster is
  // normally one of the results returned)
  *Distance = MAXDISTANCE;
  BestNeighbor = nullptr;
  for (i = 0; i < NumberOfNeighbors; i++) {
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
      *Distance = Dist[i];
      BestNeighbor = Neighbor[i];
    }
  }
  return BestNeighbor;
} // FindNearestNeighbor
+
+/**
+ * This routine creates a new permanent cluster from the
+ * clusters specified in TempCluster. The 2 clusters in
+ * TempCluster are marked as "clustered" and deleted from
+ * the kd-tree. The new cluster is then added to the kd-tree.
+ * @param Clusterer current clustering environment
+ * @param TempCluster potential cluster to make permanent
+ * @return Pointer to the new permanent cluster
+ */
+static CLUSTER* MakeNewCluster(CLUSTERER* Clusterer,
+ TEMPCLUSTER* TempCluster) {
+ CLUSTER *Cluster;
+
+ // allocate the new cluster and initialize it
+ Cluster = static_cast<CLUSTER *>(malloc(
+ sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(float)));
+ Cluster->Clustered = false;
+ Cluster->Prototype = false;
+ Cluster->Left = TempCluster->Cluster;
+ Cluster->Right = TempCluster->Neighbor;
+ Cluster->CharID = -1;
+
+ // mark the old clusters as "clustered" and delete them from the kd-tree
+ Cluster->Left->Clustered = true;
+ Cluster->Right->Clustered = true;
+ KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
+ KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
+
+ // compute the mean and sample count for the new cluster
+ Cluster->SampleCount =
+ MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
+ Cluster->Left->SampleCount, Cluster->Right->SampleCount,
+ Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
+
+ // add the new cluster to the KD tree
+ KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
+ return Cluster;
+} // MakeNewCluster
+
+/**
+ * This routine merges two clusters into one larger cluster.
+ * To do this it computes the number of samples in the new
+ * cluster and the mean of the new cluster. The ParamDesc
+ * information is used to ensure that circular dimensions
+ * are handled correctly.
+ * @param N # of dimensions (size of arrays)
+ * @param ParamDesc array of dimension descriptions
+ * @param n1, n2 number of samples in each old cluster
+ * @param m array to hold mean of new cluster
+ * @param m1, m2 arrays containing means of old clusters
+ * @return The number of samples in the new cluster.
+ */
+int32_t MergeClusters(int16_t N,
+ PARAM_DESC ParamDesc[],
+ int32_t n1,
+ int32_t n2,
+ float m[],
+ float m1[], float m2[]) {
+ int32_t i, n;
+
+ n = n1 + n2;
+ for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
+ if (ParamDesc->Circular) {
+ // if distance between means is greater than allowed
+ // reduce upper point by one "rotation" to compute mean
+ // then normalize the mean back into the accepted range
+ if ((*m2 - *m1) > ParamDesc->HalfRange) {
+ *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
+ if (*m < ParamDesc->Min)
+ *m += ParamDesc->Range;
+ }
+ else if ((*m1 - *m2) > ParamDesc->HalfRange) {
+ *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
+ if (*m < ParamDesc->Min)
+ *m += ParamDesc->Range;
+ }
+ else
+ *m = (n1 * *m1 + n2 * *m2) / n;
+ }
+ else
+ *m = (n1 * *m1 + n2 * *m2) / n;
+ }
+ return n;
+} // MergeClusters
+
+/**
+ * This routine decides which clusters in the cluster tree
+ * should be represented by prototypes, forms a list of these
+ * prototypes, and places the list in the Clusterer data
+ * structure.
+ * @param Clusterer data structure holding cluster tree
+ * @param Config parameters used to control prototype generation
+ */
+static void ComputePrototypes(CLUSTERER* Clusterer, CLUSTERCONFIG* Config) {
+ LIST ClusterStack = NIL_LIST;
+ CLUSTER *Cluster;
+ PROTOTYPE *Prototype;
+
+ // use a stack to keep track of clusters waiting to be processed
+ // initially the only cluster on the stack is the root cluster
+ if (Clusterer->Root != nullptr)
+ ClusterStack = push (NIL_LIST, Clusterer->Root);
+
+ // loop until we have analyzed all clusters which are potential prototypes
+ while (ClusterStack != NIL_LIST) {
+ // remove the next cluster to be analyzed from the stack
+ // try to make a prototype from the cluster
+ // if successful, put it on the proto list, else split the cluster
+ Cluster = reinterpret_cast<CLUSTER *>first_node (ClusterStack);
+ ClusterStack = pop (ClusterStack);
+ Prototype = MakePrototype(Clusterer, Config, Cluster);
+ if (Prototype != nullptr) {
+ Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
+ }
+ else {
+ ClusterStack = push (ClusterStack, Cluster->Right);
+ ClusterStack = push (ClusterStack, Cluster->Left);
+ }
+ }
+} // ComputePrototypes
+
/**
 * This routine attempts to create a prototype from the
 * specified cluster that conforms to the distribution
 * specified in Config. If there are too few samples in the
 * cluster to perform a statistical analysis, then a prototype
 * is generated but labelled as insignificant. If the
 * dimensions of the cluster are not independent, no prototype
 * is generated and nullptr is returned. If a prototype can be
 * found that matches the desired distribution then a pointer
 * to it is returned, otherwise nullptr is returned.
 * @param Clusterer data structure holding cluster tree
 * @param Config parameters used to control prototype generation
 * @param Cluster cluster to be made into a prototype
 * @return Pointer to new prototype or nullptr
 */
static PROTOTYPE* MakePrototype(CLUSTERER* Clusterer, CLUSTERCONFIG* Config,
                                CLUSTER* Cluster) {
  STATISTICS *Statistics;
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
    return nullptr;

  // compute the covariance matrix and ranges for the cluster
  // (Statistics is owned here and freed on every exit path below)
  Statistics =
      ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(
      Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
      static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    FreeStatistics(Statistics);
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
                   Statistics->CoVariance, Config->Independence)) {
    FreeStatistics(Statistics);
    return nullptr;
  }

  // optionally accept the cluster via Hotelling's T-squared test before
  // falling back to the histogram-based distribution tests
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      FreeStatistics(Statistics);
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  // (Buckets comes from the clusterer's cache and is not freed here)
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
                       Config->Confidence);

  // create a prototype based on the statistics and test it
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
                             Config->Confidence);
      break;
    case automatic:
      // try the simplest model first and fall back to richer ones
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr)
        break;
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr)
        break;
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
                             Config->Confidence);
      break;
  }
  FreeStatistics(Statistics);
  return Proto;
} // MakePrototype
+
+/**
+ * This routine checks for clusters which are degenerate and
+ * therefore cannot be analyzed in a statistically valid way.
+ * A cluster is defined as degenerate if it does not have at
+ * least MINSAMPLESNEEDED samples in it. If the cluster is
+ * found to be degenerate, a prototype of the specified style
+ * is generated and marked as insignificant. A cluster is
+ * also degenerate if it does not have at least MinSamples
+ * samples in it.
+ *
+ * If the cluster is not degenerate, nullptr is returned.
+ *
+ * @param N number of dimensions
+ * @param Cluster cluster being analyzed
+ * @param Statistics statistical info about cluster
+ * @param Style type of prototype to be generated
+ * @param MinSamples minimum number of samples in a cluster
+ * @return Pointer to degenerate prototype or nullptr.
+ */
+static PROTOTYPE* MakeDegenerateProto( //this was MinSample
+ uint16_t N,
+ CLUSTER *Cluster,
+ STATISTICS *Statistics,
+ PROTOSTYLE Style,
+ int32_t MinSamples) {
+ PROTOTYPE *Proto = nullptr;
+
+ if (MinSamples < MINSAMPLESNEEDED)
+ MinSamples = MINSAMPLESNEEDED;
+
+ if (Cluster->SampleCount < MinSamples) {
+ switch (Style) {
+ case spherical:
+ Proto = NewSphericalProto (N, Cluster, Statistics);
+ break;
+ case elliptical:
+ case automatic:
+ Proto = NewEllipticalProto (N, Cluster, Statistics);
+ break;
+ case mixed:
+ Proto = NewMixedProto (N, Cluster, Statistics);
+ break;
+ }
+ Proto->Significant = false;
+ }
+ return (Proto);
+} // MakeDegenerateProto
+
/**
 * This routine tests the specified cluster to see if
 * there is a statistically significant difference between
 * the sub-clusters that would be made if the cluster were to
 * be split. If not, then a new prototype is formed and
 * returned to the caller. If there is, then nullptr is returned
 * to the caller.
 * @param Clusterer data struct containing samples being clustered
 * @param Config provides the magic number of samples that make a good cluster
 * @param Cluster cluster to be made into an elliptical prototype
 * @param Statistics statistical info about cluster
 * @return Pointer to new elliptical prototype or nullptr.
 */
static PROTOTYPE* TestEllipticalProto(CLUSTERER* Clusterer,
                                      CLUSTERCONFIG *Config, CLUSTER* Cluster,
                                      STATISTICS* Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER* Left = Cluster->Left;
  CLUSTER* Right = Cluster->Right;
  if (Left == nullptr || Right == nullptr)
    return nullptr;
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // too few samples for the F test to be meaningful
  if (TotalDims < N + 1 || TotalDims < 2)
    return nullptr;
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced by identity entries so the
  // matrix remains invertible.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential)
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        else
          Covariance[j + row_offset] = 0.0f;
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j)
          Covariance[j + row_offset] = 1.0f;
        else
          Covariance[j + row_offset] = 0.0f;
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Difference of the sub-cluster means over the essential dimensions.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared (Delta' * Inverse * Delta).
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  // Convert T-squared into an F statistic, clipping the degrees of
  // freedom to the bounds of the FTable lookup.
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
  int Fx = EssentialN;
  if (Fx > FTABLE_X)
    Fx = FTABLE_X;
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y)
    Fy = FTABLE_Y;
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 &&
      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the critical value means no significant split: keep the
  // cluster together as a single elliptical prototype.
  if (F < FTarget) {
    return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
+
+/**
+ * This routine tests the specified cluster to see if it can
+ * be approximated by a spherical normal distribution. If it
+ * can be, then a new prototype is formed and returned to the
+ * caller. If it can't be, then nullptr is returned to the caller.
+ * @param Clusterer data struct containing samples being clustered
+ * @param Cluster cluster to be made into a spherical prototype
+ * @param Statistics statistical info about cluster
+ * @param Buckets histogram struct used to analyze distribution
+ * @return Pointer to new spherical prototype or nullptr.
+ */
+static PROTOTYPE* MakeSphericalProto(CLUSTERER* Clusterer,
+ CLUSTER* Cluster, STATISTICS* Statistics,
+ BUCKETS* Buckets) {
+ PROTOTYPE *Proto = nullptr;
+ int i;
+
+ // check that each dimension is a normal distribution
+ for (i = 0; i < Clusterer->SampleSize; i++) {
+ if (Clusterer->ParamDesc[i].NonEssential)
+ continue;
+
+ FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
+ Cluster->Mean[i],
+ sqrt (static_cast<double>(Statistics->AvgVariance)));
+ if (!DistributionOK (Buckets))
+ break;
+ }
+ // if all dimensions matched a normal distribution, make a proto
+ if (i >= Clusterer->SampleSize)
+ Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
+ return (Proto);
+} // MakeSphericalProto
+
+/**
+ * This routine tests the specified cluster to see if it can
+ * be approximated by an elliptical normal distribution. If it
+ * can be, then a new prototype is formed and returned to the
+ * caller. If it can't be, then nullptr is returned to the caller.
+ * @param Clusterer data struct containing samples being clustered
+ * @param Cluster cluster to be made into an elliptical prototype
+ * @param Statistics statistical info about cluster
+ * @param Buckets histogram struct used to analyze distribution
+ * @return Pointer to new elliptical prototype or nullptr.
+ */
+static PROTOTYPE* MakeEllipticalProto(CLUSTERER* Clusterer,
+ CLUSTER* Cluster, STATISTICS* Statistics,
+ BUCKETS* Buckets) {
+ PROTOTYPE *Proto = nullptr;
+ int i;
+
+ // check that each dimension is a normal distribution
+ for (i = 0; i < Clusterer->SampleSize; i++) {
+ if (Clusterer->ParamDesc[i].NonEssential)
+ continue;
+
+ FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
+ Cluster->Mean[i],
+ sqrt (static_cast<double>(Statistics->
+ CoVariance[i * (Clusterer->SampleSize + 1)])));
+ if (!DistributionOK (Buckets))
+ break;
+ }
+ // if all dimensions matched a normal distribution, make a proto
+ if (i >= Clusterer->SampleSize)
+ Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
+ return (Proto);
+} // MakeEllipticalProto
+
+/**
+ * This routine tests each dimension of the specified cluster to
+ * see what distribution would best approximate that dimension.
+ * Each dimension is compared to the following distributions
+ * in order: normal, random, uniform. If each dimension can
+ * be represented by one of these distributions,
+ * then a new prototype is formed and returned to the
+ * caller. If it can't be, then nullptr is returned to the caller.
+ * @param Clusterer data struct containing samples being clustered
+ * @param Cluster cluster to be made into a prototype
+ * @param Statistics statistical info about cluster
+ * @param NormalBuckets histogram struct used to analyze distribution
+ * @param Confidence confidence level for alternate distributions
+ * @return Pointer to new mixed prototype or nullptr.
+ */
+static PROTOTYPE* MakeMixedProto(CLUSTERER* Clusterer,
+ CLUSTER* Cluster, STATISTICS* Statistics,
+ BUCKETS* NormalBuckets, double Confidence) {
+ PROTOTYPE *Proto;
+ int i;
+ BUCKETS *UniformBuckets = nullptr;
+ BUCKETS *RandomBuckets = nullptr;
+
+ // create a mixed proto to work on - initially assume all dimensions normal*/
+ Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
+
+ // find the proper distribution for each dimension
+ for (i = 0; i < Clusterer->SampleSize; i++) {
+ if (Clusterer->ParamDesc[i].NonEssential)
+ continue;
+
+ FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
+ Proto->Mean[i],
+ sqrt (static_cast<double>(Proto->Variance.Elliptical[i])));
+ if (DistributionOK (NormalBuckets))
+ continue;
+
+ if (RandomBuckets == nullptr)
+ RandomBuckets =
+ GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
+ MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
+ FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
+ Proto->Mean[i], Proto->Variance.Elliptical[i]);
+ if (DistributionOK (RandomBuckets))
+ continue;
+
+ if (UniformBuckets == nullptr)
+ UniformBuckets =
+ GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
+ MakeDimUniform(i, Proto, Statistics);
+ FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
+ Proto->Mean[i], Proto->Variance.Elliptical[i]);
+ if (DistributionOK (UniformBuckets))
+ continue;
+ break;
+ }
+ // if any dimension failed to match a distribution, discard the proto
+ if (i < Clusterer->SampleSize) {
+ FreePrototype(Proto);
+ Proto = nullptr;
+ }
+ return (Proto);
+} // MakeMixedProto
+
+/**
+ * This routine alters the ith dimension of the specified
+ * mixed prototype to be D_random.
+ * @param i index of dimension to be changed
+ * @param Proto prototype whose dimension is to be altered
+ * @param ParamDesc description of specified dimension
+ */
+static void MakeDimRandom(uint16_t i, PROTOTYPE* Proto, PARAM_DESC* ParamDesc) {
+ Proto->Distrib[i] = D_random;
+ Proto->Mean[i] = ParamDesc->MidRange;
+ Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
+
+ // subtract out the previous magnitude of this dimension from the total
+ Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
+ Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
+ Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));
+
+ // note that the proto Weight is irrelevant for D_random protos
+} // MakeDimRandom
+
+/**
+ * This routine alters the ith dimension of the specified
+ * mixed prototype to be uniform.
+ * @param i index of dimension to be changed
+ * @param Proto prototype whose dimension is to be altered
+ * @param Statistics statistical info about prototype
+ */
+static void MakeDimUniform(uint16_t i, PROTOTYPE* Proto, STATISTICS* Statistics) {
+ Proto->Distrib[i] = uniform;
+ Proto->Mean[i] = Proto->Cluster->Mean[i] +
+ (Statistics->Min[i] + Statistics->Max[i]) / 2;
+ Proto->Variance.Elliptical[i] =
+ (Statistics->Max[i] - Statistics->Min[i]) / 2;
+ if (Proto->Variance.Elliptical[i] < MINVARIANCE)
+ Proto->Variance.Elliptical[i] = MINVARIANCE;
+
+ // subtract out the previous magnitude of this dimension from the total
+ Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
+ Proto->Magnitude.Elliptical[i] =
+ 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
+ Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));
+
+ // note that the proto Weight is irrelevant for uniform protos
+} // MakeDimUniform
+
/**
 * This routine searches the cluster tree for all leaf nodes
 * which are samples in the specified cluster. It computes
 * a full covariance matrix for these samples as well as
 * keeping track of the ranges (min and max) for each
 * dimension. A special data structure is allocated to
 * return this information to the caller. An incremental
 * algorithm for computing statistics is not used because
 * it will not work with circular dimensions.
 * @param N number of dimensions
 * @param ParamDesc array of dimension descriptions
 * @param Cluster cluster whose stats are to be computed
 * @return Pointer to new data structure containing statistics
 */
static STATISTICS*
ComputeStatistics (int16_t N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
  STATISTICS *Statistics;
  int i, j;
  float *CoVariance;
  float *Distance;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  // NOTE(review): malloc results are not checked for nullptr - confirm this
  // matches the allocation policy used elsewhere in the file.
  Statistics = static_cast<STATISTICS *>(malloc (sizeof (STATISTICS)));
  Statistics->CoVariance = static_cast<float *>(malloc(sizeof(float) * N * N));
  Statistics->Min = static_cast<float *>(malloc (N * sizeof (float)));
  Statistics->Max = static_cast<float *>(malloc (N * sizeof (float)));

  // allocate temporary memory to hold the sample to mean distances
  Distance = static_cast<float *>(malloc (N * sizeof (float)));

  // initialize the statistics
  Statistics->AvgVariance = 1.0;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++) {
    Statistics->Min[i] = 0.0;
    Statistics->Max[i] = 0.0;
    for (j = 0; j < N; j++, CoVariance++)
      *CoVariance = 0;
  }
  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != nullptr) {
    // Distance[] holds this sample's offset from the cluster mean in each
    // dimension, wrapped into [-HalfRange, HalfRange] for circular
    // dimensions; Min/Max therefore track offsets, not absolute values.
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange)
          Distance[i] -= ParamDesc[i].Range;
        if (Distance[i] < -ParamDesc[i].HalfRange)
          Distance[i] += ParamDesc[i].Range;
      }
      if (Distance[i] < Statistics->Min[i])
        Statistics->Min[i] = Distance[i];
      if (Distance[i] > Statistics->Max[i])
        Statistics->Max[i] = Distance[i];
    }
    // accumulate the full N x N outer product of the offset vector;
    // CoVariance walks the matrix in row-major order
    CoVariance = Statistics->CoVariance;
    for (i = 0; i < N; i++)
      for (j = 0; j < N; j++, CoVariance++)
        *CoVariance += Distance[i] * Distance[j];
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1)
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  else
    SampleCountAdjustedForBias = 1;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++)
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clip diagonal variances to a minimum and fold them into the
        // running product used for the geometric mean below
        if (*CoVariance < MINVARIANCE)
          *CoVariance = MINVARIANCE;
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  Statistics->AvgVariance = static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance),
                                                   1.0 / N));

  // release temporary memory and return
  free(Distance);
  return (Statistics);
} // ComputeStatistics
+
+/**
+ * This routine creates a spherical prototype data structure to
+ * approximate the samples in the specified cluster.
+ * Spherical prototypes have a single variance which is
+ * common across all dimensions. All dimensions are normally
+ * distributed and independent.
+ * @param N number of dimensions
+ * @param Cluster cluster to be made into a spherical prototype
+ * @param Statistics statistical info about samples in cluster
+ * @return Pointer to a new spherical prototype data structure
+ */
+static PROTOTYPE* NewSphericalProto(uint16_t N, CLUSTER* Cluster,
+ STATISTICS* Statistics) {
+ PROTOTYPE *Proto;
+
+ Proto = NewSimpleProto (N, Cluster);
+
+ Proto->Variance.Spherical = Statistics->AvgVariance;
+ if (Proto->Variance.Spherical < MINVARIANCE)
+ Proto->Variance.Spherical = MINVARIANCE;
+
+ Proto->Magnitude.Spherical =
+ 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
+ Proto->TotalMagnitude = static_cast<float>(pow(static_cast<double>(Proto->Magnitude.Spherical),
+ static_cast<double>(N)));
+ Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
+ Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));
+
+ return (Proto);
+} // NewSphericalProto
+
+/**
+ * This routine creates an elliptical prototype data structure to
+ * approximate the samples in the specified cluster.
+ * Elliptical prototypes have a variance for each dimension.
+ * All dimensions are normally distributed and independent.
+ * @param N number of dimensions
+ * @param Cluster cluster to be made into an elliptical prototype
+ * @param Statistics statistical info about samples in cluster
+ * @return Pointer to a new elliptical prototype data structure
+ */
+static PROTOTYPE* NewEllipticalProto(int16_t N, CLUSTER* Cluster,
+ STATISTICS* Statistics) {
+ PROTOTYPE *Proto;
+ float *CoVariance;
+ int i;
+
+ Proto = NewSimpleProto (N, Cluster);
+ Proto->Variance.Elliptical = static_cast<float *>(malloc (N * sizeof (float)));
+ Proto->Magnitude.Elliptical = static_cast<float *>(malloc (N * sizeof (float)));
+ Proto->Weight.Elliptical = static_cast<float *>(malloc (N * sizeof (float)));
+
+ CoVariance = Statistics->CoVariance;
+ Proto->TotalMagnitude = 1.0;
+ for (i = 0; i < N; i++, CoVariance += N + 1) {
+ Proto->Variance.Elliptical[i] = *CoVariance;
+ if (Proto->Variance.Elliptical[i] < MINVARIANCE)
+ Proto->Variance.Elliptical[i] = MINVARIANCE;
+
+ Proto->Magnitude.Elliptical[i] =
+ 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Elliptical[i]);
+ Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
+ }
+ Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));
+ Proto->Style = elliptical;
+ return (Proto);
+} // NewEllipticalProto
+
+/**
+ * This routine creates a mixed prototype data structure to
+ * approximate the samples in the specified cluster.
+ * Mixed prototypes can have different distributions for
+ * each dimension. All dimensions are independent. The
+ * structure is initially filled in as though it were an
+ * elliptical prototype. The actual distributions of the
+ * dimensions can be altered by other routines.
+ * @param N number of dimensions
+ * @param Cluster cluster to be made into a mixed prototype
+ * @param Statistics statistical info about samples in cluster
+ * @return Pointer to a new mixed prototype data structure
+ */
+static PROTOTYPE* NewMixedProto(int16_t N, CLUSTER* Cluster,
+ STATISTICS* Statistics) {
+ PROTOTYPE *Proto;
+ int i;
+
+ Proto = NewEllipticalProto (N, Cluster, Statistics);
+ Proto->Distrib = static_cast<DISTRIBUTION *>(malloc (N * sizeof (DISTRIBUTION)));
+
+ for (i = 0; i < N; i++) {
+ Proto->Distrib[i] = normal;
+ }
+ Proto->Style = mixed;
+ return (Proto);
+} // NewMixedProto
+
+/**
+ * This routine allocates memory to hold a simple prototype
+ * data structure, i.e. one without independent distributions
+ * and variances for each dimension.
+ * @param N number of dimensions
+ * @param Cluster cluster to be made into a prototype
+ * @return Pointer to new simple prototype
+ */
+static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
+ PROTOTYPE *Proto;
+ int i;
+
+ Proto = static_cast<PROTOTYPE *>(malloc (sizeof (PROTOTYPE)));
+ Proto->Mean = static_cast<float *>(malloc (N * sizeof (float)));
+
+ for (i = 0; i < N; i++)
+ Proto->Mean[i] = Cluster->Mean[i];
+ Proto->Distrib = nullptr;
+
+ Proto->Significant = true;
+ Proto->Merged = false;
+ Proto->Style = spherical;
+ Proto->NumSamples = Cluster->SampleCount;
+ Proto->Cluster = Cluster;
+ Proto->Cluster->Prototype = true;
+ return (Proto);
+} // NewSimpleProto
+
+/**
+ * This routine returns true if the specified covariance
+ * matrix indicates that all N dimensions are independent of
+ * one another. One dimension is judged to be independent of
+ * another when the magnitude of the corresponding correlation
+ * coefficient is
+ * less than the specified Independence factor. The
+ * correlation coefficient is calculated as: (see Duda and
+ * Hart, pg. 247)
+ * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
+ * The covariance matrix is assumed to be symmetric (which
+ * should always be true).
+ * @param ParamDesc descriptions of each feature space dimension
+ * @param N number of dimensions
+ * @param CoVariance ptr to a covariance matrix
+ * @param Independence max off-diagonal correlation coefficient
+ * @return true if dimensions are independent, false otherwise
+ */
+static bool
+Independent(PARAM_DESC* ParamDesc,
+ int16_t N, float* CoVariance, float Independence) {
+ int i, j;
+ float *VARii; // points to ith on-diagonal element
+ float *VARjj; // points to jth on-diagonal element
+ float CorrelationCoeff;
+
+ VARii = CoVariance;
+ for (i = 0; i < N; i++, VARii += N + 1) {
+ if (ParamDesc[i].NonEssential)
+ continue;
+
+ VARjj = VARii + N + 1;
+ CoVariance = VARii + 1;
+ for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
+ if (ParamDesc[j].NonEssential)
+ continue;
+
+ if ((*VARii == 0.0) || (*VARjj == 0.0))
+ CorrelationCoeff = 0.0;
+ else
+ CorrelationCoeff =
+ sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
+ if (CorrelationCoeff > Independence)
+ return false;
+ }
+ }
+ return true;
+} // Independent
+
+/**
+ * This routine returns a histogram data structure which can
+ * be used by other routines to place samples into histogram
+ * buckets, and then apply a goodness of fit test to the
+ * histogram data to determine if the samples belong to the
+ * specified probability distribution. The routine keeps
+ * a list of bucket data structures which have already been
+ * created so that it minimizes the computation time needed
+ * to create a new bucket.
+ * @param clusterer which keeps a bucket_cache for us.
+ * @param Distribution type of probability distribution to test for
+ * @param SampleCount number of samples that are available
+ * @param Confidence probability of a Type I error
+ * @return Bucket data structure
+ */
+static BUCKETS *GetBuckets(CLUSTERER* clusterer,
+ DISTRIBUTION Distribution,
+ uint32_t SampleCount,
+ double Confidence) {
+ // Get an old bucket structure with the same number of buckets.
+ uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
+ BUCKETS *Buckets =
+ clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
+
+ // If a matching bucket structure is not found, make one and save it.
+ if (Buckets == nullptr) {
+ Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
+ clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
+ Buckets;
+ } else {
+ // Just adjust the existing buckets.
+ if (SampleCount != Buckets->SampleCount)
+ AdjustBuckets(Buckets, SampleCount);
+ if (Confidence != Buckets->Confidence) {
+ Buckets->Confidence = Confidence;
+ Buckets->ChiSquared = ComputeChiSquared(
+ DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
+ Confidence);
+ }
+ InitBuckets(Buckets);
+ }
+ return Buckets;
+} // GetBuckets
+
/**
 * This routine creates a histogram data structure which can
 * be used by other routines to place samples into histogram
 * buckets, and then apply a goodness of fit test to the
 * histogram data to determine if the samples belong to the
 * specified probability distribution. The buckets are
 * allocated in such a way that the expected frequency of
 * samples in each bucket is approximately the same. In
 * order to make this possible, a mapping table is
 * computed which maps "normalized" samples into the
 * appropriate bucket.
 * @param Distribution type of probability distribution to test for
 * @param SampleCount number of samples that are available
 * @param Confidence probability of a Type I error
 * @return Pointer to new histogram data structure
 */
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
                            uint32_t SampleCount,
                            double Confidence) {
  // indexed by Distribution; presumably enum order is normal, uniform,
  // D_random (the last two share the uniform density) - TODO confirm
  // against the DISTRIBUTION declaration.
  const DENSITYFUNC DensityFunction[] =
    { NormalDensity, UniformDensity, UniformDensity };
  int i, j;
  BUCKETS *Buckets;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  Buckets = static_cast<BUCKETS *>(malloc(sizeof(BUCKETS)));
  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;
  Buckets->Count =
      static_cast<uint32_t *>(malloc(Buckets->NumberOfBuckets * sizeof(uint32_t)));
  Buckets->ExpectedCount = static_cast<float *>(
      malloc(Buckets->NumberOfBuckets * sizeof(float)));

  // initialize simple fields
  Buckets->Distribution = Distribution;
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
    Buckets->ExpectedCount[i] = 0.0;
  }

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  Buckets->ChiSquared = ComputeChiSquared(
      DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd (Buckets->NumberOfBuckets))
      NextBucketBoundary = BucketProbability / 2;
    else
      NextBucketBoundary = BucketProbability;

    // walk the upper half of the density table, integrating by trapezoids;
    // advance to the next bucket each time the accumulated probability
    // crosses a bucket boundary
    Probability = 0.0;
    LastProbDensity =
        (*DensityFunction[static_cast<int>(Distribution)]) (BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)]) (i + 1);
      ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1)
          CurrentBucket++;
        NextBucketBoundary += BucketProbability;
      }
      // Bucket[] maps a normalized table index to its histogram bucket
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] +=
          static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    Buckets->ExpectedCount[CurrentBucket] +=
        static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
      Buckets->Bucket[i] =
          Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
  }
  return Buckets;
} // MakeBuckets
+
+/**
+ * This routine computes the optimum number of histogram
+ * buckets that should be used in a chi-squared goodness of
+ * fit test for the specified number of samples. The optimum
+ * number is computed based on Table 4.1 on pg. 147 of
+ * "Measurement and Analysis of Random Data" by Bendat & Piersol.
+ * Linear interpolation is used to interpolate between table
+ * values. The table is intended for a 0.05 level of
+ * significance (alpha). This routine assumes that it is
+ * equally valid for other alpha's, which may not be true.
+ * @param SampleCount number of samples to be tested
+ * @return Optimum number of histogram buckets
+ */
+static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
+ uint8_t Last, Next;
+ float Slope;
+
+ if (SampleCount < kCountTable[0])
+ return kBucketsTable[0];
+
+ for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
+ if (SampleCount <= kCountTable[Next]) {
+ Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
+ static_cast<float>(kCountTable[Next] - kCountTable[Last]);
+ return (static_cast<uint16_t>(kBucketsTable[Last] +
+ Slope * (SampleCount - kCountTable[Last])));
+ }
+ }
+ return kBucketsTable[Last];
+} // OptimumNumberOfBuckets
+
+/**
+ * This routine computes the chi-squared value which will
+ * leave a cumulative probability of Alpha in the right tail
+ * of a chi-squared distribution with the specified number of
+ * degrees of freedom. Alpha must be between 0 and 1.
+ * DegreesOfFreedom must be even. The routine maintains an
+ * array of lists. Each list corresponds to a different
+ * number of degrees of freedom. Each entry in the list
+ * corresponds to a different alpha value and its corresponding
+ * chi-squared value. Therefore, once a particular chi-squared
+ * value is computed, it is stored in the list and never
+ * needs to be computed again.
+ * @param DegreesOfFreedom determines shape of distribution
+ * @param Alpha probability of right tail
+ * @return Desired chi-squared value
+ */
+static double
+ComputeChiSquared (uint16_t DegreesOfFreedom, double Alpha)
+#define CHIACCURACY 0.01
+#define MINALPHA (1e-200)
+{
+ static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
+
+ CHISTRUCT *OldChiSquared;
+ CHISTRUCT SearchKey;
+
+ // limit the minimum alpha that can be used - if alpha is too small
+ // it may not be possible to compute chi-squared.
+ Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
+ if (Odd (DegreesOfFreedom))
+ DegreesOfFreedom++;
+
+ /* find the list of chi-squared values which have already been computed
+ for the specified number of degrees of freedom. Search the list for
+ the desired chi-squared. */
+ SearchKey.Alpha = Alpha;
+ OldChiSquared = reinterpret_cast<CHISTRUCT *>first_node (search (ChiWith[DegreesOfFreedom],
+ &SearchKey, AlphaMatch));
+
+ if (OldChiSquared == nullptr) {
+ OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
+ OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
+ static_cast<double>(DegreesOfFreedom),
+ CHIACCURACY);
+ ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
+ OldChiSquared);
+ }
+ else {
+ // further optimization might move OldChiSquared to front of list
+ }
+
+ return (OldChiSquared->ChiSquared);
+
+} // ComputeChiSquared
+
+/**
+ * This routine computes the probability density function
+ * of a discrete normal distribution defined by the global
+ * variables kNormalMean, kNormalVariance, and kNormalMagnitude.
+ * Normal magnitude could, of course, be computed in terms of
+ * the normal variance but it is precomputed for efficiency.
+ * @param x number to compute the normal probability density for
+ * @note Globals:
+ * kNormalMean mean of a discrete normal distribution
+ * kNormalVariance variance of a discrete normal distribution
+ * kNormalMagnitude magnitude of a discrete normal distribution
+ * @return The value of the normal distribution at x.
+ */
+static double NormalDensity(int32_t x) {
+ double Distance;
+
+ Distance = x - kNormalMean;
+ return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
+} // NormalDensity
+
+/**
+ * This routine computes the probability density function
+ * of a uniform distribution at the specified point. The
+ * range of the distribution is from 0 to BUCKETTABLESIZE.
+ * @param x number to compute the uniform probability density for
+ * @return The value of the uniform distribution at x.
+ */
+static double UniformDensity(int32_t x) {
+ constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;
+
+ if ((x >= 0) && (x <= BUCKETTABLESIZE)) {
+ return UniformDistributionDensity;
+ } else {
+ return 0.0;
+ }
+} // UniformDensity
+
/**
 * Computes a trapezoidal approximation to the integral of a
 * function over a small delta in x.
 * @param f1 value of function at x1
 * @param f2 value of function at x2
 * @param Dx x2 - x1 (should always be positive)
 * @return Approximation of the integral of the function from x1 to x2.
 */
static double Integral(double f1, double f2, double Dx) {
  // average height times width
  return 0.5 * (f1 + f2) * Dx;
} // Integral
+
/**
 * This routine counts the number of cluster samples which
 * fall within the various histogram buckets in Buckets. Only
 * one dimension of each sample is examined. The exact meaning
 * of the Mean and StdDev parameters depends on the
 * distribution which is being analyzed (this info is in the
 * Buckets data structure). For normal distributions, Mean
 * and StdDev have the expected meanings. For uniform and
 * random distributions the Mean is the center point of the
 * range and the StdDev is 1/2 the range. A dimension with
 * zero standard deviation cannot be statistically analyzed.
 * In this case, a pseudo-analysis is used.
 * The Buckets data structure is filled in.
 * @param Buckets histogram buckets to count samples
 * @param Cluster cluster whose samples are being analyzed
 * @param Dim dimension of samples which is being analyzed
 * @param ParamDesc description of the dimension
 * @param Mean "mean" of the distribution
 * @param StdDev "standard deviation" of the distribution
 */
static void FillBuckets(BUCKETS *Buckets,
                        CLUSTER *Cluster,
                        uint16_t Dim,
                        PARAM_DESC *ParamDesc,
                        float Mean,
                        float StdDev) {
  uint16_t BucketID;
  int i;
  LIST SearchState;
  SAMPLE *Sample;

  // initialize the histogram bucket counts to 0
  for (i = 0; i < Buckets->NumberOfBuckets; i++)
    Buckets->Count[i] = 0;

  if (StdDev == 0.0) {
    /* if the standard deviation is zero, then we can't statistically
       analyze the cluster.  Use a pseudo-analysis: samples exactly on
       the mean are distributed evenly across all buckets.  Samples greater
       than the mean are placed in the last bucket; samples less than the
       mean are placed in the first bucket. */

    InitSampleSearch(SearchState, Cluster);
    i = 0;  // round-robin bucket index for samples exactly on the mean
    while ((Sample = NextSample (&SearchState)) != nullptr) {
      if (Sample->Mean[Dim] > Mean)
        BucketID = Buckets->NumberOfBuckets - 1;
      else if (Sample->Mean[Dim] < Mean)
        BucketID = 0;
      else
        BucketID = i;
      Buckets->Count[BucketID] += 1;
      i++;
      if (i >= Buckets->NumberOfBuckets)
        i = 0;
    }
  }
  else {
    // search for all samples in the cluster and add to histogram buckets
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample (&SearchState)) != nullptr) {
      // BucketID here is an index into the normalized mapping table,
      // not a direct bucket number
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
                                   Mean, StdDev);
          break;
        case D_random:
        case uniform:
          BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
                                    Mean, StdDev);
          break;
        default:
          BucketID = 0;
      }
      // Bucket[] maps the table index to the actual histogram bucket
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
} // FillBuckets
+
+/**
+ * This routine determines which bucket x falls into in the
+ * discrete normal distribution defined by kNormalMean
+ * and kNormalStdDev. x values which exceed the range of
+ * the discrete distribution are clipped.
+ * @param ParamDesc used to identify circular dimensions
+ * @param x value to be normalized
+ * @param Mean mean of normal distribution
+ * @param StdDev standard deviation of normal distribution
+ * @return Bucket number into which x falls
+ */
+static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
+ float x,
+ float Mean,
+ float StdDev) {
+ float X;
+
+ // wraparound circular parameters if necessary
+ if (ParamDesc->Circular) {
+ if (x - Mean > ParamDesc->HalfRange)
+ x -= ParamDesc->Range;
+ else if (x - Mean < -ParamDesc->HalfRange)
+ x += ParamDesc->Range;
+ }
+
+ X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
+ if (X < 0)
+ return 0;
+ if (X > BUCKETTABLESIZE - 1)
+ return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));
+ return static_cast<uint16_t>(floor(static_cast<double>(X)));
+} // NormalBucket
+
+/**
+ * This routine determines which bucket x falls into in the
+ * discrete uniform distribution defined by
+ * BUCKETTABLESIZE. x values which exceed the range of
+ * the discrete distribution are clipped.
+ * @param ParamDesc used to identify circular dimensions
+ * @param x value to be normalized
+ * @param Mean center of range of uniform distribution
+ * @param StdDev 1/2 the range of the uniform distribution
+ * @return Bucket number into which x falls
+ */
+static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
+ float x,
+ float Mean,
+ float StdDev) {
+ float X;
+
+ // wraparound circular parameters if necessary
+ if (ParamDesc->Circular) {
+ if (x - Mean > ParamDesc->HalfRange)
+ x -= ParamDesc->Range;
+ else if (x - Mean < -ParamDesc->HalfRange)
+ x += ParamDesc->Range;
+ }
+
+ X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
+ if (X < 0)
+ return 0;
+ if (X > BUCKETTABLESIZE - 1)
+ return static_cast<uint16_t>(BUCKETTABLESIZE - 1);
+ return static_cast<uint16_t>(floor(static_cast<double>(X)));
+} // UniformBucket
+
+/**
+ * This routine performs a chi-square goodness of fit test
+ * on the histogram data in the Buckets data structure.
+ * true is returned if the histogram matches the probability
+ * distribution which was specified when the Buckets
+ * structure was originally created. Otherwise false is
+ * returned.
+ * @param Buckets histogram data to perform chi-square test on
+ * @return true if samples match distribution, false otherwise
+ */
+static bool DistributionOK(BUCKETS* Buckets) {
+ float FrequencyDifference;
+ float TotalDifference;
+ int i;
+
+ // compute how well the histogram matches the expected histogram
+ TotalDifference = 0.0;
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
+ FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
+ TotalDifference += (FrequencyDifference * FrequencyDifference) /
+ Buckets->ExpectedCount[i];
+ }
+
+ // test to see if the difference is more than expected
+ if (TotalDifference > Buckets->ChiSquared)
+ return false;
+ else
+ return true;
+} // DistributionOK
+
/**
 * This routine frees the memory used by the statistics
 * data structure.
 * @param Statistics pointer to data structure to be freed
 */
static void FreeStatistics(STATISTICS *Statistics) {
  // CoVariance, Min and Max are separately allocated arrays owned by the
  // statistics record.  NOTE(review): Statistics is dereferenced
  // unconditionally, so callers must not pass nullptr.
  free(Statistics->CoVariance);
  free(Statistics->Min);
  free(Statistics->Max);
  free(Statistics);
} // FreeStatistics
+
/**
 * This routine properly frees the memory used by a BUCKETS.
 *
 * @param buckets pointer to data structure to be freed
 */
static void FreeBuckets(BUCKETS *buckets) {
  // Count and ExpectedCount are separately malloc'd arrays; free them
  // before releasing the record itself.
  free(buckets->Count);
  free(buckets->ExpectedCount);
  free(buckets);
} // FreeBuckets
+
+/**
+ * This routine frees the memory consumed by the specified
+ * cluster and all of its subclusters. This is done by
+ * recursive calls to FreeCluster().
+ *
+ * @param Cluster pointer to cluster to be freed
+ */
+static void FreeCluster(CLUSTER *Cluster) {
+ if (Cluster != nullptr) {
+ FreeCluster (Cluster->Left);
+ FreeCluster (Cluster->Right);
+ free(Cluster);
+ }
+} // FreeCluster
+
+/**
+ * This routine computes the degrees of freedom that should
+ * be used in a chi-squared test with the specified number of
+ * histogram buckets. The result is always rounded up to
+ * the next even number so that the value of chi-squared can be
+ * computed more easily. This will cause the value of
+ * chi-squared to be higher than the optimum value, resulting
+ * in the chi-square test being more lenient than optimum.
+ * @param Distribution distribution being tested for
+ * @param HistogramBuckets number of buckets in chi-square test
+ * @return The number of degrees of freedom for a chi-square test
+ */
+static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
+ static uint8_t DegreeOffsets[] = { 3, 3, 1 };
+
+ uint16_t AdjustedNumBuckets;
+
+ AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
+ if (Odd (AdjustedNumBuckets))
+ AdjustedNumBuckets++;
+ return (AdjustedNumBuckets);
+
+} // DegreesOfFreedom
+
+/**
+ * This routine multiplies each ExpectedCount histogram entry
+ * by NewSampleCount/OldSampleCount so that the histogram
+ * is now adjusted to the new sample count.
+ * @param Buckets histogram data structure to adjust
+ * @param NewSampleCount new sample count to adjust to
+ */
+static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
+ int i;
+ double AdjustFactor;
+
+ AdjustFactor = ((static_cast<double>(NewSampleCount)) /
+ (static_cast<double>(Buckets->SampleCount)));
+
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
+ Buckets->ExpectedCount[i] *= AdjustFactor;
+ }
+
+ Buckets->SampleCount = NewSampleCount;
+
+} // AdjustBuckets
+
+/**
+ * This routine sets the bucket counts in the specified histogram
+ * to zero.
+ * @param Buckets histogram data structure to init
+ */
+static void InitBuckets(BUCKETS *Buckets) {
+ int i;
+
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
+ Buckets->Count[i] = 0;
+ }
+
+} // InitBuckets
+
+/**
+ * This routine is used to search a list of structures which
+ * hold pre-computed chi-squared values for a chi-squared
+ * value whose corresponding alpha field matches the alpha
+ * field of SearchKey.
+ *
+ * It is called by the list search routines.
+ *
+ * @param arg1 chi-squared struct being tested for a match
+ * @param arg2 chi-squared struct that is the search key
+ * @return true if ChiStruct's Alpha matches SearchKey's Alpha
+ */
+static int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
+ void *arg2) { //CHISTRUCT *SearchKey)
+ auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
+ auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
+
+ return (ChiStruct->Alpha == SearchKey->Alpha);
+
+} // AlphaMatch
+
+/**
+ * This routine allocates a new data structure which is used
+ * to hold a chi-squared value along with its associated
+ * number of degrees of freedom and alpha value.
+ *
+ * @param DegreesOfFreedom degrees of freedom for new chi value
+ * @param Alpha confidence level for new chi value
+ * @return newly allocated data structure
+ */
+static CHISTRUCT *NewChiStruct(uint16_t DegreesOfFreedom, double Alpha) {
+ CHISTRUCT *NewChiStruct;
+
+ NewChiStruct = static_cast<CHISTRUCT *>(malloc (sizeof (CHISTRUCT)));
+ NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
+ NewChiStruct->Alpha = Alpha;
+ return (NewChiStruct);
+
+} // NewChiStruct
+
/**
 * This routine attempts to find an x value at which Function
 * goes to zero (i.e. a root of the function). It will only
 * work correctly if a solution actually exists and there
 * are no extrema between the solution and the InitialGuess.
 * The algorithms used are extremely primitive.
 *
 * The iteration is a Newton-style root search: the slope is estimated
 * by a finite difference with step Delta, and Delta shrinks as the
 * guesses converge.  The loop stops once the most recent guesses with
 * positive and negative f(x) lie within Accuracy of each other.
 *
 * @param Function function whose zero is to be found
 * @param FunctionParams arbitrary data to pass to function
 * @param InitialGuess point to start solution search at
 * @param Accuracy maximum allowed error
 * @return Solution of function (x for which f(x) = 0).
 */
static double
Solve (SOLVEFUNC Function,
void *FunctionParams, double InitialGuess, double Accuracy)
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  double x;
  double f;
  double Slope;
  double Delta;
  double NewDelta;
  double xDelta;
  double LastPosX, LastNegX;

  x = InitialGuess;
  Delta = INITIALDELTA;
  // FLT_MAX sentinels guarantee the convergence test fails on the first
  // iteration, so the loop always executes at least once.
  LastPosX = FLT_MAX;
  LastNegX = -FLT_MAX;
  f = (*Function) (static_cast<CHISTRUCT *>(FunctionParams), x);
  while (Abs (LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0)
      LastNegX = x;
    else
      LastPosX = x;

    // compute the approx. slope of f(x) at the current point
    Slope =
      ((*Function) (static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;

    // compute the next solution guess */
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    //the amount moved to get to the new guess
    NewDelta = Abs (xDelta) * DELTARATIO;
    if (NewDelta < Delta)
      Delta = NewDelta;

    // compute the value of the function at the new guess
    f = (*Function) (static_cast<CHISTRUCT *>(FunctionParams), x);
  }
  return (x);

} // Solve
+
+/**
+ * This routine computes the area under a chi density curve
+ * from 0 to x, minus the desired area under the curve. The
+ * number of degrees of freedom of the chi curve is specified
+ * in the ChiParams structure. The desired area is also
+ * specified in the ChiParams structure as Alpha (or 1 minus
+ * the desired area). This routine is intended to be passed
+ * to the Solve() function to find the value of chi-squared
+ * which will yield a desired area under the right tail of
+ * the chi density curve. The function will only work for
+ * even degrees of freedom. The equations are based on
+ * integrating the chi density curve in parts to obtain
+ * a series that can be used to compute the area under the
+ * curve.
+ * @param ChiParams contains degrees of freedom and alpha
+ * @param x value of chi-squared to evaluate
+ * @return Error between actual and desired area under the chi curve.
+ */
+static double ChiArea(CHISTRUCT *ChiParams, double x) {
+ int i, N;
+ double SeriesTotal;
+ double Denominator;
+ double PowerOfx;
+
+ N = ChiParams->DegreesOfFreedom / 2 - 1;
+ SeriesTotal = 1;
+ Denominator = 1;
+ PowerOfx = 1;
+ for (i = 1; i <= N; i++) {
+ Denominator *= 2 * i;
+ PowerOfx *= x;
+ SeriesTotal += PowerOfx / Denominator;
+ }
+ return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
+
+} // ChiArea
+
/**
 * This routine looks at all samples in the specified cluster.
 * It computes a running estimate of the percentage of the
 * characters which have more than 1 sample in the cluster.
 * When this percentage exceeds MaxIllegal, true is returned.
 * Otherwise false is returned. The CharID
 * fields must contain integers which identify the training
 * characters which were used to generate the sample. One
 * integer is used for each sample. The NumChar field in
 * the Clusterer must contain the number of characters in the
 * training set. All CharID fields must be between 0 and
 * NumChar-1. The main function of this routine is to help
 * identify clusters which need to be split further, i.e. if
 * numerous training characters have 2 or more features which are
 * contained in the same cluster, then the cluster should be
 * split.
 *
 * @param Clusterer data structure holding cluster tree
 * @param Cluster cluster containing samples to be tested
 * @param MaxIllegal max percentage of samples allowed to have
 * more than 1 feature in the cluster
 * @return true if the cluster should be split, false otherwise.
 */
static bool
MultipleCharSamples(CLUSTERER* Clusterer,
                    CLUSTER* Cluster, float MaxIllegal)
#define ILLEGAL_CHAR 2
{
  // NOTE(review): function-static scratch buffer reused across calls;
  // this makes the routine non-reentrant — confirm single-threaded use.
  // Entries form a tri-state flag: 0 = unseen, 1 (true) = seen once,
  // ILLEGAL_CHAR (2) = already counted as illegal.
  static std::vector<uint8_t> CharFlags;
  LIST SearchState;
  SAMPLE *Sample;
  int32_t CharID;
  int32_t NumCharInCluster;
  int32_t NumIllegalInCluster;
  float PercentIllegal;

  // initial estimate assumes that no illegal chars exist in the cluster
  NumCharInCluster = Cluster->SampleCount;
  NumIllegalInCluster = 0;

  if (Clusterer->NumChar > CharFlags.size()) {
    CharFlags.resize(Clusterer->NumChar);
  }

  for (auto& CharFlag : CharFlags)
    CharFlag = false;

  // find each sample in the cluster and check if we have seen it before
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != nullptr) {
    CharID = Sample->CharID;
    if (CharFlags[CharID] == false) {
      CharFlags[CharID] = true;
    }
    else {
      if (CharFlags[CharID] == true) {
        // First repeat for this char: count it once as illegal and mark
        // it with ILLEGAL_CHAR so further repeats are not recounted.
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      // Each extra sample of an already-seen char shrinks the number of
      // distinct chars represented by the cluster's samples.
      NumCharInCluster--;
      PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
      if (PercentIllegal > MaxIllegal) {
        // Release the remaining search state before the early exit.
        destroy(SearchState);
        return true;
      }
    }
  }
  // Presumably SearchState has been fully consumed by NextSample() at
  // this point, so no destroy() is needed — TODO confirm.
  return false;

} // MultipleCharSamples
+
/**
 * Compute the inverse of a matrix using LU decomposition with partial pivoting.
 * The return value is the sum of norms of the off-diagonal terms of the
 * product of a and inv. (A measure of the error.)
 *
 * NOTE(review): there is no explicit singularity guard — if a pivot
 * column is all zeros the divisions below produce inf/NaN; presumably
 * callers use the returned error sum to detect a failed inversion.
 */
static double InvertMatrix(const float* input, int size, float* inv) {
  // Allocate memory for the 2D arrays.
  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
  GENERIC_2D_ARRAY<double> L(size, size, 0.0);

  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row][col] = input[row*size + col];
      L[row][col] = row == col ? 1.0 : 0.0;
      U_inv[row][col] = 0.0;
    }
  }

  // Compute forward matrix by inversion by LU decomposition of input.
  for (col = 0; col < size; ++col) {
    // Find best pivot: the row at or below the diagonal with the
    // largest absolute value in this column.
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      if (Abs(U[row][col]) > best_pivot) {
        best_pivot = Abs(U[row][col]);
        best_row = row;
      }
    }
    // Exchange pivot rows.
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row][k];
        U[best_row][k] = U[col][k];
        U[col][k] = tmp;
        tmp = L[best_row][k];
        L[best_row][k] = L[col][k];
        L[col][k] = tmp;
      }
    }
    // Now do the pivot itself: eliminate the column below the diagonal,
    // applying the same row operations to L so L tracks the inverse of
    // the elimination steps.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row][col] / U[col][col];
      for (int j = col; j < size; ++j) {
        U[row][j] += U[col][j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row][k] += L[col][k] * ratio;
      }
    }
  }
  // Next invert U by back-substitution (U is upper triangular).
  for (col = 0; col < size; ++col) {
    U_inv[col][col] = 1.0 / U[col][col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row][k] * U_inv[k][col];
      }
      U_inv[row][col] = -total / U[row][row];
    }
  }
  // Now the answer is U_inv.L.
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = row; k < size; ++k) {
        sum += U_inv[row][k] * L[k][col];
      }
      inv[row*size + col] = sum;
    }
  }
  // Check matrix product: sum the off-diagonal magnitudes of input*inv,
  // which would be exactly zero for a perfect inverse.
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
      }
      if (row != col) {
        error_sum += Abs(sum);
      }
    }
  }
  return error_sum;
}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/cluster.h b/tesseract/src/classify/cluster.h
new file mode 100644
index 00000000..8a6a270a
--- /dev/null
+++ b/tesseract/src/classify/cluster.h
@@ -0,0 +1,138 @@
+/******************************************************************************
+ ** Filename: cluster.h
+ ** Purpose: Definition of feature space clustering routines
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#ifndef CLUSTER_H
+#define CLUSTER_H
+
+#include "kdtree.h"
+#include "oldlist.h"
+
+namespace tesseract {
+
+struct BUCKETS;
+
+#define MINBUCKETS 5
+#define MAXBUCKETS 39
+
+/*----------------------------------------------------------------------
+ Types
+----------------------------------------------------------------------*/
// A node in the binary cluster tree.  Per the Mean comment below, the
// struct is allocated oversize so Mean holds SampleSize floats (a
// flexible-array-style trailer) — confirm against MakeSample.
typedef struct sample {
  bool Clustered : 1; // true if included in a higher cluster
  bool Prototype : 1; // true if cluster represented by a proto
  unsigned SampleCount : 30; // number of samples in this cluster
  struct sample* Left; // ptr to left sub-cluster
  struct sample* Right; // ptr to right sub-cluster
  int32_t CharID; // identifier of char sample came from
  float Mean[1]; // mean of cluster - SampleSize floats
} CLUSTER;

using SAMPLE = CLUSTER; // can refer to as either sample or cluster

typedef enum { spherical, elliptical, mixed, automatic } PROTOSTYLE;

typedef struct { // parameters to control clustering
  PROTOSTYLE ProtoStyle; // specifies types of protos to be made
  float MinSamples; // min # of samples per proto - % of total
  float MaxIllegal; // max percentage of samples in a cluster which
  // have more than 1 feature in that cluster
  float Independence; // desired independence between dimensions
  double Confidence; // desired confidence in prototypes created
  int MagicSamples; // Ideal number of samples in a cluster.
} CLUSTERCONFIG;

typedef enum { normal, uniform, D_random, DISTRIBUTION_COUNT } DISTRIBUTION;

// Holds either a single spherical variance/magnitude/weight or a
// per-dimension array for elliptical prototypes (which of the two is
// active depends on the prototype's Style).
typedef union {
  float Spherical;
  float* Elliptical;
} FLOATUNION;

typedef struct {
  bool Significant : 1; // true if prototype is significant
  bool Merged : 1; // Merged after clustering so do not output
  // but kept for display purposes. If it has no
  // samples then it was actually merged.
  // Otherwise it matched an already significant
  // cluster.
  unsigned Style : 2; // spherical, elliptical, or mixed
  unsigned NumSamples : 28; // number of samples in the cluster
  CLUSTER* Cluster; // ptr to cluster which made prototype
  DISTRIBUTION* Distrib; // different distribution for each dimension
  float* Mean; // prototype mean
  float TotalMagnitude; // total magnitude over all dimensions
  float LogMagnitude; // log base e of TotalMagnitude
  FLOATUNION Variance; // prototype variance
  FLOATUNION Magnitude; // magnitude of density function
  FLOATUNION Weight; // weight of density function
} PROTOTYPE;

// Top-level state for one clustering run over a feature space.
typedef struct {
  int16_t SampleSize; // number of parameters per sample
  PARAM_DESC* ParamDesc; // description of each parameter
  int32_t NumberOfSamples; // total number of samples being clustered
  KDTREE* KDTree; // for optimal nearest neighbor searching
  CLUSTER* Root; // ptr to root cluster of cluster tree
  LIST ProtoList; // list of prototypes
  int32_t NumChar; // # of characters represented by samples
  // cache of reusable histograms by distribution type and number of buckets.
  BUCKETS* bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
} CLUSTERER;

typedef struct {
  int32_t NumSamples; // number of samples in list
  int32_t MaxNumSamples; // maximum size of list
  SAMPLE* Sample[1]; // array of ptrs to sample data structures
} SAMPLELIST;
+
// low level cluster tree analysis routines.
// Primes SearchState for NextSample(): an empty list when the cluster C
// is null, otherwise a one-element stack holding C.
#define InitSampleSearch(S, C) \
  (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C))))

/*--------------------------------------------------------------------------
  Public Function Prototypes
--------------------------------------------------------------------------*/
// Allocates a clusterer for samples of SampleSize dimensions described
// by ParamDesc.
TESS_API
CLUSTERER* MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]);

// Adds one feature vector (tagged with its source character) to the clusterer.
TESS_API
SAMPLE* MakeSample(CLUSTERER* Clusterer, const float* Feature, int32_t CharID);

// Clusters all added samples and returns the resulting prototype list.
TESS_API
LIST ClusterSamples(CLUSTERER* Clusterer, CLUSTERCONFIG* Config);

TESS_API
void FreeClusterer(CLUSTERER* Clusterer);

TESS_API
void FreeProtoList(LIST* ProtoList);

void FreePrototype(void* arg); // PROTOTYPE *Prototype);

// Depth-first iteration over the samples of a cluster; use with
// InitSampleSearch() above.
CLUSTER* NextSample(LIST* SearchState);

float Mean(PROTOTYPE* Proto, uint16_t Dimension);

float StandardDeviation(PROTOTYPE* Proto, uint16_t Dimension);

TESS_API
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2,
                      float m[], float m1[], float m2[]);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/clusttool.cpp b/tesseract/src/classify/clusttool.cpp
new file mode 100644
index 00000000..4227a1f0
--- /dev/null
+++ b/tesseract/src/classify/clusttool.cpp
@@ -0,0 +1,319 @@
+/******************************************************************************
+ ** Filename: clusttool.cpp
+ ** Purpose: Misc. tools for use with the clustering routines
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#define _USE_MATH_DEFINES // for M_PI
+
+#include "clusttool.h"
+
+#include <cmath> // for M_PI, std::isnan
+#include <locale> // for std::locale::classic
+#include <sstream> // for std::stringstream
+
+namespace tesseract {
+
+//---------------Global Data Definitions and Declarations--------------------
+#define TOKENSIZE 80 ///< max size of tokens read from an input file
+#define QUOTED_TOKENSIZE "79"
+#define MAXSAMPLESIZE 65535 ///< max num of dimensions in feature space
+
+/**
+ * This routine reads N floats from the specified text file
+ * and places them into Buffer. If Buffer is nullptr, a buffer
+ * is created and passed back to the caller. If EOF is
+ * encountered before any floats can be read, nullptr is
+ * returned.
+ * @param fp open text file to read floats from
+ * @param N number of floats to read
+ * @param Buffer pointer to buffer to place floats into
+ * @return Pointer to buffer holding floats or nullptr if EOF
+ * @note Globals: None
+ */
+static float *ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {
+ const int kMaxLineSize = 1024;
+ char line[kMaxLineSize];
+ if (fp->FGets(line, kMaxLineSize) == nullptr) {
+ tprintf("Hit EOF in ReadNFloats!\n");
+ return nullptr;
+ }
+ bool needs_free = false;
+
+ if (Buffer == nullptr) {
+ Buffer = static_cast<float *>(malloc(N * sizeof(float)));
+ needs_free = true;
+ }
+
+ std::stringstream stream(line);
+ // Use "C" locale (needed for float values Buffer[i]).
+ stream.imbue(std::locale::classic());
+ for (uint16_t i = 0; i < N; i++) {
+ float f = NAN;
+ stream >> f;
+ if (std::isnan(f)) {
+ tprintf("Read of %u floats failed!\n", N);
+ if (needs_free) free(Buffer);
+ return nullptr;
+ }
+ Buffer[i] = f;
+ }
+ return Buffer;
+}
+
/**
 * Writes a one-line text representation of N floats from Array to File.
 * @param File open text file to write N floats to
 * @param N number of floats to write
 * @param Array array of floats to write
 */
static void WriteNFloats(FILE * File, uint16_t N, float Array[]) {
  for (int i = 0; i < N; i++) {
    fprintf(File, " %9.6f", Array[i]);
  }
  fprintf(File, "\n");
}
+
+/**
+ * This routine writes to the specified text file a word
+ * which represents the ProtoStyle. It does not append
+ * a carriage return to the end.
+ * @param File open text file to write prototype style to
+ * @param ProtoStyle prototype style to write
+ */
+static void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
+ switch (ProtoStyle) {
+ case spherical:
+ fprintf (File, "spherical");
+ break;
+ case elliptical:
+ fprintf (File, "elliptical");
+ break;
+ case mixed:
+ fprintf (File, "mixed");
+ break;
+ case automatic:
+ fprintf (File, "automatic");
+ break;
+ }
+}
+
+/**
+ * This routine reads a single integer from the specified
+ * file and checks to ensure that it is between 0 and
+ * MAXSAMPLESIZE.
+ * @param fp open text file to read sample size from
+ * @return Sample size
+ * @note Globals: None
+ */
+uint16_t ReadSampleSize(TFile *fp) {
+ int SampleSize = 0;
+
+ const int kMaxLineSize = 100;
+ char line[kMaxLineSize];
+ ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
+ ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1);
+ ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);
+ return SampleSize;
+}
+
+/**
+ * This routine reads textual descriptions of sets of parameters
+ * which describe the characteristics of feature dimensions.
+ *
+ * @param fp open text file to read N parameter descriptions from
+ * @param N number of parameter descriptions to read
+ * @return Pointer to an array of parameter descriptors.
+ * @note Globals: None
+ */
+PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {
+ PARAM_DESC *ParamDesc;
+
+ ParamDesc = static_cast<PARAM_DESC *>(malloc (N * sizeof (PARAM_DESC)));
+ for (int i = 0; i < N; i++) {
+ const int kMaxLineSize = TOKENSIZE * 4;
+ char line[kMaxLineSize];
+ ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
+ std::istringstream stream(line);
+ // Use "C" locale (needed for float values Min, Max).
+ stream.imbue(std::locale::classic());
+ std::string linear_token;
+ stream >> linear_token;
+ std::string essential_token;
+ stream >> essential_token;
+ stream >> ParamDesc[i].Min;
+ stream >> ParamDesc[i].Max;
+ ASSERT_HOST(!stream.fail());
+ ParamDesc[i].Circular = (linear_token[0] == 'c');
+ ParamDesc[i].NonEssential = (essential_token[0] != 'e');
+ ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
+ ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
+ ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
+ }
+ return (ParamDesc);
+}
+
+/**
+ * This routine reads a textual description of a prototype from
+ * the specified file.
+ *
+ * @param fp open text file to read prototype from
+ * @param N number of dimensions used in prototype
+ * @return List of prototypes
+ * @note Globals: None
+ */
+PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {
+ char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
+ PROTOTYPE *Proto;
+ int SampleCount;
+ int i;
+
+ const int kMaxLineSize = TOKENSIZE * 4;
+ char line[kMaxLineSize];
+ if (fp->FGets(line, kMaxLineSize) == nullptr ||
+ sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d",
+ sig_token, shape_token, &SampleCount) != 3) {
+ tprintf("Invalid prototype: %s\n", line);
+ return nullptr;
+ }
+ Proto = static_cast<PROTOTYPE *>(malloc(sizeof(PROTOTYPE)));
+ Proto->Cluster = nullptr;
+ Proto->Significant = (sig_token[0] == 's');
+
+ switch (shape_token[0]) {
+ case 's':
+ Proto->Style = spherical;
+ break;
+ case 'e':
+ Proto->Style = elliptical;
+ break;
+ case 'a':
+ Proto->Style = automatic;
+ break;
+ default:
+ tprintf("Invalid prototype style specification:%s\n", shape_token);
+ Proto->Style = elliptical;
+ }
+
+ ASSERT_HOST(SampleCount >= 0);
+ Proto->NumSamples = SampleCount;
+
+ Proto->Mean = ReadNFloats(fp, N, nullptr);
+ ASSERT_HOST(Proto->Mean != nullptr);
+
+ switch (Proto->Style) {
+ case spherical:
+ ASSERT_HOST(ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) != nullptr);
+ Proto->Magnitude.Spherical =
+ 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
+ Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, static_cast<float>(N));
+ Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
+ Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
+ Proto->Distrib = nullptr;
+ break;
+ case elliptical:
+ Proto->Variance.Elliptical = ReadNFloats(fp, N, nullptr);
+ ASSERT_HOST(Proto->Variance.Elliptical != nullptr);
+ Proto->Magnitude.Elliptical = static_cast<float *>(malloc(N * sizeof(float)));
+ Proto->Weight.Elliptical = static_cast<float *>(malloc(N * sizeof(float)));
+ Proto->TotalMagnitude = 1.0;
+ for (i = 0; i < N; i++) {
+ Proto->Magnitude.Elliptical[i] =
+ 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Elliptical[i]);
+ Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
+ }
+ Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
+ Proto->Distrib = nullptr;
+ break;
+ default:
+ free(Proto);
+ tprintf("Invalid prototype style\n");
+ return nullptr;
+ }
+ return Proto;
+}
+
/**
 * This routine writes an array of dimension descriptors to
 * the specified text file.
 * @param File open text file to write param descriptors to
 * @param N number of param descriptors to write
 * @param ParamDesc array of param descriptors to write
 */
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {
  int i;

  // One descriptor per line: shape keyword, essentialness keyword,
  // then min and max — the format consumed by ReadParamDesc.
  for (i = 0; i < N; i++) {
    if (ParamDesc[i].Circular)
      fprintf (File, "circular ");
    else
      fprintf (File, "linear ");

    if (ParamDesc[i].NonEssential)
      fprintf (File, "non-essential ");
    else
      fprintf (File, "essential ");

    fprintf (File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
  }
}
+
/**
 * This routine writes a textual description of a prototype
 * to the specified text file.
 * @param File open text file to write prototype to
 * @param N number of dimensions in feature space
 * @param Proto prototype to write out
 */
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {
  int i;

  // Header line: significance, style keyword, and sample count,
  // followed by the mean vector on its own indented line.
  if (Proto->Significant)
    fprintf (File, "significant ");
  else
    fprintf (File, "insignificant ");
  WriteProtoStyle (File, static_cast<PROTOSTYLE>(Proto->Style));
  fprintf (File, "%6d\n\t", Proto->NumSamples);
  WriteNFloats (File, N, Proto->Mean);
  fprintf (File, "\t");

  // NOTE(review): styles other than the three below (e.g. automatic)
  // write no variance data here — confirm that is intended.
  switch (Proto->Style) {
    case spherical:
      WriteNFloats (File, 1, &(Proto->Variance.Spherical));
      break;
    case elliptical:
      WriteNFloats (File, N, Proto->Variance.Elliptical);
      break;
    case mixed:
      // Mixed style: one distribution keyword per dimension, then the
      // per-dimension variances on the following line.
      for (i = 0; i < N; i++)
        switch (Proto->Distrib[i]) {
          case normal:
            fprintf (File, " %9s", "normal");
            break;
          case uniform:
            fprintf (File, " %9s", "uniform");
            break;
          case D_random:
            fprintf (File, " %9s", "random");
            break;
          case DISTRIBUTION_COUNT:
            ASSERT_HOST(!"Distribution count not allowed!");
        }
      fprintf (File, "\n\t");
      WriteNFloats (File, N, Proto->Variance.Elliptical);
  }
}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/clusttool.h b/tesseract/src/classify/clusttool.h
new file mode 100644
index 00000000..ead65618
--- /dev/null
+++ b/tesseract/src/classify/clusttool.h
@@ -0,0 +1,43 @@
+/******************************************************************************
+ ** Filename: clusttool.h
+ ** Purpose: Definition of clustering utility tools
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef TESSERACT_CLASSIFY_CLUSTTOOL_H_
+#define TESSERACT_CLASSIFY_CLUSTTOOL_H_
+
+#include "cluster.h"
+
+#include "serialis.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
// Reads the feature-space dimensionality from fp (0..MAXSAMPLESIZE).
uint16_t ReadSampleSize(tesseract::TFile *fp);

// Reads N parameter descriptors; returns a malloc'd array owned by the caller.
PARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uint16_t N);

// Reads one prototype with N dimensions; returns nullptr on bad input.
PROTOTYPE *ReadPrototype(tesseract::TFile *fp, uint16_t N);

// Writes N parameter descriptors to File in the format ReadParamDesc reads.
TESS_API
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]);

// Writes a textual description of Proto (N dimensions) to File.
TESS_API
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto);
+
+} // namespace tesseract
+
+#endif // TESSERACT_CLASSIFY_CLUSTTOOL_H_
diff --git a/tesseract/src/classify/cutoffs.cpp b/tesseract/src/classify/cutoffs.cpp
new file mode 100644
index 00000000..f75788d8
--- /dev/null
+++ b/tesseract/src/classify/cutoffs.cpp
@@ -0,0 +1,73 @@
+/******************************************************************************
+ ** Filename: cutoffs.c
+ ** Purpose: Routines to manipulate an array of class cutoffs.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+/*----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------*/
+
+#include <cstdio>
+#include <sstream> // for std::istringstream
+#include <string> // for std::string
+
+#include "classify.h"
+#include "helpers.h"
+#include "serialis.h"
+#include <tesseract/unichar.h>
+
+#define MAX_CUTOFF 1000
+
+namespace tesseract {
+/**
+ * Open file, read in all of the class-id/cutoff pairs
+ * and insert them into the Cutoffs array. Cutoffs are
+ * indexed in the array by class id. Unused entries in the
+ * array are set to an arbitrarily high cutoff value.
+ * @param fp file containing cutoff definitions
+ * @param Cutoffs array to put cutoffs into
+ */
+void Classify::ReadNewCutoffs(TFile* fp, uint16_t* Cutoffs) {
+ int Cutoff;
+
+ if (shape_table_ != nullptr) {
+ if (!shapetable_cutoffs_.DeSerialize(fp)) {
+ tprintf("Error during read of shapetable pffmtable!\n");
+ }
+ }
+ for (int i = 0; i < MAX_NUM_CLASSES; i++)
+ Cutoffs[i] = MAX_CUTOFF;
+
+ const int kMaxLineSize = 100;
+ char line[kMaxLineSize];
+ while (fp->FGets(line, kMaxLineSize) != nullptr) {
+ std::string Class;
+ CLASS_ID ClassId;
+ std::istringstream stream(line);
+ stream.imbue(std::locale::classic());
+ stream >> Class >> Cutoff;
+ if (stream.fail()) {
+ break;
+ }
+ if (Class.compare("NULL") == 0) {
+ ClassId = unicharset.unichar_to_id(" ");
+ } else {
+ ClassId = unicharset.unichar_to_id(Class.c_str());
+ }
+ ASSERT_HOST(ClassId >= 0 && ClassId < MAX_NUM_CLASSES);
+ Cutoffs[ClassId] = Cutoff;
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/featdefs.cpp b/tesseract/src/classify/featdefs.cpp
new file mode 100644
index 00000000..54647431
--- /dev/null
+++ b/tesseract/src/classify/featdefs.cpp
@@ -0,0 +1,280 @@
+/******************************************************************************
+ ** Filename: featdefs.cpp
+ ** Purpose: Definitions of currently defined feature types.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
#include "featdefs.h"

#include "picofeat.h" // for PicoFeatureLength
#include "scanutils.h"

#include <cmath>  // for std::isnan, std::isinf
#include <cstdio>
#include <cstring>
+
+namespace tesseract {
+
+#define PICO_FEATURE_LENGTH 0.05
+
+/*-----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+-----------------------------------------------------------------------------*/
+const char* const kMicroFeatureType = "mf";
+const char* const kCNFeatureType = "cn";
+const char* const kIntFeatureType = "if";
+const char* const kGeoFeatureType = "tb";
+
+// Define all of the parameters for the MicroFeature type.
+StartParamDesc(MicroFeatureParams)
+DefineParam(0, 0, -0.5, 0.5)
+DefineParam(0, 0, -0.25, 0.75)
+DefineParam(0, 1, 0.0, 1.0)
+DefineParam(1, 0, 0.0, 1.0)
+DefineParam (0, 1, -0.5, 0.5)
+DefineParam (0, 1, -0.5, 0.5)
+EndParamDesc
+// Now define the feature type itself (see features.h for parameters).
+DefineFeature(MicroFeatureDesc, 5, 1, kMicroFeatureType, MicroFeatureParams)
+
+// Define all of the parameters for the NormFeat type.
+StartParamDesc (CharNormParams)
+DefineParam(0, 0, -0.25, 0.75)
+DefineParam(0, 1, 0.0, 1.0)
+DefineParam(0, 0, 0.0, 1.0)
+DefineParam(0, 0, 0.0, 1.0)
+EndParamDesc
+// Now define the feature type itself (see features.h for parameters).
+DefineFeature(CharNormDesc, 4, 0, kCNFeatureType, CharNormParams)
+
+// Define all of the parameters for the IntFeature type
+StartParamDesc(IntFeatParams)
+DefineParam(0, 0, 0.0, 255.0)
+DefineParam(0, 0, 0.0, 255.0)
+DefineParam(1, 0, 0.0, 255.0)
+EndParamDesc
+// Now define the feature type itself (see features.h for parameters).
+DefineFeature(IntFeatDesc, 2, 1, kIntFeatureType, IntFeatParams)
+
+// Define all of the parameters for the GeoFeature type
+StartParamDesc(GeoFeatParams)
+DefineParam(0, 0, 0.0, 255.0)
+DefineParam(0, 0, 0.0, 255.0)
+DefineParam(0, 0, 0.0, 255.0)
+EndParamDesc
+// Now define the feature type itself (see features.h for parameters).
+DefineFeature(GeoFeatDesc, 3, 0, kGeoFeatureType, GeoFeatParams)
+
+// Other features used for training the adaptive classifier, but not used
+// during normal training, therefore not in the DescDefs array.
+
+// Define all of the parameters for the PicoFeature type
+// define knob that can be used to adjust pico-feature length.
+float PicoFeatureLength = PICO_FEATURE_LENGTH;
+StartParamDesc(PicoFeatParams)
+DefineParam(0, 0, -0.25, 0.75)
+DefineParam(1, 0, 0.0, 1.0)
+DefineParam(0, 0, -0.5, 0.5)
+EndParamDesc
+// Now define the feature type itself (see features.h for parameters).
+DefineFeature(PicoFeatDesc, 2, 1, "pf", PicoFeatParams)
+
+// Define all of the parameters for the OutlineFeature type.
+StartParamDesc(OutlineFeatParams)
+DefineParam(0, 0, -0.5, 0.5)
+DefineParam(0, 0, -0.25, 0.75)
+DefineParam(0, 0, 0.0, 1.0)
+DefineParam(1, 0, 0.0, 1.0)
+EndParamDesc
+// Now define the feature type itself (see features.h for parameters).
+DefineFeature(OutlineFeatDesc, 3, 1, "of", OutlineFeatParams)
+
+// MUST be kept in-sync with ExtractorDefs in fxdefs.cpp.
// Feature types used during normal training, indexed by feature type id.
static const FEATURE_DESC_STRUCT *DescDefs[NUM_FEATURE_TYPES] = {
  &MicroFeatureDesc,  // "mf" micro-features
  &CharNormDesc,      // "cn" character normalization
  &IntFeatDesc,       // "if" integer features
  &GeoFeatDesc        // "tb" geometric features
};
+
+/*-----------------------------------------------------------------------------
+ Public Code
+-----------------------------------------------------------------------------*/
+void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs) {
+ featuredefs->NumFeatureTypes = NUM_FEATURE_TYPES;
+ for (int i = 0; i < NUM_FEATURE_TYPES; ++i) {
+ featuredefs->FeatureDesc[i] = DescDefs[i];
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Release the memory consumed by the specified character
+ * description and all of the features in that description.
+ *
+ * @param CharDesc character description to be deallocated
+ *
+ * Globals:
+ * - none
+ */
+void FreeCharDescription(CHAR_DESC CharDesc) {
+ if (CharDesc) {
+ for (size_t i = 0; i < CharDesc->NumFeatureSets; i++)
+ FreeFeatureSet (CharDesc->FeatureSets[i]);
+ free(CharDesc);
+ }
+} /* FreeCharDescription */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Allocate a new character description, initialize its
+ * feature sets to be empty, and return it.
+ *
+ * Globals:
+ * - none
+ *
+ * @return New character description structure.
+ */
+CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs) {
+ CHAR_DESC CharDesc;
+ CharDesc = static_cast<CHAR_DESC>(malloc (sizeof (CHAR_DESC_STRUCT)));
+ CharDesc->NumFeatureSets = FeatureDefs.NumFeatureTypes;
+
+ for (size_t i = 0; i < CharDesc->NumFeatureSets; i++)
+ CharDesc->FeatureSets[i] = nullptr;
+
+ return (CharDesc);
+} /* NewCharDescription */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Appends a textual representation of CharDesc to str.
+ * The format used is to write out the number of feature
+ * sets which will be written followed by a representation of
+ * each feature set.
+ *
+ * Each set starts with the short name for that feature followed
+ * by a description of the feature set. Feature sets which are
+ * not present are not written.
+ *
+ * @param FeatureDefs definitions of feature types/extractors
+ * @param str string to append CharDesc to
+ * @param CharDesc character description to write to File
+ */
+void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs,
+ CHAR_DESC CharDesc, STRING* str) {
+ int NumSetsToWrite = 0;
+
+ for (size_t Type = 0; Type < CharDesc->NumFeatureSets; Type++)
+ if (CharDesc->FeatureSets[Type])
+ NumSetsToWrite++;
+
+ str->add_str_int(" ", NumSetsToWrite);
+ *str += "\n";
+ for (size_t Type = 0; Type < CharDesc->NumFeatureSets; Type++) {
+ if (CharDesc->FeatureSets[Type]) {
+ *str += FeatureDefs.FeatureDesc[Type]->ShortName;
+ *str += " ";
+ WriteFeatureSet(CharDesc->FeatureSets[Type], str);
+ }
+ }
+} /* WriteCharDescription */
+
+// Return whether all of the fields of the given feature set
+// are well defined (not inf or nan).
+bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
+ CHAR_DESC CharDesc) {
+ bool anything_written = false;
+ bool well_formed = true;
+ for (size_t Type = 0; Type < CharDesc->NumFeatureSets; Type++) {
+ if (CharDesc->FeatureSets[Type]) {
+ for (int i = 0; i < CharDesc->FeatureSets[Type]->NumFeatures; i++) {
+ FEATURE feat = CharDesc->FeatureSets[Type]->Features[i];
+ for (int p = 0; p < feat->Type->NumParams; p++) {
+ if (std::isnan(feat->Params[p]) || std::isinf(feat->Params[p]))
+ well_formed = false;
+ else
+ anything_written = true;
+ }
+ }
+ } else {
+ return false;
+ }
+ }
+ return anything_written && well_formed;
+} /* ValidCharDescription */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Read a character description from File, and return
+ * a data structure containing this information. The data
+ * is formatted as follows:
+ * @verbatim
+ NumberOfSets
+ ShortNameForSet1 Set1
+ ShortNameForSet2 Set2
+ ...
+ @endverbatim
+ *
+ * Globals:
+ * - none
+ *
+ * @param FeatureDefs definitions of feature types/extractors
+ * @param File open text file to read character description from
+ * @return Character description read from File.
+ */
+CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
+ FILE *File) {
+ int NumSetsToRead;
+ char ShortName[FEAT_NAME_SIZE];
+ CHAR_DESC CharDesc;
+ int Type;
+
+ ASSERT_HOST(tfscanf(File, "%d", &NumSetsToRead) == 1);
+ ASSERT_HOST(NumSetsToRead >= 0);
+ ASSERT_HOST(NumSetsToRead <= FeatureDefs.NumFeatureTypes);
+
+ CharDesc = NewCharDescription(FeatureDefs);
+ for (; NumSetsToRead > 0; NumSetsToRead--) {
+ tfscanf(File, "%s", ShortName);
+ Type = ShortNameToFeatureType(FeatureDefs, ShortName);
+ CharDesc->FeatureSets[Type] =
+ ReadFeatureSet (File, FeatureDefs.FeatureDesc[Type]);
+ }
+ return CharDesc;
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Search through all features currently defined and return
+ * the feature type for the feature with the specified short
+ * name. Trap an error if the specified name is not found.
+ *
+ * Globals:
+ * - none
+ *
+ * @param FeatureDefs definitions of feature types/extractors
+ * @param ShortName short name of a feature type
+ * @return Feature type which corresponds to ShortName.
+ */
+uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs,
+ const char *ShortName) {
+ for (int i = 0; i < FeatureDefs.NumFeatureTypes; i++)
+ if (!strcmp ((FeatureDefs.FeatureDesc[i]->ShortName), ShortName))
+ return static_cast<uint32_t>(i);
+ ASSERT_HOST(!"Illegal short name for a feature");
+ return 0;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/featdefs.h b/tesseract/src/classify/featdefs.h
new file mode 100644
index 00000000..eb8c66fe
--- /dev/null
+++ b/tesseract/src/classify/featdefs.h
@@ -0,0 +1,87 @@
+/******************************************************************************
+ ** Filename: featdefs.h
+ ** Purpose: Definitions of currently defined feature types.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef FEATDEFS_H
+#define FEATDEFS_H
+
+#include "ocrfeatures.h"
+
+namespace tesseract {
+
+/* Enumerate the different types of features currently defined. */
+#define NUM_FEATURE_TYPES 4
+extern TESS_API const char* const kMicroFeatureType;
+extern TESS_API const char* const kCNFeatureType;
+extern TESS_API const char* const kIntFeatureType;
+extern TESS_API const char* const kGeoFeatureType;
+
+/* A character is described by multiple sets of extracted features. Each
+ set contains a number of features of a particular type, for example, a
+ set of bays, or a set of closures, or a set of microfeatures. Each
+ feature consists of a number of parameters. All features within a
+ feature set contain the same number of parameters.*/
+
// A complete description of one character: one feature set per defined
// feature type, indexed by feature type id.
struct CHAR_DESC_STRUCT {
  uint32_t NumFeatureSets;                     // number of valid entries below
  FEATURE_SET FeatureSets[NUM_FEATURE_TYPES];  // nullptr when a set is absent
};
using CHAR_DESC = CHAR_DESC_STRUCT *;
+
// The set of feature-type descriptors in use, filled in by InitFeatureDefs.
struct FEATURE_DEFS_STRUCT {
  int32_t NumFeatureTypes;  // number of valid entries in FeatureDesc
  const FEATURE_DESC_STRUCT* FeatureDesc[NUM_FEATURE_TYPES];
};
using FEATURE_DEFS = FEATURE_DEFS_STRUCT *;
+
+/*----------------------------------------------------------------------
+ Generic functions for manipulating character descriptions
+----------------------------------------------------------------------*/
+TESS_API
+void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs);
+
+TESS_API
+void FreeCharDescription(CHAR_DESC CharDesc);
+
+CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs);
+
+bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
+ CHAR_DESC CharDesc);
+
+void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs,
+ CHAR_DESC CharDesc, STRING* str);
+
+TESS_API
+CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
+ FILE *File);
+
+TESS_API
+uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs,
+ const char *ShortName);
+
+/**----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+----------------------------------------------------------------------------**/
+extern const FEATURE_DESC_STRUCT MicroFeatureDesc;
+extern TESS_API const FEATURE_DESC_STRUCT PicoFeatDesc;
+extern const FEATURE_DESC_STRUCT CharNormDesc;
+extern const FEATURE_DESC_STRUCT OutlineFeatDesc;
+extern const FEATURE_DESC_STRUCT IntFeatDesc;
+extern const FEATURE_DESC_STRUCT GeoFeatDesc;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/float2int.cpp b/tesseract/src/classify/float2int.cpp
new file mode 100644
index 00000000..1b48779b
--- /dev/null
+++ b/tesseract/src/classify/float2int.cpp
@@ -0,0 +1,109 @@
+/******************************************************************************
+ ** Filename: float2int.cpp
+ ** Purpose: Routines for converting float features to int features
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "float2int.h"
+
+#include "normmatch.h"
+#include "mfoutline.h"
+#include "classify.h"
+#include "picofeat.h"
+
+#include "helpers.h"
+
+#define MAX_INT_CHAR_NORM (INT_CHAR_NORM_RANGE - 1)
+
+/*---------------------------------------------------------------------------*/
+namespace tesseract {
+
+/**
+ * For each class in the unicharset, clears the corresponding
+ * entry in char_norm_array. char_norm_array is indexed by unichar_id.
+ *
+ * Globals:
+ * - none
+ *
+ * @param char_norm_array array to be cleared
+ */
+void Classify::ClearCharNormArray(uint8_t* char_norm_array) {
+ memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
+} /* ClearCharNormArray */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * For each class in unicharset, computes the match between
+ * norm_feature and the normalization protos for that class.
+ * Converts this number to the range from 0 - 255 and stores it
+ * into char_norm_array. CharNormArray is indexed by unichar_id.
+ *
+ * Globals:
+ * - PreTrainedTemplates current set of built-in templates
+ *
+ * @param norm_feature character normalization feature
+ * @param[out] char_norm_array place to put results of size unicharset.size()
+ */
+void Classify::ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
+ uint8_t* char_norm_array) {
+ for (int i = 0; i < unicharset.size(); i++) {
+ if (i < PreTrainedTemplates->NumClasses) {
+ int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
+ ComputeNormMatch(i, norm_feature, false));
+ char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
+ } else {
+ // Classes with no templates (eg. ambigs & ligatures) default
+ // to worst match.
+ char_norm_array[i] = MAX_INT_CHAR_NORM;
+ }
+ }
+} /* ComputeIntCharNormArray */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine converts each floating point pico-feature
+ * in Features into integer format and saves it into
+ * IntFeatures.
+ *
+ * Globals:
+ * - none
+ *
+ * @param Features floating point pico-features to be converted
+ * @param[out] IntFeatures array to put converted features into
+ */
+void Classify::ComputeIntFeatures(FEATURE_SET Features,
+ INT_FEATURE_ARRAY IntFeatures) {
+ float YShift;
+
+ if (classify_norm_method == baseline)
+ YShift = BASELINE_Y_SHIFT;
+ else
+ YShift = Y_SHIFT;
+
+ for (int Fid = 0; Fid < Features->NumFeatures; Fid++) {
+ FEATURE Feature = Features->Features[Fid];
+
+ IntFeatures[Fid].X =
+ Bucket8For(Feature->Params[PicoFeatX], X_SHIFT, INT_FEAT_RANGE);
+ IntFeatures[Fid].Y =
+ Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
+ IntFeatures[Fid].Theta = CircBucketFor(Feature->Params[PicoFeatDir],
+ ANGLE_SHIFT, INT_FEAT_RANGE);
+ IntFeatures[Fid].CP_misses = 0;
+ }
+} /* ComputeIntFeatures */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/float2int.h b/tesseract/src/classify/float2int.h
new file mode 100644
index 00000000..70a05ab6
--- /dev/null
+++ b/tesseract/src/classify/float2int.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ ** Filename: float2int.h
+ ** Purpose: Routines for converting float features to int features
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef FLOAT2INT_H
+#define FLOAT2INT_H
+
+/*-----------------------------------------------------------------------------
+ Include Files and Type Defines
+-----------------------------------------------------------------------------*/
+#include "intmatcher.h"
+#include "ocrfeatures.h"
+
+#define INT_FEAT_RANGE 256
+#define BASELINE_Y_SHIFT (0.25)
+
+#endif
diff --git a/tesseract/src/classify/fpoint.cpp b/tesseract/src/classify/fpoint.cpp
new file mode 100644
index 00000000..333b1fc7
--- /dev/null
+++ b/tesseract/src/classify/fpoint.cpp
@@ -0,0 +1,54 @@
+/******************************************************************************
+ ** Filename: fpoint.cpp
+ ** Purpose: Abstract data type for a 2D point (floating point coords)
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+/*----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------*/
+#define _USE_MATH_DEFINES // for M_PI
+#include "fpoint.h"
+#include <cstdio>
+#include <cmath> // for M_PI
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+
+float DistanceBetween(FPOINT A, FPOINT B) {
+ const double xd = XDelta(A, B);
+ const double yd = YDelta(A, B);
+ return sqrt(static_cast<double>(xd * xd + yd * yd));
+}
+
+/**
+ * Return the angle from Point1 to Point2 normalized to
+ * lie in the range 0 to FullScale (where FullScale corresponds
+ * to 2*pi or 360 degrees).
+ * @param Point1 points to compute angle between
+ * @param Point2 points to compute angle between
+ * @param FullScale value to associate with 2*pi
+ * @return angle
+ */
+float NormalizedAngleFrom(FPOINT *Point1, FPOINT *Point2, float FullScale) {
+ float NumRadsInCircle = 2.0 * M_PI;
+
+ float Angle = AngleFrom (*Point1, *Point2);
+ if (Angle < 0.0)
+ Angle += NumRadsInCircle;
+ Angle *= FullScale / NumRadsInCircle;
+ if (Angle < 0.0 || Angle >= FullScale)
+ Angle = 0.0;
+ return (Angle);
+}
diff --git a/tesseract/src/classify/fpoint.h b/tesseract/src/classify/fpoint.h
new file mode 100644
index 00000000..93f5a20f
--- /dev/null
+++ b/tesseract/src/classify/fpoint.h
@@ -0,0 +1,53 @@
+/******************************************************************************
+ ** Filename: fpoint.h
+ ** Purpose: Abstract data type for 2D points (floating point coords)
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef FPOINT_H
+#define FPOINT_H
+
+/**----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------**/
+#include <cmath>
+#include <cstdio>
+
+/* define data structure to hold 2D points or vectors using floating point */
/* 2D point or vector with floating point coordinates. In C++ the plain
   struct declaration is preferred over the C-style anonymous typedef;
   the file already uses C++11 `using` aliases, so this is consistent. */
struct FPOINT {
  float x, y;
};
using FVECTOR = FPOINT;
+
+/**----------------------------------------------------------------------------
+ Macros
+----------------------------------------------------------------------------**/
+/* macros for computing miscellaneous functions of 2 points */
+#define XDelta(A, B) ((B).x - (A).x)
+#define YDelta(A, B) ((B).y - (A).y)
+#define SlopeFrom(A, B) (YDelta(A, B) / XDelta(A, B))
+#define AngleFrom(A, B) (atan2((double)YDelta(A, B), (double)XDelta(A, B)))
+
+#define XIntersectionOf(A, B, X) (SlopeFrom(A, B) * ((X)-A.x) + A.y)
+
+/*-------------------------------------------------------------------------
+ Public Function Prototypes
+---------------------------------------------------------------------------*/
+
+float DistanceBetween(FPOINT A, FPOINT B);
+
+float NormalizedAngleFrom(FPOINT* Point1, FPOINT* Point2, float FullScale);
+
+#endif
diff --git a/tesseract/src/classify/intfeaturespace.cpp b/tesseract/src/classify/intfeaturespace.cpp
new file mode 100644
index 00000000..9ddd9777
--- /dev/null
+++ b/tesseract/src/classify/intfeaturespace.cpp
@@ -0,0 +1,124 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: intfeaturespace.cpp
+// Description: Indexed feature space based on INT_FEATURE_STRUCT.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#define _USE_MATH_DEFINES // for M_PI
+#include "intfeaturespace.h"
+#include <cmath> // for M_PI
+#include "intfx.h"
+
+namespace tesseract {
+
+IntFeatureSpace::IntFeatureSpace()
+ : x_buckets_(0), y_buckets_(0), theta_buckets_(0) {
+}
+
// Set up the feature space to quantize x, y and theta into the given
// numbers of buckets. Must be called before Index()/Size() are meaningful.
void IntFeatureSpace::Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets) {
  x_buckets_ = xbuckets;
  y_buckets_ = ybuckets;
  theta_buckets_ = thetabuckets;
}
+
+// Serializes the feature space definition to the given file.
+// Returns false on error.
+bool IntFeatureSpace::Serialize(FILE* fp) const {
+ if (fwrite(&x_buckets_, sizeof(x_buckets_), 1, fp) != 1)
+ return false;
+ if (fwrite(&y_buckets_, sizeof(y_buckets_), 1, fp) != 1)
+ return false;
+ if (fwrite(&theta_buckets_, sizeof(theta_buckets_), 1, fp) != 1)
+ return false;
+ return true;
+}
+
+// Returns an INT_FEATURE_STRUCT corresponding to the given index.
+// This is the inverse of the Index member.
+INT_FEATURE_STRUCT IntFeatureSpace::PositionFromIndex(int index) const {
+ return PositionFromBuckets(index / (y_buckets_ * theta_buckets_),
+ index / theta_buckets_ % y_buckets_,
+ index % theta_buckets_);
+}
+
+// Bulk calls to Index. Maps the given array of features to a vector of
+// int32_t indices in the same order as the input.
+void IntFeatureSpace::IndexFeatures(const INT_FEATURE_STRUCT* features,
+ int num_features,
+ GenericVector<int>* mapped_features) const {
+ mapped_features->truncate(0);
+ for (int f = 0; f < num_features; ++f)
+ mapped_features->push_back(Index(features[f]));
+}
+
+// Bulk calls to Index. Maps the given array of features to a vector of
+// sorted int32_t indices.
+void IntFeatureSpace::IndexAndSortFeatures(
+ const INT_FEATURE_STRUCT* features, int num_features,
+ GenericVector<int>* sorted_features) const {
+ sorted_features->truncate(0);
+ for (int f = 0; f < num_features; ++f)
+ sorted_features->push_back(Index(features[f]));
+ sorted_features->sort();
+}
+
+// Returns a feature space index for the given x,y position in a display
+// window, or -1 if the feature is a miss.
// Debug helper: returns a feature space index for the given x,y position in
// a display window, or -1 if the feature is a miss. Prints diagnostics via
// tprintf as it goes.
int IntFeatureSpace::XYToFeatureIndex(int x, int y) const {
  // Round the x,y position to a feature. Search for a valid theta.
  INT_FEATURE_STRUCT feature(x, y, 0);
  int index = -1;
  for (int theta = 0; theta <= UINT8_MAX && index < 0; ++theta) {
    feature.Theta = theta;
    index = Index(feature);
  }
  if (index < 0) {
    tprintf("(%d,%d) does not exist in feature space!\n", x, y);
    return -1;
  }
  // Snap to the center of the bucket the click landed in.
  feature = PositionFromIndex(index);
  tprintf("Click at (%d, %d) ->(%d, %d), ->(%d, %d)\n",
          x, y, feature.X, feature.Y, x - feature.X, y - feature.Y);
  // Get the relative position of x,y from the rounded feature.
  x -= feature.X;
  y -= feature.Y;
  if (x != 0 || y != 0) {
    // Derive theta from the direction of the click offset: atan2 gives
    // [-pi, pi]; +M_PI shifts to [0, 2*pi] before scaling to [0, 256].
    double angle = atan2(static_cast<double>(y), static_cast<double>(x)) + M_PI;
    angle *= kIntFeatureExtent / (2.0 * M_PI);
    feature.Theta = static_cast<uint8_t>(angle + 0.5);
    index = Index(feature);
    if (index < 0) {
      tprintf("Feature failed to map to a valid index:");
      feature.print();
      return -1;
    }
    // Re-snap with the derived theta so the printed feature is canonical.
    feature = PositionFromIndex(index);
  }
  feature.print();
  return index;
}
+
+// Returns an INT_FEATURE_STRUCT corresponding to the given bucket coords.
+INT_FEATURE_STRUCT IntFeatureSpace::PositionFromBuckets(int x,
+ int y,
+ int theta) const {
+ INT_FEATURE_STRUCT pos(
+ (x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_,
+ (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_,
+ DivRounded(theta * kIntFeatureExtent, theta_buckets_));
+ return pos;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/classify/intfeaturespace.h b/tesseract/src/classify/intfeaturespace.h
new file mode 100644
index 00000000..3f21e4d3
--- /dev/null
+++ b/tesseract/src/classify/intfeaturespace.h
@@ -0,0 +1,104 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: intfeaturespace.h
+// Description: Indexed feature space based on INT_FEATURE_STRUCT.
+// Created: Wed Mar 24 10:55:30 PDT 2010
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CLASSIFY_INTFEATURESPACE_H_
+#define TESSERACT_CLASSIFY_INTFEATURESPACE_H_
+
+#include "genericvector.h"
+#include "intproto.h"
+
+// Extent of x,y,theta in the input feature space. [0,255].
+const int kIntFeatureExtent = 256;
+// Extent of x,y,theta dimensions in the quantized feature space.
+const int kBoostXYBuckets = 16;
+const int kBoostDirBuckets = 16;
+
+namespace tesseract {
+
+class IndexMap;
+
+// Down-sampling quantization of the INT_FEATURE_STRUCT feature space and
+// conversion to a single scalar index value, used as a binary feature space.
class TESS_API IntFeatureSpace {
 public:
  IntFeatureSpace();
  // Default copy constructors and assignment OK!

  // Setup the feature space with the given dimensions.
  void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets);

  // Serializes the feature space definition to the given file.
  // Returns false on error.
  bool Serialize(FILE* fp) const;

  // Returns the total size of the feature space.
  int Size() const {
    return static_cast<int>(x_buckets_) * y_buckets_ * theta_buckets_;
  }
  // Returns an INT_FEATURE_STRUCT corresponding to the given index.
  // This is the inverse of the Index member.
  INT_FEATURE_STRUCT PositionFromIndex(int index) const;

  // Returns a 1-dimensional index corresponding to the given feature value.
  // Range is [0, Size()-1]. Inverse of PositionFromIndex member.
  // Layout: x is the most significant dimension, theta the least.
  int Index(const INT_FEATURE_STRUCT& f) const {
    return (XBucket(f.X) * y_buckets_ + YBucket(f.Y)) * theta_buckets_ +
        ThetaBucket(f.Theta);
  }
  // Bulk calls to Index. Maps the given array of features to a vector of
  // int32_t indices in the same order as the input.
  void IndexFeatures(const INT_FEATURE_STRUCT* features, int num_features,
                     GenericVector<int>* mapped_features) const;
  // Bulk calls to Index. Maps the given array of features to a vector of
  // sorted int32_t indices.
  void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features,
                            int num_features,
                            GenericVector<int>* sorted_features) const;
  // Returns a feature space index for the given x,y position in a display
  // window, or -1 if the feature is a miss.
  int XYToFeatureIndex(int x, int y) const;

 protected:
  // Converters to generate indices for individual feature dimensions.
  // X/Y use truncating division, clipped into the valid bucket range.
  int XBucket(int x) const {
    int bucket = x * x_buckets_ / kIntFeatureExtent;
    return ClipToRange(bucket, 0, static_cast<int>(x_buckets_) - 1);
  }
  int YBucket(int y) const {
    int bucket = y * y_buckets_ / kIntFeatureExtent;
    return ClipToRange(bucket, 0, static_cast<int>(y_buckets_) - 1);
  }
  // Use DivRounded for theta so that exactly vertical and horizontal are in
  // the middle of a bucket. The Modulo takes care of the wrap-around.
  int ThetaBucket(int theta) const {
    int bucket = DivRounded(theta * theta_buckets_, kIntFeatureExtent);
    return Modulo(bucket, theta_buckets_);
  }
  // Returns an INT_FEATURE_STRUCT corresponding to the given buckets
  // (bucket centers; see the .cpp implementation).
  INT_FEATURE_STRUCT PositionFromBuckets(int x, int y, int theta) const;

  // Feature space definition - serialized.
  uint8_t x_buckets_;      // number of buckets in the x dimension
  uint8_t y_buckets_;      // number of buckets in the y dimension
  uint8_t theta_buckets_;  // number of buckets in the direction dimension
};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CLASSIFY_INTFEATURESPACE_H_
diff --git a/tesseract/src/classify/intfx.cpp b/tesseract/src/classify/intfx.cpp
new file mode 100644
index 00000000..062b0f1e
--- /dev/null
+++ b/tesseract/src/classify/intfx.cpp
@@ -0,0 +1,488 @@
+/******************************************************************************
+ ** Filename: intfx.c
+ ** Purpose: Integer character normalization & feature extraction
+ ** Author: Robert Moss, rays@google.com (Ray Smith)
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+/**----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------**/
+
+#define _USE_MATH_DEFINES // for M_PI
+
+#include "intfx.h"
+
+#include "classify.h"
+#include "intmatcher.h"
+#include "linlsq.h"
+#include "normalis.h"
+#include "statistc.h"
+#include "trainingsample.h"
+
+#include "helpers.h"
+
+#include "allheaders.h"
+
+#include <cmath> // for M_PI
+#include <mutex> // for std::mutex
+
+namespace tesseract {
+
+/**----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+----------------------------------------------------------------------------**/
+// Look up table for cos and sin to turn the intfx feature angle to a vector.
+// Protected by atan_table_mutex.
+// The entries are in binary degrees where a full circle is 256 binary degrees.
+static float cos_table[INT_CHAR_NORM_RANGE];
+static float sin_table[INT_CHAR_NORM_RANGE];
+
+/**----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------**/
+
+void InitIntegerFX() {
+ // Guards write access to AtanTable so we don't create it more than once.
+ static std::mutex atan_table_mutex;
+ static bool atan_table_init = false;
+ std::lock_guard<std::mutex> guard(atan_table_mutex);
+ if (!atan_table_init) {
+ for (int i = 0; i < INT_CHAR_NORM_RANGE; ++i) {
+ cos_table[i] = cos(i * 2 * M_PI / INT_CHAR_NORM_RANGE + M_PI);
+ sin_table[i] = sin(i * 2 * M_PI / INT_CHAR_NORM_RANGE + M_PI);
+ }
+ atan_table_init = true;
+ }
+}
+
// Returns a vector representing the direction of a feature with the given
// theta direction in an INT_FEATURE_STRUCT. theta is in binary degrees
// (full circle = 256). Requires InitIntegerFX() to have been called first
// to populate cos_table/sin_table.
FCOORD FeatureDirection(uint8_t theta) {
  return FCOORD(cos_table[theta], sin_table[theta]);
}
+
+// Generates a TrainingSample from a TBLOB. Extracts features and sets
+// the bounding box, so classifiers that operate on the image can work.
+// TODO(rays) Make BlobToTrainingSample a member of Classify now that
+// the FlexFx and FeatureDescription code have been removed and LearnBlob
+// is now a member of Classify.
+TrainingSample* BlobToTrainingSample(
+ const TBLOB& blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT* fx_info,
+ std::vector<INT_FEATURE_STRUCT>* bl_features) {
+ std::vector<INT_FEATURE_STRUCT> cn_features;
+ Classify::ExtractFeatures(blob, nonlinear_norm, bl_features,
+ &cn_features, fx_info, nullptr);
+ // TODO(rays) Use blob->PreciseBoundingBox() instead.
+ TBOX box = blob.bounding_box();
+ TrainingSample* sample = nullptr;
+ int num_features = fx_info->NumCN;
+ if (num_features > 0) {
+ sample = TrainingSample::CopyFromFeatures(*fx_info, box, &cn_features[0],
+ num_features);
+ }
+ if (sample != nullptr) {
+ // Set the bounding box (in original image coordinates) in the sample.
+ TPOINT topleft, botright;
+ topleft.x = box.left();
+ topleft.y = box.top();
+ botright.x = box.right();
+ botright.y = box.bottom();
+ TPOINT original_topleft, original_botright;
+ blob.denorm().DenormTransform(nullptr, topleft, &original_topleft);
+ blob.denorm().DenormTransform(nullptr, botright, &original_botright);
+ sample->set_bounding_box(TBOX(original_topleft.x, original_botright.y,
+ original_botright.x, original_topleft.y));
+ }
+ return sample;
+}
+
// Computes the DENORMS for bl(baseline) and cn(character) normalization
// during feature extraction. The input denorm describes the current state
// of the blob, which is usually a baseline-normalized word.
// The Transforms setup are as follows:
// Baseline Normalized (bl) Output:
//   We center the grapheme by aligning the x-coordinate of its centroid with
//   x=128 and leaving the already-baseline-normalized y as-is.
//
// Character Normalized (cn) Output:
//   We align the grapheme's centroid at the origin and scale it
//   asymmetrically in x and y so that the 2nd moments are a standard value
//   (51.2) ie the result is vaguely square.
//   If classify_nonlinear_norm is true:
//     A non-linear normalization is setup that attempts to evenly distribute
//     edges across x and y.
//
// Some of the fields of fx_info are also setup:
// Length: Total length of outline.
// Rx:     Rounded y second moment. (Reversed by convention.)
// Ry:     rounded x second moment.
// Xmean:  Rounded x center of mass of the blob.
// Ymean:  Rounded y center of mass of the blob.
void Classify::SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm,
                                DENORM* bl_denorm, DENORM* cn_denorm,
                                INT_FX_RESULT_STRUCT* fx_info) {
  // Compute 1st and 2nd moments of the original outline.
  FCOORD center, second_moments;
  int length = blob.ComputeMoments(&center, &second_moments);
  if (fx_info != nullptr) {
    fx_info->Length = length;
    // Deliberate x/y swap: Rx holds the y moment and Ry the x moment, per
    // the "Reversed by convention" note above.
    fx_info->Rx = IntCastRounded(second_moments.y());
    fx_info->Ry = IntCastRounded(second_moments.x());

    fx_info->Xmean = IntCastRounded(center.x());
    fx_info->Ymean = IntCastRounded(center.y());
  }
  // Setup the denorm for Baseline normalization: shift x so the centroid
  // lands at 128; y is passed through unchanged (scale 1.0).
  bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f,
                                1.0f, 1.0f, 128.0f, 128.0f);
  // Setup the denorm for character normalization.
  if (nonlinear_norm) {
    GenericVector<GenericVector<int> > x_coords;
    GenericVector<GenericVector<int> > y_coords;
    TBOX box;
    blob.GetPreciseBoundingBox(&box);
    // Pad by 1 so edge coordinates on the boundary stay inside the box.
    box.pad(1, 1);
    blob.GetEdgeCoords(box, &x_coords, &y_coords);
    cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX,
                              0.0f, 0.0f, x_coords, y_coords);
  } else {
    // NOTE(review): assumes second_moments.x()/y() are nonzero here —
    // presumably guaranteed by ComputeMoments for a non-empty blob; verify.
    cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(),
                                  center.x(), center.y(),
                                  51.2f / second_moments.x(),
                                  51.2f / second_moments.y(),
                                  128.0f, 128.0f);
  }
}
+
+// Helper normalizes the direction, assuming that it is at the given
+// unnormed_pos, using the given denorm, starting at the root_denorm.
+static uint8_t NormalizeDirection(uint8_t dir, const FCOORD& unnormed_pos,
+ const DENORM& denorm,
+ const DENORM* root_denorm) {
+ // Convert direction to a vector.
+ FCOORD unnormed_end;
+ unnormed_end.from_direction(dir);
+ unnormed_end += unnormed_pos;
+ FCOORD normed_pos, normed_end;
+ denorm.NormTransform(root_denorm, unnormed_pos, &normed_pos);
+ denorm.NormTransform(root_denorm, unnormed_end, &normed_end);
+ normed_end -= normed_pos;
+ return normed_end.to_direction();
+}
+
+// Helper returns the mean direction vector from the given stats. Use the
+// mean direction from dirs if there is information available, otherwise, use
+// the fit_vector from point_diffs.
+static FCOORD MeanDirectionVector(const LLSQ& point_diffs, const LLSQ& dirs,
+ const FCOORD& start_pt,
+ const FCOORD& end_pt) {
+ FCOORD fit_vector;
+ if (dirs.count() > 0) {
+ // There were directions, so use them. To avoid wrap-around problems, we
+ // have 2 accumulators in dirs: x for normal directions and y for
+ // directions offset by 128. We will use the one with the least variance.
+ FCOORD mean_pt = dirs.mean_point();
+ double mean_dir = 0.0;
+ if (dirs.x_variance() <= dirs.y_variance()) {
+ mean_dir = mean_pt.x();
+ } else {
+ mean_dir = mean_pt.y() + 128;
+ }
+ fit_vector.from_direction(Modulo(IntCastRounded(mean_dir), 256));
+ } else {
+ // There were no directions, so we rely on the vector_fit to the points.
+ // Since the vector_fit is 180 degrees ambiguous, we align with the
+ // supplied feature_dir by making the scalar product non-negative.
+ FCOORD feature_dir(end_pt - start_pt);
+ fit_vector = point_diffs.vector_fit();
+ if (fit_vector.x() == 0.0f && fit_vector.y() == 0.0f) {
+ // There was only a single point. Use feature_dir directly.
+ fit_vector = feature_dir;
+ } else {
+ // Sometimes the least mean squares fit is wrong, due to the small sample
+ // of points and scaling. Use a 90 degree rotated vector if that matches
+ // feature_dir better.
+ FCOORD fit_vector2 = !fit_vector;
+ // The fit_vector is 180 degrees ambiguous, so resolve the ambiguity by
+ // insisting that the scalar product with the feature_dir should be +ve.
+ if (fit_vector % feature_dir < 0.0)
+ fit_vector = -fit_vector;
+ if (fit_vector2 % feature_dir < 0.0)
+ fit_vector2 = -fit_vector2;
+ // Even though fit_vector2 has a higher mean squared error, it might be
+ // a better fit, so use it if the dot product with feature_dir is bigger.
+ if (fit_vector2 % feature_dir > fit_vector % feature_dir)
+ fit_vector = fit_vector2;
+ }
+ }
+ return fit_vector;
+}
+
+// Helper computes one or more features corresponding to the given points.
+// Emitted features are on the line defined by:
+// start_pt + lambda * (end_pt - start_pt) for scalar lambda.
+// Features are spaced at feature_length intervals.
+static int ComputeFeatures(const FCOORD& start_pt, const FCOORD& end_pt,
+ double feature_length,
+ std::vector<INT_FEATURE_STRUCT>* features) {
+ FCOORD feature_vector(end_pt - start_pt);
+ if (feature_vector.x() == 0.0f && feature_vector.y() == 0.0f) return 0;
+ // Compute theta for the feature based on its direction.
+ uint8_t theta = feature_vector.to_direction();
+ // Compute the number of features and lambda_step.
+ double target_length = feature_vector.length();
+ int num_features = IntCastRounded(target_length / feature_length);
+ if (num_features == 0) return 0;
+ // Divide the length evenly into num_features pieces.
+ double lambda_step = 1.0 / num_features;
+ double lambda = lambda_step / 2.0;
+ for (int f = 0; f < num_features; ++f, lambda += lambda_step) {
+ FCOORD feature_pt(start_pt);
+ feature_pt += feature_vector * lambda;
+ INT_FEATURE_STRUCT feature(feature_pt, theta);
+ features->push_back(feature);
+ }
+ return num_features;
+}
+
// Gathers outline points and their directions from start_index into dirs by
// stepping along the outline and normalizing the coordinates until the
// required feature_length has been collected or end_index is reached.
// On input pos must point to the position corresponding to start_index and on
// return pos is updated to the current raw position, and pos_normed is set to
// the normed version of pos. Returns the index at which collection stopped,
// which becomes the start_index of the next call.
// Since directions wrap-around, they need special treatment to get the mean.
// Provided the cluster of directions doesn't straddle the wrap-around point,
// the simple mean works. If they do, then, unless the directions are wildly
// varying, the cluster rotated by 180 degrees will not straddle the wrap-
// around point, so mean(dir + 180 degrees) - 180 degrees will work. Since
// LLSQ conveniently stores the mean of 2 variables, we use it to store
// dir and dir+128 (128 is 180 degrees) and then use the resulting mean
// with the least variance.
static int GatherPoints(const C_OUTLINE* outline, double feature_length,
                        const DENORM& denorm, const DENORM* root_denorm,
                        int start_index, int end_index,
                        ICOORD* pos, FCOORD* pos_normed,
                        LLSQ* points, LLSQ* dirs) {
  int step_length = outline->pathlength();
  ICOORD step = outline->step(start_index % step_length);
  // Prev_normed is the start point of this collection and will be set on the
  // first iteration, and on later iterations used to determine the length
  // that has been collected.
  FCOORD prev_normed;
  points->clear();
  dirs->clear();
  int num_points = 0;
  int index;
  // The loop increment applies the step for the current index before index
  // advances, so *pos always equals the outline position at `index`.
  for (index = start_index; index <= end_index; ++index, *pos += step) {
    step = outline->step(index % step_length);
    int edge_weight = outline->edge_strength_at_index(index % step_length);
    if (edge_weight == 0) {
      // This point has conflicting gradient and step direction, so ignore it.
      continue;
    }
    // Get the sub-pixel precise location and normalize.
    FCOORD f_pos = outline->sub_pixel_pos_at_index(*pos, index % step_length);
    denorm.NormTransform(root_denorm, f_pos, pos_normed);
    if (num_points == 0) {
      // The start of this segment.
      prev_normed = *pos_normed;
    } else {
      FCOORD offset = *pos_normed - prev_normed;
      float length = offset.length();
      if (length > feature_length) {
        // We have gone far enough from the start. We will use this point in
        // the next set so return what we have so far.
        return index;
      }
    }
    // Accumulate the normed position, weighted by edge strength.
    points->add(pos_normed->x(), pos_normed->y(), edge_weight);
    int direction = outline->direction_at_index(index % step_length);
    if (direction >= 0) {
      direction = NormalizeDirection(direction, f_pos, denorm, root_denorm);
      // Use both the direction and direction +128 so we are not trying to
      // take the mean of something straddling the wrap-around point.
      dirs->add(direction, Modulo(direction + 128, 256));
    }
    ++num_points;
  }
  return index;
}
+
// Extracts Tesseract features and appends them to the features vector.
// Startpt to lastpt, inclusive, MUST have the same src_outline member,
// which may be nullptr. The vector from lastpt to its next is included in
// the feature extraction. Hidden edges should be excluded by the caller.
// If force_poly is true, the features will be extracted from the polygonal
// approximation even if more accurate data is available.
static void ExtractFeaturesFromRun(
    const EDGEPT* startpt, const EDGEPT* lastpt,
    const DENORM& denorm, double feature_length, bool force_poly,
    std::vector<INT_FEATURE_STRUCT>* features) {
  const EDGEPT* endpt = lastpt->next;
  const C_OUTLINE* outline = startpt->src_outline;
  if (outline != nullptr && !force_poly) {
    // Detailed information is available. We have to normalize only from
    // the root_denorm to denorm.
    const DENORM* root_denorm = denorm.RootDenorm();
    int total_features = 0;
    // Get the features from the outline.
    int step_length = outline->pathlength();
    int start_index = startpt->start_step;
    // pos is the integer coordinates of the binary image steps.
    ICOORD pos = outline->position_at_index(start_index);
    // We use an end_index that allows us to use a positive increment, but
    // that may be beyond the bounds of the outline steps due to wrap-around,
    // so we use % step_length everywhere, except for start_index.
    int end_index = lastpt->start_step + lastpt->step_count;
    if (end_index <= start_index)
      end_index += step_length;
    LLSQ prev_points;
    LLSQ prev_dirs;
    FCOORD prev_normed_pos = outline->sub_pixel_pos_at_index(pos, start_index);
    denorm.NormTransform(root_denorm, prev_normed_pos, &prev_normed_pos);
    LLSQ points;
    LLSQ dirs;
    FCOORD normed_pos(0.0f, 0.0f);
    // Prime the pipeline with the first segment of points.
    int index = GatherPoints(outline, feature_length, denorm, root_denorm,
                             start_index, end_index, &pos, &normed_pos,
                             &points, &dirs);
    while (index <= end_index) {
      // At each iteration we nominally have 3 accumulated sets of points and
      // dirs: prev_points/dirs, points/dirs, next_points/dirs and sum them
      // into sum_points/dirs, but we don't necessarily get any features out,
      // so if that is the case, we keep accumulating instead of rotating the
      // accumulators.
      LLSQ next_points;
      LLSQ next_dirs;
      FCOORD next_normed_pos(0.0f, 0.0f);
      index = GatherPoints(outline, feature_length, denorm, root_denorm,
                           index, end_index, &pos, &next_normed_pos,
                           &next_points, &next_dirs);
      LLSQ sum_points(prev_points);
      // TODO(rays) find out why it is better to use just dirs and next_dirs
      // in sum_dirs, instead of using prev_dirs as well.
      LLSQ sum_dirs(dirs);
      sum_points.add(points);
      sum_points.add(next_points);
      sum_dirs.add(next_dirs);
      bool made_features = false;
      // If we have some points, we can try making some features.
      if (sum_points.count() > 0) {
        // We have gone far enough from the start. Make a feature and restart.
        FCOORD fit_pt = sum_points.mean_point();
        FCOORD fit_vector = MeanDirectionVector(sum_points, sum_dirs,
                                                prev_normed_pos, normed_pos);
        // The segment to which we fit features is the line passing through
        // fit_pt in direction of fit_vector that starts nearest to
        // prev_normed_pos and ends nearest to normed_pos.
        FCOORD start_pos = prev_normed_pos.nearest_pt_on_line(fit_pt,
                                                              fit_vector);
        FCOORD end_pos = normed_pos.nearest_pt_on_line(fit_pt, fit_vector);
        // Possible correction to match the adjacent polygon segment: snap
        // the very first start and the very last end to the polygon vertex.
        if (total_features == 0 && startpt != endpt) {
          FCOORD poly_pos(startpt->pos.x, startpt->pos.y);
          denorm.LocalNormTransform(poly_pos, &start_pos);
        }
        if (index > end_index && startpt != endpt) {
          FCOORD poly_pos(endpt->pos.x, endpt->pos.y);
          denorm.LocalNormTransform(poly_pos, &end_pos);
        }
        int num_features = ComputeFeatures(start_pos, end_pos, feature_length,
                                           features);
        if (num_features > 0) {
          // We made some features so shuffle the accumulators.
          prev_points = points;
          prev_dirs = dirs;
          prev_normed_pos = normed_pos;
          points = next_points;
          dirs = next_dirs;
          made_features = true;
          total_features += num_features;
        }
        // The end of the next set becomes the end next time around.
        normed_pos = next_normed_pos;
      }
      if (!made_features) {
        // We didn't make any features, so keep the prev accumulators and
        // add the next ones into the current.
        points.add(next_points);
        dirs.add(next_dirs);
      }
    }
  } else {
    // There is no outline, so we are forced to use the polygonal
    // approximation: one feature run per polygon edge.
    const EDGEPT* pt = startpt;
    do {
      FCOORD start_pos(pt->pos.x, pt->pos.y);
      FCOORD end_pos(pt->next->pos.x, pt->next->pos.y);
      denorm.LocalNormTransform(start_pos, &start_pos);
      denorm.LocalNormTransform(end_pos, &end_pos);
      ComputeFeatures(start_pos, end_pos, feature_length, features);
    } while ((pt = pt->next) != endpt);
  }
}
+
// Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
// (x,y) position and angle as measured counterclockwise from the vector
// <-1, 0>, from blob using two normalizations defined by bl_denorm and
// cn_denorm. See SetupBLCNDenorms for definitions.
// If outline_cn_counts is not nullptr, on return it contains the cumulative
// number of cn features generated for each outline in the blob (in order).
// Thus after the first outline, there were (*outline_cn_counts)[0] features,
// after the second outline, there were (*outline_cn_counts)[1] features etc.
// NOTE: results must be non-null; it is written unconditionally at the end.
void Classify::ExtractFeatures(const TBLOB& blob,
                               bool nonlinear_norm,
                               std::vector<INT_FEATURE_STRUCT>* bl_features,
                               std::vector<INT_FEATURE_STRUCT>* cn_features,
                               INT_FX_RESULT_STRUCT* results,
                               GenericVector<int>* outline_cn_counts) {
  DENORM bl_denorm, cn_denorm;
  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm,
                                        &bl_denorm, &cn_denorm, results);
  if (outline_cn_counts != nullptr)
    outline_cn_counts->truncate(0);
  // Iterate the outlines.
  for (TESSLINE* ol = blob.outlines; ol != nullptr; ol = ol->next) {
    // Iterate the polygon.
    EDGEPT* loop_pt = ol->FindBestStartPt();
    EDGEPT* pt = loop_pt;
    if (pt == nullptr) continue;
    do {
      if (pt->IsHidden()) continue;
      // Find a run of equal src_outline so each run can be extracted with a
      // single consistent source of accurate outline data.
      EDGEPT* last_pt = pt;
      do {
        last_pt = last_pt->next;
      } while (last_pt != loop_pt && !last_pt->IsHidden() &&
               last_pt->src_outline == pt->src_outline);
      last_pt = last_pt->prev;
      // Until the adaptive classifier can be weaned off polygon segments,
      // we have to force extraction from the polygon for the bl_features.
      ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength,
                             true, bl_features);
      ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength,
                             false, cn_features);
      pt = last_pt;
    } while ((pt = pt->next) != loop_pt);
    if (outline_cn_counts != nullptr)
      outline_cn_counts->push_back(cn_features->size());
  }
  // Summarize the extraction for the caller.
  results->NumBL = bl_features->size();
  results->NumCN = cn_features->size();
  results->YBottom = blob.bounding_box().bottom();
  results->YTop = blob.bounding_box().top();
  results->Width = blob.bounding_box().width();
}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/intfx.h b/tesseract/src/classify/intfx.h
new file mode 100644
index 00000000..f4f8fd1a
--- /dev/null
+++ b/tesseract/src/classify/intfx.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+ ** Filename: intfx.h
+ ** Purpose: Interface to high level integer feature extractor.
+ ** Author: Robert Moss
+ ** History: Tue May 21 15:51:57 MDT 1991, RWM, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef INTFX_H
+#define INTFX_H
+
+#include "blobs.h"
+#include "intproto.h"
+#include "normalis.h"
+
+#include <cmath>
+
+namespace tesseract {
+
+class DENORM;
+
+class TrainingSample;
+
// Summary statistics produced by integer feature extraction for one blob.
// Filled in by Classify::SetupBLCNDenorms and Classify::ExtractFeatures.
struct INT_FX_RESULT_STRUCT {
  int32_t Length;        // total length of all outlines
  int16_t Xmean, Ymean;  // center of mass of all outlines
  int16_t Rx, Ry;        // radius of gyration (note: Rx is the y moment and
                         // Ry the x moment, reversed by convention)
  int16_t NumBL, NumCN;  // number of features extracted (baseline / char norm)
  int16_t Width;         // Width of blob in BLN coords.
  uint8_t YBottom;       // Bottom of blob in BLN coords.
  uint8_t YTop;          // Top of blob in BLN coords.
};
+
// The standard feature length: 64.0 / 5 = 12.8 normalized units between
// successive extracted features along an outline segment.
const double kStandardFeatureLength = 64.0 / 5;
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+TESS_API
+void InitIntegerFX();
+
+// Returns a vector representing the direction of a feature with the given
+// theta direction in an INT_FEATURE_STRUCT.
+TESS_API
+FCOORD FeatureDirection(uint8_t theta);
+
+// Generates a TrainingSample from a TBLOB. Extracts features and sets
+// the bounding box, so classifiers that operate on the image can work.
+// TODO(rays) BlobToTrainingSample must remain a global function until
+// the FlexFx and FeatureDescription code can be removed and LearnBlob
+// made a member of Classify.
+TrainingSample* BlobToTrainingSample(
+ const TBLOB& blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT* fx_info,
+ std::vector<INT_FEATURE_STRUCT>* bl_features);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/intmatcher.cpp b/tesseract/src/classify/intmatcher.cpp
new file mode 100644
index 00000000..b78c700f
--- /dev/null
+++ b/tesseract/src/classify/intmatcher.cpp
@@ -0,0 +1,1226 @@
+/******************************************************************************
+ ** Filename: intmatcher.cpp
+ ** Purpose: Generic high level classification routines.
+ ** Author: Robert Moss
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "intmatcher.h"
+
+#include "fontinfo.h"
+#include "intproto.h"
+#include "scrollview.h"
+#include "float2int.h"
+#include "classify.h"
+#include "shapetable.h"
+
+#include "helpers.h"
+
+#include <cassert>
+#include <cmath>
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+----------------------------------------------------------------------------*/
// Parameters of the sigmoid used to convert similarity to evidence in the
// similarity_evidence_table_ that is used to convert distance metric to an
// 8 bit evidence value in the secondary matcher. (See IntMatcher::Init).
// NOTE(review): a multiplier of 0.0f presumably disables the exponential
// term of the sigmoid — confirm against IntegerMatcher::Init.
const float IntegerMatcher::kSEExponentialMultiplier = 0.0f;
const float IntegerMatcher::kSimilarityCenter = 0.0075f;
+
// offset_table[i] is the bit index of the lowest set bit of i (e.g.
// offset_table[4] == 2, offset_table[6] == 1), with the sentinel 255 for
// i == 0, which has no set bits. Used together with next_table to iterate
// over the set bits of a byte.
static const uint8_t offset_table[] = {
    255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,
    0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,
    0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,
    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,
    0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,
    0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6,
    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,
    0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
+
// next_table[i] is i with its lowest set bit cleared (e.g.
// next_table[6] == 0x4, next_table[7] == 0x6). Combined with offset_table,
// this allows iterating the set bits of a byte: read the bit position from
// offset_table, then advance with next_table until the value reaches 0.
static const uint8_t next_table[] = {
    0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
    0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a,
    0x18, 0x1c, 0x1c, 0x1e, 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26,
    0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, 0x20, 0x30, 0x30, 0x32,
    0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
    0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a,
    0x48, 0x4c, 0x4c, 0x4e, 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56,
    0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, 0x40, 0x60, 0x60, 0x62,
    0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
    0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a,
    0x78, 0x7c, 0x7c, 0x7e, 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86,
    0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, 0x80, 0x90, 0x90, 0x92,
    0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
    0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa,
    0xa8, 0xac, 0xac, 0xae, 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6,
    0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, 0x80, 0xc0, 0xc0, 0xc2,
    0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
    0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda,
    0xd8, 0xdc, 0xdc, 0xde, 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6,
    0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, 0xe0, 0xf0, 0xf0, 0xf2,
    0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
};
+
+// See http://b/19318793 (#6) for a complete discussion.
+
/**
 * Sort Key array in ascending order using heap sort
 * algorithm. Also sort Index array that is tied to
 * the key array, keeping the pairing intact. Note the 1-based
 * indexing convention: element 0 of both arrays is unused.
 * @param n Number of elements to sort
 * @param ra Key array [1..n]
 * @param rb Index array [1..n]
 */
static void
HeapSort (int n, int ra[], int rb[]) {
  // A 0- or 1-element range is already sorted. Without this guard the
  // extraction branch below decrements ir to 0 (so `--ir == 1` never
  // fires) and then reads ra[0]/rb[0], looping on out-of-bounds data.
  if (n < 2)
    return;
  int i, rra, rrb;
  int l, j, ir;

  l = (n >> 1) + 1;  // Next element to sift in during heap construction.
  ir = n;            // Current end of the heap region.
  for (;;) {
    if (l > 1) {
      // Heap-building phase: pick up the next unprocessed element.
      rra = ra[--l];
      rrb = rb[l];
    }
    else {
      // Extraction phase: move the max (root) to the end of the array
      // and shrink the heap by one.
      rra = ra[ir];
      rrb = rb[ir];
      ra[ir] = ra[1];
      rb[ir] = rb[1];
      if (--ir == 1) {
        ra[1] = rra;
        rb[1] = rrb;
        return;
      }
    }
    // Sift the saved pair (rra, rrb) down to its proper place.
    i = l;
    j = l << 1;
    while (j <= ir) {
      if (j < ir && ra[j] < ra[j + 1])
        ++j;  // Choose the larger of the two children.
      if (rra < ra[j]) {
        // Promote the child and keep descending.
        ra[i] = ra[j];
        rb[i] = rb[j];
        j += (i = j);
      }
      else
        j = ir + 1;  // The element fits here; terminate the sift.
    }
    ra[i] = rra;
    rb[i] = rrb;
  }
}
+
+// Encapsulation of the intermediate data and computations made by the class
+// pruner. The class pruner implements a simple linear classifier on binary
+// features by heavily quantizing the feature space, and applying
+// NUM_BITS_PER_CLASS (2)-bit weights to the features. Lack of resolution in
+// weights is compensated by a non-constant bias that is dependent on the
+// number of features present.
+class ClassPruner {
+ public:
+ ClassPruner(int max_classes) {
+ // The unrolled loop in ComputeScores means that the array sizes need to
+ // be rounded up so that the array is big enough to accommodate the extra
+ // entries accessed by the unrolling. Each pruner word is of sized
+ // BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
+ // BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
+ // See ComputeScores.
+ max_classes_ = max_classes;
+ rounded_classes_ = RoundUp(
+ max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
+ class_count_ = new int[rounded_classes_];
+ norm_count_ = new int[rounded_classes_];
+ sort_key_ = new int[rounded_classes_ + 1];
+ sort_index_ = new int[rounded_classes_ + 1];
+ for (int i = 0; i < rounded_classes_; i++) {
+ class_count_[i] = 0;
+ }
+ pruning_threshold_ = 0;
+ num_features_ = 0;
+ num_classes_ = 0;
+ }
+
  // Releases the four scratch arrays allocated by the constructor.
  ~ClassPruner() {
    delete []class_count_;
    delete []norm_count_;
    delete []sort_key_;
    delete []sort_index_;
  }
+
  /// Computes the scores for every class in the character set, by summing the
  /// weights for each feature and stores the sums internally in class_count_.
  /// @param int_templates pruner tables for all classes.
  /// @param num_features  number of entries in features.
  /// @param features      quantizable (X, Y, Theta) feature array.
  void ComputeScores(const INT_TEMPLATES_STRUCT* int_templates,
                     int num_features, const INT_FEATURE_STRUCT* features) {
    num_features_ = num_features;
    int num_pruners = int_templates->NumClassPruners;
    for (int f = 0; f < num_features; ++f) {
      const INT_FEATURE_STRUCT* feature = &features[f];
      // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
      // The >> 8 maps each (presumably 8-bit) coordinate into its bucket.
      int x = feature->X * NUM_CP_BUCKETS >> 8;
      int y = feature->Y * NUM_CP_BUCKETS >> 8;
      int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
      int class_id = 0;
      // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
      // we need a collection of them, indexed by pruner_set.
      for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
        // Look up quantized feature in a 3-D array, an array of weights for
        // each class.
        const uint32_t* pruner_word_ptr =
            int_templates->ClassPruners[pruner_set]->p[x][y][theta];
        for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
          uint32_t pruner_word = *pruner_word_ptr++;
          // This inner loop is unrolled to speed up the ClassPruner.
          // Currently gcc would not unroll it unless it is set to O3
          // level of optimization or -funroll-loops is specified.
          // The rolled-up equivalent of the 16 statements below:
          /*
          uint32_t class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
          for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
            class_count_[class_id++] += pruner_word & class_mask;
            pruner_word >>= NUM_BITS_PER_CLASS;
          }
          */
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        }
      }
    }
  }
+
+ /// Adjusts the scores according to the number of expected features. Used
+ /// in lieu of a constant bias, this penalizes classes that expect more
+ /// features than there are present. Thus an actual c will score higher for c
+ /// than e, even though almost all the features match e as well as c, because
+ /// e expects more features to be present.
+ void AdjustForExpectedNumFeatures(const uint16_t* expected_num_features,
+ int cutoff_strength) {
+ for (int class_id = 0; class_id < max_classes_; ++class_id) {
+ if (num_features_ < expected_num_features[class_id]) {
+ // Penalty grows with the feature deficit but is softened by
+ // cutoff_strength: score *= 1 - deficit / (n*strength + deficit).
+ int deficit = expected_num_features[class_id] - num_features_;
+ class_count_[class_id] -= class_count_[class_id] * deficit /
+ (num_features_ * cutoff_strength + deficit);
+ }
+ }
+ }
+
+ /// Zeros the scores for classes disabled in the unicharset.
+ /// Implements the black-list to recognize a subset of the character set.
+ void DisableDisabledClasses(const UNICHARSET& unicharset) {
+ // Only called when no shape_table_ is in use (see PruneClasses),
+ // since with a shape table class ids are not unichar ids.
+ for (int class_id = 0; class_id < max_classes_; ++class_id) {
+ if (!unicharset.get_enabled(class_id))
+ class_count_[class_id] = 0; // This char is disabled!
+ }
+ }
+
+ /** Zeros the scores of fragments. */
+ void DisableFragments(const UNICHARSET& unicharset) {
+ // The caller (PruneClasses) gates this on disable_character_fragments.
+ for (int class_id = 0; class_id < max_classes_; ++class_id) {
+ // Do not include character fragments in the class pruner
+ // results if disable_character_fragments is true.
+ if (unicharset.get_fragment(class_id)) {
+ class_count_[class_id] = 0;
+ }
+ }
+ }
+
+ /// Normalizes the counts for xheight, putting the normalized result in
+ /// norm_count_. Applies a simple subtractive penalty for incorrect vertical
+ /// position provided by the normalization_factors array, indexed by
+ /// character class, and scaled by the norm_multiplier.
+ void NormalizeForXheight(int norm_multiplier,
+ const uint8_t* normalization_factors) {
+ for (int class_id = 0; class_id < max_classes_; class_id++) {
+ // Fixed-point: subtractive penalty = multiplier * factor / 256.
+ norm_count_[class_id] = class_count_[class_id] -
+ ((norm_multiplier * normalization_factors[class_id]) >> 8);
+ }
+ }
+
+ /** The nop normalization copies the class_count_ array to norm_count_. */
+ void NoNormalization() {
+ // Used when no x-height normalization factors are available.
+ for (int class_id = 0; class_id < max_classes_; class_id++) {
+ norm_count_[class_id] = class_count_[class_id];
+ }
+ }
+
+ /// Prunes the classes using <the maximum count> * pruning_factor/256 as a
+ /// threshold for keeping classes. If max_of_non_fragments, then ignore
+ /// fragments in computing the maximum count.
+ void PruneAndSort(int pruning_factor, int keep_this,
+ bool max_of_non_fragments, const UNICHARSET& unicharset) {
+ int max_count = 0;
+ for (int c = 0; c < max_classes_; ++c) {
+ if (norm_count_[c] > max_count &&
+ // This additional check is added in order to ensure that
+ // the classifier will return at least one non-fragmented
+ // character match.
+ // TODO(daria): verify that this helps accuracy and does not
+ // hurt performance.
+ (!max_of_non_fragments || !unicharset.get_fragment(c))) {
+ max_count = norm_count_[c];
+ }
+ }
+ // Prune Classes.
+ pruning_threshold_ = (max_count * pruning_factor) >> 8;
+ // Select Classes.
+ if (pruning_threshold_ < 1)
+ pruning_threshold_ = 1;
+ num_classes_ = 0;
+ // Entries are stored from index 1 upwards (pre-increment), which is
+ // why sort_key_/sort_index_ are sized rounded_classes_ + 1; HeapSort
+ // evidently expects 1-based arrays. keep_this is always retained,
+ // regardless of the threshold.
+ for (int class_id = 0; class_id < max_classes_; class_id++) {
+ if (norm_count_[class_id] >= pruning_threshold_ ||
+ class_id == keep_this) {
+ ++num_classes_;
+ sort_index_[num_classes_] = class_id;
+ sort_key_[num_classes_] = norm_count_[class_id];
+ }
+ }
+
+ // Sort Classes using Heapsort Algorithm.
+ if (num_classes_ > 1)
+ HeapSort(num_classes_, sort_key_, sort_index_);
+ }
+
+ /** Prints debug info on the class pruner matches for the pruned classes only.
+ */
+ void DebugMatch(const Classify& classify,
+ const INT_TEMPLATES_STRUCT* int_templates,
+ const INT_FEATURE_STRUCT* features) const {
+ // Mirrors the table lookup performed in ComputeScores, but prints the
+ // per-feature weight for each class that survived pruning.
+ int num_pruners = int_templates->NumClassPruners;
+ int max_num_classes = int_templates->NumClasses;
+ for (int f = 0; f < num_features_; ++f) {
+ const INT_FEATURE_STRUCT* feature = &features[f];
+ tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta);
+ // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
+ int x = feature->X * NUM_CP_BUCKETS >> 8;
+ int y = feature->Y * NUM_CP_BUCKETS >> 8;
+ int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
+ int class_id = 0;
+ for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
+ // Look up quantized feature in a 3-D array, an array of weights for
+ // each class.
+ const uint32_t* pruner_word_ptr =
+ int_templates->ClassPruners[pruner_set]->p[x][y][theta];
+ for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
+ uint32_t pruner_word = *pruner_word_ptr++;
+ // 16 classes per 32-bit word, NUM_BITS_PER_CLASS bits each.
+ for (int word_class = 0; word_class < 16 &&
+ class_id < max_num_classes; ++word_class, ++class_id) {
+ if (norm_count_[class_id] >= pruning_threshold_) {
+ tprintf(" %s=%d,",
+ classify.ClassIDToDebugStr(int_templates,
+ class_id, 0).c_str(),
+ pruner_word & CLASS_PRUNER_CLASS_MASK);
+ }
+ pruner_word >>= NUM_BITS_PER_CLASS;
+ }
+ }
+ tprintf("\n");
+ }
+ }
+ }
+
+ /** Prints a summary of the pruner result. */
+ void SummarizeResult(const Classify& classify,
+ const INT_TEMPLATES_STRUCT* int_templates,
+ const uint16_t* expected_num_features,
+ int norm_multiplier,
+ const uint8_t* normalization_factors) const {
+ tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
+ // Entries are read from the top end of the sorted arrays first, so
+ // the best-scoring class is printed first. "Rat" is the percentage
+ // error implied by the normalized sort key.
+ for (int i = 0; i < num_classes_; ++i) {
+ int class_id = sort_index_[num_classes_ - i];
+ STRING class_string = classify.ClassIDToDebugStr(int_templates,
+ class_id, 0);
+ tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
+ class_string.c_str(),
+ class_count_[class_id],
+ expected_num_features[class_id],
+ (norm_multiplier * normalization_factors[class_id]) >> 8,
+ sort_key_[num_classes_ - i],
+ 100.0 - 100.0 * sort_key_[num_classes_ - i] /
+ (CLASS_PRUNER_CLASS_MASK * num_features_));
+ }
+ }
+
+ /// Copies the pruned, sorted classes into the output results and returns
+ /// the number of classes.
+ int SetupResults(std::vector<CP_RESULT_STRUCT>* results) const {
+ results->resize(num_classes_);
+ // Output is ordered best-first; Rating = 1 - normalized score, so a
+ // lower Rating means a better match.
+ for (int c = 0; c < num_classes_; ++c) {
+ (*results)[c].Class = sort_index_[num_classes_ - c];
+ (*results)[c].Rating = 1.0f - sort_key_[num_classes_ - c] /
+ (static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_);
+ }
+ return num_classes_;
+ }
+
+ private:
+ /** Array[rounded_classes_] of initial counts for each class. */
+ int *class_count_;
+ /// Array[rounded_classes_] of modified counts for each class after
+ /// normalizing for expected number of features, disabled classes, fragments,
+ /// and xheights.
+ int *norm_count_;
+ /** Array[rounded_classes_ +1] of pruned counts that gets sorted.
+ Used 1-based by PruneAndSort/HeapSort, hence the +1. */
+ int *sort_key_;
+ /** Array[rounded_classes_ +1] of classes corresponding to sort_key_
+ (also used 1-based). */
+ int *sort_index_;
+ /** Number of classes in this class pruner. */
+ int max_classes_;
+ /** Rounded up number of classes used for array sizes. */
+ int rounded_classes_;
+ /** Threshold count applied to prune classes. */
+ int pruning_threshold_;
+ /** The number of features used to compute the scores. */
+ int num_features_;
+ /** Final number of pruned classes. */
+ int num_classes_;
+};
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+/**
+ * Runs the class pruner from int_templates on the given features, returning
+ * the number of classes output in results.
+ * @param int_templates Class pruner tables
+ * @param num_features Number of features in blob
+ * @param features Array of features
+ * @param normalization_factors Array of fudge factors from blob
+ * normalization process (by CLASS_INDEX)
+ * @param expected_num_features Array of expected number of features
+ * for each class (by CLASS_INDEX)
+ * @param results Sorted Array of pruned classes. Must be an
+ * array of size at least
+ * int_templates->NumClasses.
+ * @param keep_this
+ */
+int Classify::PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
+ int num_features, int keep_this,
+ const INT_FEATURE_STRUCT* features,
+ const uint8_t* normalization_factors,
+ const uint16_t* expected_num_features,
+ std::vector<CP_RESULT_STRUCT>* results) {
+ ClassPruner pruner(int_templates->NumClasses);
+ // Compute initial match scores for all classes.
+ pruner.ComputeScores(int_templates, num_features, features);
+ // Adjust match scores for number of expected features.
+ pruner.AdjustForExpectedNumFeatures(expected_num_features,
+ classify_cp_cutoff_strength);
+ // Apply disabled classes in unicharset - only works without a shape_table.
+ if (shape_table_ == nullptr)
+ pruner.DisableDisabledClasses(unicharset);
+ // If fragments are disabled, remove them, also only without a shape table.
+ if (disable_character_fragments && shape_table_ == nullptr)
+ pruner.DisableFragments(unicharset);
+
+ // If we have good x-heights, apply the given normalization factors.
+ if (normalization_factors != nullptr) {
+ pruner.NormalizeForXheight(classify_class_pruner_multiplier,
+ normalization_factors);
+ } else {
+ pruner.NoNormalization();
+ }
+ // Do the actual pruning and sort the short-list. keep_this survives
+ // pruning unconditionally (see ClassPruner::PruneAndSort).
+ pruner.PruneAndSort(classify_class_pruner_threshold, keep_this,
+ shape_table_ == nullptr, unicharset);
+
+ if (classify_debug_level > 2) {
+ pruner.DebugMatch(*this, int_templates, features);
+ }
+ if (classify_debug_level > 1) {
+ pruner.SummarizeResult(*this, int_templates, expected_num_features,
+ classify_class_pruner_multiplier,
+ normalization_factors);
+ }
+ // Convert to the expected output format.
+ return pruner.SetupResults(results);
+}
+
+/**
+ * IntegerMatcher returns the best configuration and rating
+ * for a single class. The class matched against is determined
+ * by the uniqueness of the ClassTemplate parameter. The
+ * best rating and its associated configuration are returned.
+ *
+ * Globals:
+ * - local_matcher_multiplier_ Normalization factor multiplier
+ * param ClassTemplate Prototypes & tables for a class
+ * param NumFeatures Number of features in blob
+ * param Features Array of features
+ * param NormalizationFactor Fudge factor from blob normalization process
+ * param Result Class rating & configuration: (0.0 -> 1.0), 0=bad, 1=good
+ * param Debug Debugger flag: 1=debugger on
+ */
+void IntegerMatcher::Match(INT_CLASS ClassTemplate,
+ BIT_VECTOR ProtoMask,
+ BIT_VECTOR ConfigMask,
+ int16_t NumFeatures,
+ const INT_FEATURE_STRUCT* Features,
+ UnicharRating* Result,
+ int AdaptFeatureThreshold,
+ int Debug,
+ bool SeparateDebugWindows) {
+ // Heap-allocated scratch evidence tables; deleted at the end of this
+ // function.
+ auto *tables = new ScratchEvidence();
+ int Feature;
+
+ if (MatchDebuggingOn (Debug))
+ tprintf ("Integer Matcher -------------------------------------------\n");
+
+ tables->Clear(ClassTemplate);
+ Result->feature_misses = 0;
+
+ for (Feature = 0; Feature < NumFeatures; Feature++) {
+ int csum = UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask,
+ Feature, &Features[Feature],
+ tables, Debug);
+ // Count features that were missed over all configs.
+ // (csum == 0 means the feature gathered no evidence in any config.)
+ if (csum == 0)
+ ++Result->feature_misses;
+ }
+
+#ifndef GRAPHICS_DISABLED
+ if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) {
+ DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables,
+ NumFeatures, Debug);
+ }
+
+ if (DisplayProtoMatchesOn(Debug)) {
+ DisplayProtoDebugInfo(ClassTemplate, ConfigMask,
+ *tables, SeparateDebugWindows);
+ }
+
+ if (DisplayFeatureMatchesOn(Debug)) {
+ DisplayFeatureDebugInfo(ClassTemplate, ProtoMask, ConfigMask, NumFeatures,
+ Features, AdaptFeatureThreshold, Debug,
+ SeparateDebugWindows);
+ }
+#endif
+
+ // Fold proto evidence into the per-config sums, then normalize by the
+ // expected evidence for each config before picking the best.
+ tables->UpdateSumOfProtoEvidences(ClassTemplate, ConfigMask);
+ tables->NormalizeSums(ClassTemplate, NumFeatures);
+
+ FindBestMatch(ClassTemplate, *tables, Result);
+
+#ifndef GRAPHICS_DISABLED
+ if (PrintMatchSummaryOn(Debug))
+ Result->Print();
+
+ if (MatchDebuggingOn(Debug))
+ tprintf("Match Complete --------------------------------------------\n");
+#endif
+
+ delete tables;
+}
+
+/**
+ * FindGoodProtos finds all protos whose normalized proto-evidence
+ * exceed AdaptProtoThreshold. The list is ordered by increasing
+ * proto id number.
+ *
+ * Globals:
+ * - local_matcher_multiplier_ Normalization factor multiplier
+ * param ClassTemplate Prototypes & tables for a class
+ * param ProtoMask AND Mask for proto word
+ * param ConfigMask AND Mask for config word
+ * param NumFeatures Number of features in blob
+ * param Features Array of features
+ * param ProtoArray Array of good protos
+ * param AdaptProtoThreshold Threshold for good protos
+ * param Debug Debugger flag: 1=debugger on
+ * @return Number of good protos in ProtoArray.
+ */
+int IntegerMatcher::FindGoodProtos(
+ INT_CLASS ClassTemplate,
+ BIT_VECTOR ProtoMask,
+ BIT_VECTOR ConfigMask,
+ int16_t NumFeatures,
+ INT_FEATURE_ARRAY Features,
+ PROTO_ID *ProtoArray,
+ int AdaptProtoThreshold,
+ int Debug) {
+ // Note: ProtoArray must have room for ClassTemplate->NumProtos entries.
+ auto *tables = new ScratchEvidence();
+ int NumGoodProtos = 0;
+
+ /* DEBUG opening heading */
+ if (MatchDebuggingOn (Debug))
+ tprintf
+ ("Find Good Protos -------------------------------------------\n");
+
+ tables->Clear(ClassTemplate);
+
+ for (int Feature = 0; Feature < NumFeatures; Feature++)
+ UpdateTablesForFeature(
+ ClassTemplate, ProtoMask, ConfigMask, Feature, &(Features[Feature]),
+ tables, Debug);
+
+#ifndef GRAPHICS_DISABLED
+ if (PrintProtoMatchesOn (Debug) || PrintMatchSummaryOn (Debug))
+ DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables,
+ NumFeatures, Debug);
+#endif
+
+ /* Average Proto Evidences & Find Good Protos */
+ for (int proto = 0; proto < ClassTemplate->NumProtos; proto++) {
+ /* Compute Average for Actual Proto */
+ // Temp becomes the mean evidence over the proto's length.
+ int Temp = 0;
+ for (uint8_t i = 0;
+ i < MAX_PROTO_INDEX && i < ClassTemplate->ProtoLengths[proto]; i++)
+ Temp += tables->proto_evidence_[proto][i];
+
+ Temp /= ClassTemplate->ProtoLengths[proto];
+
+ /* Find Good Protos */
+ if (Temp >= AdaptProtoThreshold) {
+ *ProtoArray = proto;
+ ProtoArray++;
+ NumGoodProtos++;
+ }
+ }
+
+ if (MatchDebuggingOn (Debug))
+ tprintf ("Match Complete --------------------------------------------\n");
+ delete tables;
+
+ return NumGoodProtos;
+}
+
+/**
+ * FindBadFeatures finds all features with maximum feature-evidence <
+ * AdaptFeatureThresh. The list is ordered by increasing feature number.
+ * @param ClassTemplate Prototypes & tables for a class
+ * @param ProtoMask AND Mask for proto word
+ * @param ConfigMask AND Mask for config word
+ * @param NumFeatures Number of features in blob
+ * @param Features Array of features
+ * @param FeatureArray Array of bad features
+ * @param AdaptFeatureThreshold Threshold for bad features
+ * @param Debug Debugger flag: 1=debugger on
+ * @return Number of bad features in FeatureArray.
+ */
+int IntegerMatcher::FindBadFeatures(
+ INT_CLASS ClassTemplate,
+ BIT_VECTOR ProtoMask,
+ BIT_VECTOR ConfigMask,
+ int16_t NumFeatures,
+ INT_FEATURE_ARRAY Features,
+ FEATURE_ID *FeatureArray,
+ int AdaptFeatureThreshold,
+ int Debug) {
+ // Note: FeatureArray must have room for NumFeatures entries.
+ auto *tables = new ScratchEvidence();
+ int NumBadFeatures = 0;
+
+ /* DEBUG opening heading */
+ if (MatchDebuggingOn(Debug))
+ tprintf("Find Bad Features -------------------------------------------\n");
+
+ tables->Clear(ClassTemplate);
+
+ for (int Feature = 0; Feature < NumFeatures; Feature++) {
+ UpdateTablesForFeature(
+ ClassTemplate, ProtoMask, ConfigMask, Feature, &Features[Feature],
+ tables, Debug);
+
+ /* Find Best Evidence for Current Feature */
+ // best = maximum evidence this feature achieved in any config.
+ int best = 0;
+ assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS);
+ for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++)
+ if (tables->feature_evidence_[i] > best)
+ best = tables->feature_evidence_[i];
+
+ /* Find Bad Features */
+ if (best < AdaptFeatureThreshold) {
+ *FeatureArray = Feature;
+ FeatureArray++;
+ NumBadFeatures++;
+ }
+ }
+
+#ifndef GRAPHICS_DISABLED
+ if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug))
+ DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables,
+ NumFeatures, Debug);
+#endif
+
+ if (MatchDebuggingOn(Debug))
+ tprintf("Match Complete --------------------------------------------\n");
+
+ delete tables;
+ return NumBadFeatures;
+}
+
+
+IntegerMatcher::IntegerMatcher(tesseract::IntParam *classify_debug_level)
+ : classify_debug_level_(classify_debug_level)
+{
+ /* Initialize table for evidence to similarity lookup */
+ // Maps an integer similarity (distance) to an evidence value in 0..255
+ // that decreases as similarity grows: 255 / ((s/center)^2 + 1),
+ // optionally attenuated by an exponential scale factor.
+ for (int i = 0; i < SE_TABLE_SIZE; i++) {
+ uint32_t IntSimilarity = i << (27 - SE_TABLE_BITS);
+ double Similarity = (static_cast<double>(IntSimilarity)) / 65536.0 / 65536.0;
+ double evidence = Similarity / kSimilarityCenter;
+ evidence = 255.0 / (evidence * evidence + 1.0);
+
+ if (kSEExponentialMultiplier > 0.0) {
+ double scale = 1.0 - exp(-kSEExponentialMultiplier) *
+ exp(kSEExponentialMultiplier * (static_cast<double>(i) / SE_TABLE_SIZE));
+ evidence *= ClipToRange(scale, 0.0, 1.0);
+ }
+
+ similarity_evidence_table_[i] = static_cast<uint8_t>(evidence + 0.5);
+ }
+
+ /* Initialize evidence computation variables */
+ // Fixed-point masks/shifts used by UpdateTablesForFeature to truncate
+ // products before indexing similarity_evidence_table_.
+ evidence_table_mask_ =
+ ((1 << kEvidenceTableBits) - 1) << (9 - kEvidenceTableBits);
+ mult_trunc_shift_bits_ = (14 - kIntEvidenceTruncBits);
+ table_trunc_shift_bits_ = (27 - SE_TABLE_BITS - (mult_trunc_shift_bits_ << 1));
+ evidence_mult_mask_ = ((1 << kIntEvidenceTruncBits) - 1);
+}
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+// Zeros the per-config sums and per-proto evidence, but only for the
+// entries actually used by this class template.
+void ScratchEvidence::Clear(const INT_CLASS class_template) {
+ memset(sum_feature_evidence_, 0,
+ class_template->NumConfigs * sizeof(sum_feature_evidence_[0]));
+ memset(proto_evidence_, 0,
+ class_template->NumProtos * sizeof(proto_evidence_[0]));
+}
+
+// Zeros the per-config evidence for a single feature; called once per
+// feature by UpdateTablesForFeature.
+void ScratchEvidence::ClearFeatureEvidence(const INT_CLASS class_template) {
+ memset(feature_evidence_, 0,
+ class_template->NumConfigs * sizeof(feature_evidence_[0]));
+}
+
+/**
+ * Print debugging information for Configurations
+ */
+static void IMDebugConfiguration(int FeatureNum, uint16_t ActualProtoNum,
+ uint8_t Evidence, uint32_t ConfigWord) {
+ tprintf ("F = %3d, P = %3d, E = %3d, Configs = ",
+ FeatureNum, static_cast<int>(ActualProtoNum), static_cast<int>(Evidence));
+ // Print config membership bits LSB-first.
+ while (ConfigWord) {
+ if (ConfigWord & 1)
+ tprintf ("1");
+ else
+ tprintf ("0");
+ ConfigWord >>= 1;
+ }
+ tprintf ("\n");
+}
+
+/**
+ * Print debugging information for Configurations
+ */
+// Prints the per-config evidence accumulated for feature FeatureNum.
+static void IMDebugConfigurationSum(int FeatureNum, uint8_t *FeatureEvidence,
+ int32_t ConfigCount) {
+ tprintf("F=%3d, C=", FeatureNum);
+ for (int ConfigNum = 0; ConfigNum < ConfigCount; ConfigNum++) {
+ tprintf("%4d", FeatureEvidence[ConfigNum]);
+ }
+ tprintf("\n");
+}
+
+/**
+ * For the given feature: prune protos, compute evidence,
+ * update Feature Evidence, Proto Evidence, and Sum of Feature
+ * Evidence tables.
+ * @param ClassTemplate Prototypes & tables for a class
+ * @param FeatureNum Current feature number (for DEBUG only)
+ * @param Feature Pointer to a feature struct
+ * @param tables Evidence tables
+ * @param Debug Debugger flag: 1=debugger on
+ * @return sum of feature evidence tables
+ */
+int IntegerMatcher::UpdateTablesForFeature(
+ INT_CLASS ClassTemplate,
+ BIT_VECTOR ProtoMask,
+ BIT_VECTOR ConfigMask,
+ int FeatureNum,
+ const INT_FEATURE_STRUCT* Feature,
+ ScratchEvidence *tables,
+ int Debug) {
+ uint32_t ConfigWord;
+ uint32_t ProtoWord;
+ uint32_t ProtoNum;
+ uint32_t ActualProtoNum;
+ uint8_t proto_byte;
+ int32_t proto_word_offset;
+ int32_t proto_offset;
+ PROTO_SET ProtoSet;
+ uint32_t *ProtoPrunerPtr;
+ INT_PROTO Proto;
+ int ProtoSetIndex;
+ uint8_t Evidence;
+ uint32_t XFeatureAddress;
+ uint32_t YFeatureAddress;
+ uint32_t ThetaFeatureAddress;
+
+ tables->ClearFeatureEvidence(ClassTemplate);
+
+ /* Precompute Feature Address offset for Proto Pruning */
+ XFeatureAddress = ((Feature->X >> 2) << 1);
+ YFeatureAddress = (NUM_PP_BUCKETS << 1) + ((Feature->Y >> 2) << 1);
+ ThetaFeatureAddress = (NUM_PP_BUCKETS << 2) + ((Feature->Theta >> 2) << 1);
+
+ for (ProtoSetIndex = 0, ActualProtoNum = 0;
+ ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {
+ ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];
+ ProtoPrunerPtr = reinterpret_cast<uint32_t *>((*ProtoSet).ProtoPruner);
+ for (ProtoNum = 0; ProtoNum < PROTOS_PER_PROTO_SET;
+ ProtoNum += (PROTOS_PER_PROTO_SET >> 1), ActualProtoNum +=
+ (PROTOS_PER_PROTO_SET >> 1), ProtoMask++, ProtoPrunerPtr++) {
+ /* Prune Protos of current Proto Set */
+ // AND the X, Y and Theta pruner words with the caller's proto mask;
+ // each surviving bit marks a candidate proto worth scoring.
+ ProtoWord = *(ProtoPrunerPtr + XFeatureAddress);
+ ProtoWord &= *(ProtoPrunerPtr + YFeatureAddress);
+ ProtoWord &= *(ProtoPrunerPtr + ThetaFeatureAddress);
+ ProtoWord &= *ProtoMask;
+
+ if (ProtoWord != 0) {
+ // Walk the set bits of ProtoWord one byte at a time.
+ proto_byte = ProtoWord & 0xff;
+ ProtoWord >>= 8;
+ proto_word_offset = 0;
+ while (ProtoWord != 0 || proto_byte != 0) {
+ while (proto_byte == 0) {
+ proto_byte = ProtoWord & 0xff;
+ ProtoWord >>= 8;
+ proto_word_offset += 8;
+ }
+ // offset_table gives the position of the lowest set bit in
+ // proto_byte; next_table clears that bit for the next pass.
+ proto_offset = offset_table[proto_byte] + proto_word_offset;
+ proto_byte = next_table[proto_byte];
+ Proto = &(ProtoSet->Protos[ProtoNum + proto_offset]);
+ ConfigWord = Proto->Configs[0];
+ // A3: scaled residual of the proto's line equation at the
+ // feature position (effectively its distance from the proto);
+ // M3: scaled angular difference between feature and proto.
+ int32_t A3 = (((Proto->A * (Feature->X - 128)) * 2)
+ - (Proto->B * (Feature->Y - 128)) + (Proto->C * 512));
+ int32_t M3 = ((static_cast<int8_t>(Feature->Theta - Proto->Angle)) *
+ kIntThetaFudge) * 2;
+
+ // ~x == -x - 1: cheap absolute value (off by one is tolerated).
+ if (A3 < 0)
+ A3 = ~A3;
+ if (M3 < 0)
+ M3 = ~M3;
+ A3 >>= mult_trunc_shift_bits_;
+ M3 >>= mult_trunc_shift_bits_;
+ if (static_cast<uint32_t>(A3) > evidence_mult_mask_)
+ A3 = evidence_mult_mask_;
+ if (static_cast<uint32_t>(M3) > evidence_mult_mask_)
+ M3 = evidence_mult_mask_;
+
+ // Squared distance indexes the similarity->evidence table.
+ uint32_t A4 = (A3 * A3) + (M3 * M3);
+ A4 >>= table_trunc_shift_bits_;
+ if (A4 > evidence_table_mask_)
+ Evidence = 0;
+ else
+ Evidence = similarity_evidence_table_[A4];
+
+ if (PrintFeatureMatchesOn (Debug))
+ IMDebugConfiguration (FeatureNum,
+ ActualProtoNum + proto_offset,
+ Evidence, ConfigWord);
+
+ ConfigWord &= *ConfigMask;
+
+ // Spread the evidence to every configuration containing this
+ // proto, keeping the maximum evidence per config.
+ uint8_t feature_evidence_index = 0;
+ uint8_t config_byte = 0;
+ while (ConfigWord != 0 || config_byte != 0) {
+ while (config_byte == 0) {
+ config_byte = ConfigWord & 0xff;
+ ConfigWord >>= 8;
+ feature_evidence_index += 8;
+ }
+ const uint8_t config_offset =
+ offset_table[config_byte] + feature_evidence_index - 8;
+ config_byte = next_table[config_byte];
+ if (Evidence > tables->feature_evidence_[config_offset])
+ tables->feature_evidence_[config_offset] = Evidence;
+ }
+
+ uint8_t ProtoIndex =
+ ClassTemplate->ProtoLengths[ActualProtoNum + proto_offset];
+ if (ProtoIndex > MAX_PROTO_INDEX) {
+ // Avoid buffer overflow.
+ // TODO: A better fix is still open.
+ ProtoIndex = MAX_PROTO_INDEX;
+ }
+ // Insertion-sort Evidence into this proto's descending list of
+ // best evidence values; the displaced value ripples down.
+ uint8_t* UINT8Pointer =
+ &(tables->proto_evidence_[ActualProtoNum + proto_offset][0]);
+ for (; Evidence > 0 && ProtoIndex > 0; ProtoIndex--, UINT8Pointer++) {
+ if (Evidence > *UINT8Pointer) {
+ uint8_t Temp = *UINT8Pointer;
+ *UINT8Pointer = Evidence;
+ Evidence = Temp;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (PrintFeatureMatchesOn(Debug)) {
+ IMDebugConfigurationSum(FeatureNum, tables->feature_evidence_,
+ ClassTemplate->NumConfigs);
+ }
+
+ // Accumulate this feature's per-config evidence into the running sums
+ // and return the total over all configs.
+ int* IntPointer = tables->sum_feature_evidence_;
+ uint8_t* UINT8Pointer = tables->feature_evidence_;
+ int SumOverConfigs = 0;
+ for (int ConfigNum = ClassTemplate->NumConfigs; ConfigNum > 0; ConfigNum--) {
+ int evidence = *UINT8Pointer++;
+ SumOverConfigs += evidence;
+ *IntPointer++ += evidence;
+ }
+ return SumOverConfigs;
+}
+
+/**
+ * Print debugging information for Configurations
+ */
+#ifndef GRAPHICS_DISABLED
+void IntegerMatcher::DebugFeatureProtoError(
+ INT_CLASS ClassTemplate,
+ BIT_VECTOR ProtoMask,
+ BIT_VECTOR ConfigMask,
+ const ScratchEvidence& tables,
+ int16_t NumFeatures,
+ int Debug) {
+ // ProtoConfigs[c] accumulates the summed proto evidence for config c.
+ float ProtoConfigs[MAX_NUM_CONFIGS];
+ int ConfigNum;
+ uint32_t ConfigWord;
+ int ProtoSetIndex;
+ uint16_t ProtoNum;
+ uint8_t ProtoWordNum;
+ PROTO_SET ProtoSet;
+ uint16_t ActualProtoNum;
+
+ if (PrintMatchSummaryOn(Debug)) {
+ tprintf("Configuration Mask:\n");
+ for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++)
+ tprintf("%1d", (((*ConfigMask) >> ConfigNum) & 1));
+ tprintf("\n");
+
+ tprintf("Feature Error for Configurations:\n");
+ // Error = 100% minus the evidence as a fraction of the maximum
+ // possible (256 per feature).
+ for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
+ tprintf(
+ " %5.1f",
+ 100.0 * (1.0 - static_cast<float>(tables.sum_feature_evidence_[ConfigNum])
+ / NumFeatures / 256.0));
+ }
+ tprintf("\n\n\n");
+ }
+
+ if (PrintMatchSummaryOn (Debug)) {
+ tprintf ("Proto Mask:\n");
+ for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets;
+ ProtoSetIndex++) {
+ ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);
+ // Two mask words per proto set, half the set's protos in each.
+ for (ProtoWordNum = 0; ProtoWordNum < 2;
+ ProtoWordNum++, ProtoMask++) {
+ ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);
+ for (ProtoNum = 0;
+ ((ProtoNum < (PROTOS_PER_PROTO_SET >> 1))
+ && (ActualProtoNum < ClassTemplate->NumProtos));
+ ProtoNum++, ActualProtoNum++)
+ tprintf ("%1d", (((*ProtoMask) >> ProtoNum) & 1));
+ tprintf ("\n");
+ }
+ }
+ tprintf ("\n");
+ }
+
+ for (int i = 0; i < ClassTemplate->NumConfigs; i++)
+ ProtoConfigs[i] = 0;
+
+ if (PrintProtoMatchesOn (Debug)) {
+ tprintf ("Proto Evidence:\n");
+ for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets;
+ ProtoSetIndex++) {
+ ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];
+ ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);
+ for (ProtoNum = 0;
+ ((ProtoNum < PROTOS_PER_PROTO_SET) &&
+ (ActualProtoNum < ClassTemplate->NumProtos));
+ ProtoNum++, ActualProtoNum++) {
+ tprintf ("P %3d =", ActualProtoNum);
+ int temp = 0;
+ for (uint8_t j = 0; j < ClassTemplate->ProtoLengths[ActualProtoNum]; j++) {
+ uint8_t data = tables.proto_evidence_[ActualProtoNum][j];
+ tprintf(" %d", data);
+ temp += data;
+ }
+
+ tprintf(" = %6.4f%%\n",
+ temp / 256.0 / ClassTemplate->ProtoLengths[ActualProtoNum]);
+
+ // Add this proto's summed evidence into every config that
+ // contains the proto (config bits read LSB-first).
+ ConfigWord = ProtoSet->Protos[ProtoNum].Configs[0];
+ ConfigNum = 0;
+ while (ConfigWord) {
+ tprintf ("%5d", ConfigWord & 1 ? temp : 0);
+ if (ConfigWord & 1)
+ ProtoConfigs[ConfigNum] += temp;
+ ConfigNum++;
+ ConfigWord >>= 1;
+ }
+ tprintf("\n");
+ }
+ }
+ }
+
+ if (PrintMatchSummaryOn (Debug)) {
+ tprintf ("Proto Error for Configurations:\n");
+ for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++)
+ tprintf (" %5.1f",
+ 100.0 * (1.0 -
+ ProtoConfigs[ConfigNum] /
+ ClassTemplate->ConfigLengths[ConfigNum] / 256.0));
+ tprintf ("\n\n");
+ }
+
+ if (PrintProtoMatchesOn (Debug)) {
+ tprintf ("Proto Sum for Configurations:\n");
+ for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++)
+ tprintf (" %4.1f", ProtoConfigs[ConfigNum] / 256.0);
+ tprintf ("\n\n");
+
+ tprintf ("Proto Length for Configurations:\n");
+ for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++)
+ tprintf (" %4.1f",
+ static_cast<float>(ClassTemplate->ConfigLengths[ConfigNum]));
+ tprintf ("\n\n");
+ }
+
+}
+
+void IntegerMatcher::DisplayProtoDebugInfo(
+ INT_CLASS ClassTemplate,
+ BIT_VECTOR ConfigMask,
+ const ScratchEvidence& tables,
+ bool SeparateDebugWindows) {
+ uint16_t ProtoNum;
+ uint16_t ActualProtoNum;
+ PROTO_SET ProtoSet;
+ int ProtoSetIndex;
+
+ InitIntMatchWindowIfReqd();
+ if (SeparateDebugWindows) {
+ InitFeatureDisplayWindowIfReqd();
+ InitProtoDisplayWindowIfReqd();
+ }
+
+ for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets;
+ ProtoSetIndex++) {
+ ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];
+ ActualProtoNum = ProtoSetIndex * PROTOS_PER_PROTO_SET;
+ for (ProtoNum = 0;
+ ((ProtoNum < PROTOS_PER_PROTO_SET) &&
+ (ActualProtoNum < ClassTemplate->NumProtos));
+ ProtoNum++, ActualProtoNum++) {
+ /* Compute Average for Actual Proto */
+ // temp: average evidence over the proto's length; scaled to [0,1]
+ // for display below.
+ int temp = 0;
+ for (uint8_t i = 0; i < ClassTemplate->ProtoLengths[ActualProtoNum]; i++)
+ temp += tables.proto_evidence_[ActualProtoNum][i];
+
+ temp /= ClassTemplate->ProtoLengths[ActualProtoNum];
+
+ // Only display protos belonging to a config selected in ConfigMask.
+ if ((ProtoSet->Protos[ProtoNum]).Configs[0] & (*ConfigMask)) {
+ DisplayIntProto(ClassTemplate, ActualProtoNum, temp / 255.0);
+ }
+ }
+ }
+}
+
+
+void IntegerMatcher::DisplayFeatureDebugInfo(
+ INT_CLASS ClassTemplate,
+ BIT_VECTOR ProtoMask,
+ BIT_VECTOR ConfigMask,
+ int16_t NumFeatures,
+ const INT_FEATURE_STRUCT* Features,
+ int AdaptFeatureThreshold,
+ int Debug,
+ bool SeparateDebugWindows) {
+ // Recomputes the evidence with a local scratch table (Debug=0 so the
+ // recomputation itself prints nothing) purely to drive the display.
+ auto *tables = new ScratchEvidence();
+
+ tables->Clear(ClassTemplate);
+
+ InitIntMatchWindowIfReqd();
+ if (SeparateDebugWindows) {
+ InitFeatureDisplayWindowIfReqd();
+ InitProtoDisplayWindowIfReqd();
+ }
+
+ for (int Feature = 0; Feature < NumFeatures; Feature++) {
+ UpdateTablesForFeature(
+ ClassTemplate, ProtoMask, ConfigMask, Feature, &Features[Feature],
+ tables, 0);
+
+ /* Find Best Evidence for Current Feature */
+ // best = maximum evidence this feature achieved in any config.
+ int best = 0;
+ assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS);
+ for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++)
+ if (tables->feature_evidence_[i] > best)
+ best = tables->feature_evidence_[i];
+
+ /* Update display for current feature */
+ if (ClipMatchEvidenceOn(Debug)) {
+ // Binary display: pass/fail against AdaptFeatureThreshold.
+ if (best < AdaptFeatureThreshold)
+ DisplayIntFeature(&Features[Feature], 0.0);
+ else
+ DisplayIntFeature(&Features[Feature], 1.0);
+ } else {
+ DisplayIntFeature(&Features[Feature], best / 255.0);
+ }
+ }
+
+ delete tables;
+}
+#endif
+
+/**
+ * Add sum of Proto Evidences into Sum Of Feature Evidence Array
+ */
+void ScratchEvidence::UpdateSumOfProtoEvidences(
+ INT_CLASS ClassTemplate, BIT_VECTOR ConfigMask) {
+
+ int *IntPointer;
+ uint32_t ConfigWord;
+ int ProtoSetIndex;
+ uint16_t ProtoNum;
+ PROTO_SET ProtoSet;
+ int NumProtos;
+ uint16_t ActualProtoNum;
+
+ NumProtos = ClassTemplate->NumProtos;
+
+ for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets;
+ ProtoSetIndex++) {
+ ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];
+ ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);
+ for (ProtoNum = 0;
+ ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < NumProtos));
+ ProtoNum++, ActualProtoNum++) {
+ // temp: total evidence recorded for this proto.
+ int temp = 0;
+ for (uint8_t i = 0; i < MAX_PROTO_INDEX &&
+ i < ClassTemplate->ProtoLengths[ActualProtoNum]; i++)
+ temp += proto_evidence_[ActualProtoNum] [i];
+
+ // Add it into the sum of every config (allowed by ConfigMask)
+ // that contains this proto; config bits are read LSB-first.
+ ConfigWord = ProtoSet->Protos[ProtoNum].Configs[0];
+ ConfigWord &= *ConfigMask;
+ IntPointer = sum_feature_evidence_;
+ while (ConfigWord) {
+ if (ConfigWord & 1)
+ *IntPointer += temp;
+ IntPointer++;
+ ConfigWord >>= 1;
+ }
+ }
+ }
+}
+
+/**
+ * Normalize Sum of Proto and Feature Evidence by dividing by the sum of
+ * the Feature Lengths and the Proto Lengths for each configuration.
+ */
+void ScratchEvidence::NormalizeSums(
+ INT_CLASS ClassTemplate, int16_t NumFeatures) {
+
+ assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS);
+ for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) {
+ // << 8 keeps fixed-point precision through the division; the divisor
+ // is the expected total (feature length + config's proto length).
+ sum_feature_evidence_[i] = (sum_feature_evidence_[i] << 8) /
+ (NumFeatures + ClassTemplate->ConfigLengths[i]);
+ }
+}
+
+/**
+ * Find the best match for the current class and update the Result
+ * with the configuration and match rating.
+ * @return The best normalized sum of evidences
+ */
+int IntegerMatcher::FindBestMatch(
+ INT_CLASS class_template,
+ const ScratchEvidence &tables,
+ UnicharRating* result) {
+ int best_match = 0;
+ result->config = 0;
+ result->fonts.clear();
+ result->fonts.reserve(class_template->NumConfigs);
+
+ /* Find best match */
+ // Every config contributes one ScoredFont entry, while the config with
+ // the highest normalized sum becomes the result's config/rating.
+ assert(class_template->NumConfigs < MAX_NUM_CONFIGS);
+ for (int c = 0; c < MAX_NUM_CONFIGS && c < class_template->NumConfigs; ++c) {
+ int rating = tables.sum_feature_evidence_[c];
+ if (*classify_debug_level_ > 2)
+ tprintf("Config %d, rating=%d\n", c, rating);
+ if (rating > best_match) {
+ result->config = c;
+ best_match = rating;
+ }
+ result->fonts.push_back(ScoredFont(c, rating));
+ }
+
+ // Compute confidence on a Probability scale.
+ // 65536 matches the two 8-bit fixed-point scalings (evidence * << 8).
+ result->rating = best_match / 65536.0f;
+
+ return best_match;
+}
+
+/**
+ * Applies the CN normalization factor to the given rating and returns
+ * the modified rating.
+ */
+float IntegerMatcher::ApplyCNCorrection(float rating, int blob_length,
+ int normalization_factor,
+ int matcher_multiplier) {
+ // Weighted average: blob_length parts matcher rating, matcher_multiplier
+ // parts CN normalization (scaled from 0-255 to 0-1 by the / 256).
+ // Guard against a zero divisor when both weights are 0.
+ int divisor = blob_length + matcher_multiplier;
+ return divisor == 0 ? 1.0f : (rating * blob_length +
+ matcher_multiplier * normalization_factor / 256.0f) / divisor;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/intmatcher.h b/tesseract/src/classify/intmatcher.h
new file mode 100644
index 00000000..8c6a1251
--- /dev/null
+++ b/tesseract/src/classify/intmatcher.h
@@ -0,0 +1,165 @@
+/******************************************************************************
+ ** Filename: intmatcher.h
+ ** Purpose: Interface to high level generic classifier routines.
+ ** Author: Robert Moss
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef INTMATCHER_H
+#define INTMATCHER_H
+
+#include "params.h"
+#include "intproto.h"
+
+namespace tesseract {
+
+// Character fragments could be present in the trained templates
+// but turned on/off on a language-by-language basis or depending
+// on particular properties of the corpus (e.g. when we expect the
+// images to have low exposure).
+extern BOOL_VAR_H(disable_character_fragments, false,
+ "Do not include character fragments in the"
+ " results of the classifier");
+
+extern INT_VAR_H(classify_integer_matcher_multiplier, 10,
+ "Integer Matcher Multiplier 0-255: ");
+
+struct UnicharRating;
+
+// Result slot for one candidate class produced by the class pruner.
+struct CP_RESULT_STRUCT {
+  CP_RESULT_STRUCT() : Rating(0.0f), Class(0) {}
+
+  float Rating;    // Pruner rating for this class.
+  CLASS_ID Class;  // Id of the rated class.
+};
+
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+
+#define SE_TABLE_BITS 9
+#define SE_TABLE_SIZE 512
+
+// Scratch evidence tables filled in while matching one class template.
+struct ScratchEvidence {
+  // Per-configuration evidence for the feature currently being scored.
+  uint8_t feature_evidence_[MAX_NUM_CONFIGS];
+  // Running per-configuration sum of feature evidence.
+  int sum_feature_evidence_[MAX_NUM_CONFIGS];
+  // Evidence entries kept per proto (MAX_PROTO_INDEX slots each).
+  uint8_t proto_evidence_[MAX_NUM_PROTOS][MAX_PROTO_INDEX];
+
+  void Clear(const INT_CLASS class_template);
+  void ClearFeatureEvidence(const INT_CLASS class_template);
+  // Normalizes sum_feature_evidence_ by feature count and config lengths.
+  void NormalizeSums(INT_CLASS ClassTemplate, int16_t NumFeatures);
+  // Folds the accumulated proto evidence into the per-config sums.
+  void UpdateSumOfProtoEvidences(
+      INT_CLASS ClassTemplate, BIT_VECTOR ConfigMask);
+};
+
+
+// Integer feature matcher: scores a blob's int features against the
+// protos/configs of an INT_CLASS template using table-driven evidence
+// arithmetic.
+class IntegerMatcher {
+ public:
+  // Integer Matcher Theta Fudge (0-255).
+  static const int kIntThetaFudge = 128;
+  // Bits in Similarity to Evidence Lookup (8-9).
+  static const int kEvidenceTableBits = 9;
+  // Integer Evidence Truncation Bits (8-14).
+  static const int kIntEvidenceTruncBits = 14;
+  // Similarity to Evidence Table Exponential Multiplier.
+  static const float kSEExponentialMultiplier;
+  // Center of Similarity Curve.
+  static const float kSimilarityCenter;
+
+  // Debug verbosity is read through *classify_debug_level at match time.
+  IntegerMatcher(tesseract::IntParam *classify_debug_level);
+
+  // Matches NumFeatures Features against the protos of ClassTemplate
+  // that are enabled in ProtoMask/ConfigMask and fills Result.
+  void Match(INT_CLASS ClassTemplate,
+             BIT_VECTOR ProtoMask,
+             BIT_VECTOR ConfigMask,
+             int16_t NumFeatures,
+             const INT_FEATURE_STRUCT* Features,
+             tesseract::UnicharRating* Result,
+             int AdaptFeatureThreshold,
+             int Debug,
+             bool SeparateDebugWindows);
+
+  // Applies the CN normalization factor to the given rating and returns
+  // the modified rating.
+  float ApplyCNCorrection(float rating, int blob_length,
+                          int normalization_factor, int matcher_multiplier);
+
+  // Collects protos whose evidence reaches AdaptProtoThreshold into
+  // ProtoArray.  NOTE(review): return-value semantics are defined in the
+  // implementation file — confirm there.
+  int FindGoodProtos(INT_CLASS ClassTemplate,
+                     BIT_VECTOR ProtoMask,
+                     BIT_VECTOR ConfigMask,
+                     int16_t NumFeatures,
+                     INT_FEATURE_ARRAY Features,
+                     PROTO_ID *ProtoArray,
+                     int AdaptProtoThreshold,
+                     int Debug);
+
+  // Collects features scoring below AdaptFeatureThreshold into
+  // FeatureArray.  NOTE(review): see implementation for return value.
+  int FindBadFeatures(INT_CLASS ClassTemplate,
+                      BIT_VECTOR ProtoMask,
+                      BIT_VECTOR ConfigMask,
+                      int16_t NumFeatures,
+                      INT_FEATURE_ARRAY Features,
+                      FEATURE_ID *FeatureArray,
+                      int AdaptFeatureThreshold,
+                      int Debug);
+
+ private:
+  // Scores one feature against all protos, updating evidence tables.
+  int UpdateTablesForFeature(
+      INT_CLASS ClassTemplate,
+      BIT_VECTOR ProtoMask,
+      BIT_VECTOR ConfigMask,
+      int FeatureNum,
+      const INT_FEATURE_STRUCT* Feature,
+      ScratchEvidence *evidence,
+      int Debug);
+
+  // Picks the best config from the normalized sums and fills Result.
+  int FindBestMatch(INT_CLASS ClassTemplate,
+                    const ScratchEvidence &tables,
+                    tesseract::UnicharRating* Result);
+
+#ifndef GRAPHICS_DISABLED
+  void DebugFeatureProtoError(
+      INT_CLASS ClassTemplate,
+      BIT_VECTOR ProtoMask,
+      BIT_VECTOR ConfigMask,
+      const ScratchEvidence &tables,
+      int16_t NumFeatures,
+      int Debug);
+
+  void DisplayProtoDebugInfo(
+      INT_CLASS ClassTemplate,
+      BIT_VECTOR ConfigMask,
+      const ScratchEvidence &tables,
+      bool SeparateDebugWindows);
+
+  void DisplayFeatureDebugInfo(
+      INT_CLASS ClassTemplate,
+      BIT_VECTOR ProtoMask,
+      BIT_VECTOR ConfigMask,
+      int16_t NumFeatures,
+      const INT_FEATURE_STRUCT* Features,
+      int AdaptFeatureThreshold,
+      int Debug,
+      bool SeparateDebugWindows);
+#endif
+
+ private:
+  tesseract::IntParam *classify_debug_level_;
+  // Lookup table mapping a similarity value to an evidence value.
+  uint8_t similarity_evidence_table_[SE_TABLE_SIZE];
+  uint32_t evidence_table_mask_;
+  uint32_t mult_trunc_shift_bits_;
+  uint32_t table_trunc_shift_bits_;
+  uint32_t evidence_mult_mask_;
+};
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/intproto.cpp b/tesseract/src/classify/intproto.cpp
new file mode 100644
index 00000000..37a92f7b
--- /dev/null
+++ b/tesseract/src/classify/intproto.cpp
@@ -0,0 +1,1743 @@
+/******************************************************************************
+ ** Filename: intproto.cpp
+ ** Purpose: Definition of data structures for integer protos.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+/*-----------------------------------------------------------------------------
+ Include Files and Type Defines
+-----------------------------------------------------------------------------*/
+
+#define _USE_MATH_DEFINES // for M_PI
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "intproto.h"
+
+#include "classify.h"
+#include "fontinfo.h"
+#include "mfoutline.h"
+#include "picofeat.h"
+#include "points.h"
+#include "shapetable.h"
+#include "svmnode.h"
+
+#include "helpers.h"
+
+#include <algorithm>
+#include <cmath> // for M_PI, std::floor
+#include <cstdio>
+#include <cassert>
+
+namespace tesseract {
+
+/* match debug display constants*/
+#define PROTO_PRUNER_SCALE (4.0)
+
+#define INT_DESCENDER (0.0 * INT_CHAR_NORM_RANGE)
+#define INT_BASELINE (0.25 * INT_CHAR_NORM_RANGE)
+#define INT_XHEIGHT (0.75 * INT_CHAR_NORM_RANGE)
+#define INT_CAPHEIGHT (1.0 * INT_CHAR_NORM_RANGE)
+
+#define INT_XCENTER (0.5 * INT_CHAR_NORM_RANGE)
+#define INT_YCENTER (0.5 * INT_CHAR_NORM_RANGE)
+#define INT_XRADIUS (0.2 * INT_CHAR_NORM_RANGE)
+#define INT_YRADIUS (0.2 * INT_CHAR_NORM_RANGE)
+#define INT_MIN_X 0
+#define INT_MIN_Y 0
+#define INT_MAX_X INT_CHAR_NORM_RANGE
+#define INT_MAX_Y INT_CHAR_NORM_RANGE
+
+/** define pad used to snap near horiz/vertical protos to horiz/vertical */
+#define HV_TOLERANCE (0.0025) /* approx 0.9 degrees */
+
+// Kinds of events the class-pruner table filler processes while sweeping
+// a proto's padded region: a fill line starting, a fill line ending, and
+// a terminating sentinel.  NOTE(review): exact semantics are defined by
+// InitTableFiller/GetNextFill below — confirm there.
+typedef enum
+{ StartSwitch, EndSwitch, LastSwitch }
+SWITCH_TYPE;
+#define MAX_NUM_SWITCHES 3
+
+// One switch event: at column X, adjust the y-line described by
+// YInit/Delta (Y gives the associated bucket).
+typedef struct
+{
+  SWITCH_TYPE Type;
+  int8_t X, Y;
+  int16_t YInit;
+  int16_t Delta;
+}
+FILL_SWITCH;
+
+// Sweep state for filling the class pruner table for one proto: current
+// column X, the y-range being filled with its per-column deltas, the
+// covered angle range, and the pending switch events.
+typedef struct
+{
+  uint8_t NextSwitch;
+  uint8_t AngleStart, AngleEnd;
+  int8_t X;
+  int16_t YStart, YEnd;
+  int16_t StartDelta, EndDelta;
+  FILL_SWITCH Switch[MAX_NUM_SWITCHES];
+}
+TABLE_FILLER;
+
+// One column of table cells to fill: column X, rows YStart..YEnd, over
+// angle buckets AngleStart..AngleEnd.
+typedef struct
+{
+  int8_t X;
+  int8_t YStart, YEnd;
+  uint8_t AngleStart, AngleEnd;
+}
+FILL_SPEC;
+
+
+/* constants for conversion from old inttemp format */
+#define OLD_MAX_NUM_CONFIGS 32
+#define OLD_WERDS_PER_CONFIG_VEC ((OLD_MAX_NUM_CONFIGS + BITS_PER_WERD - 1) /\
+ BITS_PER_WERD)
+
+/*-----------------------------------------------------------------------------
+ Macros
+-----------------------------------------------------------------------------*/
+/** macro for performing circular increments of bucket indices */
+#define CircularIncrement(i,r) (((i) < (r) - 1)?((i)++):((i) = 0))
+
+/** macro for mapping floats to ints without bounds checking */
+#define MapParam(P,O,N) (std::floor(((P) + (O)) * (N)))
+
+/*---------------------------------------------------------------------------
+ Private Function Prototypes
+----------------------------------------------------------------------------*/
+float BucketStart(int Bucket, float Offset, int NumBuckets);
+
+float BucketEnd(int Bucket, float Offset, int NumBuckets);
+
+void DoFill(FILL_SPEC *FillSpec,
+ CLASS_PRUNER_STRUCT* Pruner,
+ uint32_t ClassMask,
+ uint32_t ClassCount,
+ uint32_t WordIndex);
+
+bool FillerDone(TABLE_FILLER* Filler);
+
+void FillPPCircularBits(uint32_t
+ ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR],
+ int Bit, float Center, float Spread, bool debug);
+
+void FillPPLinearBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR],
+ int Bit, float Center, float Spread, bool debug);
+
+void GetCPPadsForLevel(int Level,
+ float *EndPad,
+ float *SidePad,
+ float *AnglePad);
+
+ScrollView::Color GetMatchColorFor(float Evidence);
+
+void GetNextFill(TABLE_FILLER *Filler, FILL_SPEC *Fill);
+
+void InitTableFiller(float EndPad,
+ float SidePad,
+ float AnglePad,
+ PROTO Proto,
+ TABLE_FILLER *Filler);
+
+#ifndef GRAPHICS_DISABLED
+void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT* Feature,
+ ScrollView::Color color);
+
+void RenderIntProto(ScrollView *window,
+ INT_CLASS Class,
+ PROTO_ID ProtoId,
+ ScrollView::Color color);
+#endif // !GRAPHICS_DISABLED
+
+/*-----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+-----------------------------------------------------------------------------*/
+
+#ifndef GRAPHICS_DISABLED
+/* global display lists used to display proto and feature match information*/
+static ScrollView* IntMatchWindow = nullptr;
+static ScrollView* FeatureDisplayWindow = nullptr;
+static ScrollView* ProtoDisplayWindow = nullptr;
+#endif
+
+/*-----------------------------------------------------------------------------
+ Variables
+-----------------------------------------------------------------------------*/
+
+/* control knobs */
+// Tunable pads for the class pruner (cp) and proto pruner (pp).  The
+// description strings fix two typos in the original text ("CLass",
+// "Proto Prune") so help output is consistent across the knob family.
+static INT_VAR(classify_num_cp_levels, 3, "Number of Class Pruner Levels");
+static double_VAR(classify_cp_angle_pad_loose, 45.0,
+                  "Class Pruner Angle Pad Loose");
+static double_VAR(classify_cp_angle_pad_medium, 20.0,
+                  "Class Pruner Angle Pad Medium");
+static double_VAR(classify_cp_angle_pad_tight, 10.0,
+                  "Class Pruner Angle Pad Tight");
+static double_VAR(classify_cp_end_pad_loose, 0.5, "Class Pruner End Pad Loose");
+static double_VAR(classify_cp_end_pad_medium, 0.5, "Class Pruner End Pad Medium");
+static double_VAR(classify_cp_end_pad_tight, 0.5, "Class Pruner End Pad Tight");
+static double_VAR(classify_cp_side_pad_loose, 2.5, "Class Pruner Side Pad Loose");
+static double_VAR(classify_cp_side_pad_medium, 1.2, "Class Pruner Side Pad Medium");
+static double_VAR(classify_cp_side_pad_tight, 0.6, "Class Pruner Side Pad Tight");
+static double_VAR(classify_pp_angle_pad, 45.0, "Proto Pruner Angle Pad");
+static double_VAR(classify_pp_end_pad, 0.5, "Proto Pruner End Pad");
+static double_VAR(classify_pp_side_pad, 2.5, "Proto Pruner Side Pad");
+
+/**
+ * This routine truncates Param to lie within the range
+ * of Min-Max inclusive.
+ *
+ * @param Param parameter value to be truncated
+ * @param Min, Max parameter limits (inclusive)
+ *
+ * @return Truncated parameter.
+ */
+static int TruncateParam(float Param, int Min, int Max) {
+  // Clamp out-of-range values to the limits; otherwise round toward
+  // negative infinity.
+  if (Param < Min) return Min;
+  if (Param > Max) return Max;
+  return static_cast<int>(std::floor(Param));
+}
+
+/*-----------------------------------------------------------------------------
+ Public Code
+-----------------------------------------------------------------------------*/
+/// Builds a feature from an FCOORD for position with all the necessary
+/// clipping and rounding: each coordinate is rounded to the nearest
+/// integer and clipped into the byte range [0, 255].
+INT_FEATURE_STRUCT::INT_FEATURE_STRUCT(const FCOORD& pos, uint8_t theta)
+  : X(ClipToRange<int16_t>(static_cast<int16_t>(pos.x() + 0.5), 0, 255)),
+    Y(ClipToRange<int16_t>(static_cast<int16_t>(pos.y() + 0.5), 0, 255)),
+    Theta(theta),
+    CP_misses(0) {
+}
+/** Builds a feature from ints with all the necessary clipping and casting
+ * into the uint8_t fields (no rounding: inputs are already integral). */
+INT_FEATURE_STRUCT::INT_FEATURE_STRUCT(int x, int y, int theta)
+  : X(static_cast<uint8_t>(ClipToRange<int>(x, 0, UINT8_MAX))),
+    Y(static_cast<uint8_t>(ClipToRange<int>(y, 0, UINT8_MAX))),
+    Theta(static_cast<uint8_t>(ClipToRange<int>(theta, 0, UINT8_MAX))),
+    CP_misses(0) {
+}
+
+/**
+ * This routine adds a new class structure to a set of
+ * templates. Classes have to be added to Templates in
+ * the order of increasing ClassIds.
+ *
+ * @param Templates templates to add new class to
+ * @param ClassId class id to associate new class with
+ * @param Class class data structure to add to templates
+ *
+ * Globals: none
+ */
+void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class) {
+  int Pruner;
+
+  assert (LegalClassId (ClassId));
+  if (ClassId != Templates->NumClasses) {
+    // Ids must arrive densely and in increasing order; anything else is a
+    // programming error, so abort rather than corrupt the templates.
+    fprintf(stderr, "Please make sure that classes are added to templates");
+    fprintf(stderr, " in increasing order of ClassIds\n");
+    exit(1);
+  }
+  ClassForClassId (Templates, ClassId) = Class;
+  Templates->NumClasses++;
+
+  // Allocate a fresh zeroed class pruner whenever the existing pruners
+  // can no longer cover all classes.
+  if (Templates->NumClasses > MaxNumClassesIn (Templates)) {
+    Pruner = Templates->NumClassPruners++;
+    Templates->ClassPruners[Pruner] = new CLASS_PRUNER_STRUCT;
+    memset(Templates->ClassPruners[Pruner], 0, sizeof(CLASS_PRUNER_STRUCT));
+  }
+} /* AddIntClass */
+
+
+/**
+ * This routine returns the index of the next free config
+ * in Class.
+ *
+ * @param Class class to add new configuration to
+ *
+ * Globals: none
+ *
+ * @return Index of next free config.
+ */
+int AddIntConfig(INT_CLASS Class) {
+  assert(Class->NumConfigs < MAX_NUM_CONFIGS);
+  // Claim the next config slot and start it with an empty total length.
+  const int index = Class->NumConfigs++;
+  Class->ConfigLengths[index] = 0;
+  return index;
+} /* AddIntConfig */
+
+
+/**
+ * This routine allocates the next free proto in Class and
+ * returns its index.
+ *
+ * @param Class class to add new proto to
+ *
+ * Globals: none
+ *
+ * @return Proto index of new proto.
+ */
+int AddIntProto(INT_CLASS Class) {
+ int Index;
+ int ProtoSetId;
+ PROTO_SET ProtoSet;
+ INT_PROTO Proto;
+ uint32_t *Word;
+
+ if (Class->NumProtos >= MAX_NUM_PROTOS)
+ return (NO_PROTO);
+
+ Index = Class->NumProtos++;
+
+ if (Class->NumProtos > MaxNumIntProtosIn(Class)) {
+ ProtoSetId = Class->NumProtoSets++;
+
+ ProtoSet = static_cast<PROTO_SET>(malloc(sizeof(PROTO_SET_STRUCT)));
+ Class->ProtoSets[ProtoSetId] = ProtoSet;
+ memset(ProtoSet, 0, sizeof(*ProtoSet));
+
+ /* reallocate space for the proto lengths and install in class */
+ Class->ProtoLengths =
+ static_cast<uint8_t *>(realloc(Class->ProtoLengths,
+ MaxNumIntProtosIn(Class) * sizeof(uint8_t)));
+ memset(&Class->ProtoLengths[Index], 0,
+ sizeof(*Class->ProtoLengths) * (MaxNumIntProtosIn(Class) - Index));
+ }
+
+ /* initialize proto so its length is zero and it isn't in any configs */
+ Class->ProtoLengths[Index] = 0;
+ Proto = ProtoForProtoId (Class, Index);
+ for (Word = Proto->Configs;
+ Word < Proto->Configs + WERDS_PER_CONFIG_VEC; *Word++ = 0);
+
+ return (Index);
+}
+
+/**
+ * This routine adds Proto to the class pruning tables
+ * for the specified class in Templates.
+ *
+ * Globals:
+ * - classify_num_cp_levels number of levels used in the class pruner
+ * @param Proto floating-pt proto to add to class pruner
+ * @param ClassId class id corresponding to Proto
+ * @param Templates set of templates containing class pruner
+ */
+void AddProtoToClassPruner (PROTO Proto, CLASS_ID ClassId,
+                            INT_TEMPLATES Templates)
+// MAX_LEVEL is the largest per-cell count the pruner can record; the
+// macro is deliberately defined between signature and body as in the
+// original code.
+#define MAX_LEVEL 2
+{
+  CLASS_PRUNER_STRUCT* Pruner;
+  uint32_t ClassMask;
+  uint32_t ClassCount;
+  uint32_t WordIndex;
+  int Level;
+  float EndPad, SidePad, AnglePad;
+  TABLE_FILLER TableFiller;
+  FILL_SPEC FillSpec;
+
+  Pruner = CPrunerFor (Templates, ClassId);
+  WordIndex = CPrunerWordIndexFor (ClassId);
+  ClassMask = CPrunerMaskFor (MAX_LEVEL, ClassId);
+
+  // Fill from the tightest pads (highest level) down to the loosest.
+  for (Level = classify_num_cp_levels - 1; Level >= 0; Level--) {
+    GetCPPadsForLevel(Level, &EndPad, &SidePad, &AnglePad);
+    // Despite the name, ClassCount holds the level value encoded at this
+    // class's bit position (built by the same CPrunerMaskFor helper).
+    ClassCount = CPrunerMaskFor (Level, ClassId);
+    InitTableFiller(EndPad, SidePad, AnglePad, Proto, &TableFiller);
+
+    // Sweep the filler over every cell covered by the padded proto.
+    while (!FillerDone (&TableFiller)) {
+      GetNextFill(&TableFiller, &FillSpec);
+      DoFill(&FillSpec, Pruner, ClassMask, ClassCount, WordIndex);
+    }
+  }
+} /* AddProtoToClassPruner */
+
+/**
+ * This routine updates the proto pruner lookup tables
+ * for Class to include a new proto identified by ProtoId
+ * and described by Proto.
+ * @param Proto floating-pt proto to be added to proto pruner
+ * @param ProtoId id of proto
+ * @param Class integer class that contains desired proto pruner
+ * @param debug debug flag
+ * @note Globals: none
+ */
+void AddProtoToProtoPruner(PROTO Proto, int ProtoId,
+                           INT_CLASS Class, bool debug) {
+  float Angle, X, Y, Length;
+  float Pad;
+  int Index;
+  PROTO_SET ProtoSet;
+
+  // Report then assert: the tprintf gives context before the abort.
+  if (ProtoId >= Class->NumProtos)
+    tprintf("AddProtoToProtoPruner:assert failed: %d < %d",
+            ProtoId, Class->NumProtos);
+  assert(ProtoId < Class->NumProtos);
+
+  Index = IndexForProto (ProtoId);
+  ProtoSet = Class->ProtoSets[SetForProto (ProtoId)];
+
+  // Proto->Angle is in revolutions (converted to radians below); the
+  // angle pruner is circular, padded by classify_pp_angle_pad degrees.
+  Angle = Proto->Angle;
+#ifndef _WIN32
+  assert(!std::isnan(Angle));
+#endif
+
+  FillPPCircularBits (ProtoSet->ProtoPruner[PRUNER_ANGLE], Index,
+                      Angle + ANGLE_SHIFT, classify_pp_angle_pad / 360.0,
+                      debug);
+
+  Angle *= 2.0 * M_PI;
+  Length = Proto->Length;
+
+  // X pad: the larger of the end pad projected along the proto direction
+  // and the side pad projected across it.
+  X = Proto->X + X_SHIFT;
+  Pad = std::max(fabs (cos (Angle)) * (Length / 2.0 +
+                                       classify_pp_end_pad *
+                                       GetPicoFeatureLength ()),
+                 fabs (sin (Angle)) * (classify_pp_side_pad *
+                                       GetPicoFeatureLength ()));
+
+  FillPPLinearBits(ProtoSet->ProtoPruner[PRUNER_X], Index, X, Pad, debug);
+
+  // Y pad: same construction with sin and cos exchanged.
+  Y = Proto->Y + Y_SHIFT;
+  Pad = std::max(fabs (sin (Angle)) * (Length / 2.0 +
+                                       classify_pp_end_pad *
+                                       GetPicoFeatureLength ()),
+                 fabs (cos (Angle)) * (classify_pp_side_pad *
+                                       GetPicoFeatureLength ()));
+
+  FillPPLinearBits(ProtoSet->ProtoPruner[PRUNER_Y], Index, Y, Pad, debug);
+} /* AddProtoToProtoPruner */
+
+/**
+ * Returns a quantized bucket for the given param shifted by offset,
+ * notionally (param + offset) * num_buckets, but clipped and casted to the
+ * appropriate type.
+ */
+uint8_t Bucket8For(float param, float offset, int num_buckets) {
+  // Map to a raw bucket index, then clip into [0, num_buckets - 1].
+  const int raw = IntCastRounded(MapParam(param, offset, num_buckets));
+  return static_cast<uint8_t>(ClipToRange<int>(raw, 0, num_buckets - 1));
+}
+uint16_t Bucket16For(float param, float offset, int num_buckets) {
+  // Identical to Bucket8For except for the wider return type.
+  const int raw = IntCastRounded(MapParam(param, offset, num_buckets));
+  return static_cast<uint16_t>(ClipToRange<int>(raw, 0, num_buckets - 1));
+}
+
+/**
+ * Returns a quantized bucket for the given circular param shifted by offset,
+ * notionally (param + offset) * num_buckets, but modded and casted to the
+ * appropriate type.
+ */
+uint8_t CircBucketFor(float param, float offset, int num_buckets) {
+  // Wrap (rather than clip) the raw index for circular parameters.
+  const int raw = IntCastRounded(MapParam(param, offset, num_buckets));
+  return static_cast<uint8_t>(Modulo(raw, num_buckets));
+} /* CircBucketFor */
+
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * This routine redraws the global match display window, if one exists,
+ * so that recently rendered features and protos become visible.
+ * (The old header text claimed it clears the display lists, which does
+ * not match the code.)
+ *
+ * Globals:
+ * - IntMatchWindow the match display window
+ */
+void UpdateMatchDisplay() {
+  if (IntMatchWindow != nullptr)
+    IntMatchWindow->Update();
+} /* UpdateMatchDisplay */
+
+/**
+ * This operation updates the config vectors of all protos
+ * in Class to indicate that the protos with 1's in Config
+ * belong to a new configuration identified by ConfigId.
+ * It is assumed that the length of the Config bit vector is
+ * equal to the number of protos in Class.
+ * @param Config config to be added to class
+ * @param ConfigId id to be used for new config
+ * @param Class class to add new config to
+ */
+void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class) {
+  int total_length = 0;
+  // Mark every proto selected in Config as a member of ConfigId and
+  // accumulate the summed proto length for the new configuration.
+  for (int proto_id = 0; proto_id < Class->NumProtos; proto_id++) {
+    if (!test_bit(Config, proto_id)) continue;
+    INT_PROTO proto = ProtoForProtoId(Class, proto_id);
+    SET_BIT(proto->Configs, ConfigId);
+    total_length += Class->ProtoLengths[proto_id];
+  }
+  Class->ConfigLengths[ConfigId] = total_length;
+} /* ConvertConfig */
+
+/**
+ * This routine converts Proto to integer format and
+ * installs it as ProtoId in Class.  The float line parameters are
+ * quantized to fixed point: A and C are scaled by 128 into [-128,127],
+ * B (sign-flipped) by 256 into [0,255], and the angle by 256 into one
+ * byte.
+ * @param Proto floating-pt proto to be converted to integer format
+ * @param ProtoId id of proto
+ * @param Class integer class to add converted proto to
+ */
+void Classify::ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class) {
+  assert(ProtoId < Class->NumProtos);
+
+  INT_PROTO P = ProtoForProtoId(Class, ProtoId);
+
+  float Param = Proto->A * 128;
+  P->A = TruncateParam(Param, -128, 127);
+
+  // Note the sign flip on B before scaling.
+  Param = -Proto->B * 256;
+  P->B = TruncateParam(Param, 0, 255);
+
+  Param = Proto->C * 128;
+  P->C = TruncateParam(Param, -128, 127);
+
+  // Out-of-range angles collapse to 0 rather than clipping.
+  Param = Proto->Angle * 256;
+  if (Param < 0 || Param >= 256)
+    P->Angle = 0;
+  else
+    P->Angle = static_cast<uint8_t>(Param);
+
+  /* round proto length to nearest integer number of pico-features */
+  Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;
+  Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255);
+  if (classify_learning_debug_level >= 2)
+    tprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)",
+            P->A, P->B, P->C, Class->ProtoLengths[ProtoId]);
+} /* ConvertProto */
+
+/**
+ * This routine converts from the old floating point format
+ * to the new integer format.
+ * @param FloatProtos prototypes in old floating pt format
+ * @param target_unicharset the UNICHARSET to use
+ * @return New set of training templates in integer format.
+ * @note Globals: none
+ */
+INT_TEMPLATES Classify::CreateIntTemplates(CLASSES FloatProtos,
+                                           const UNICHARSET&
+                                           target_unicharset) {
+  INT_TEMPLATES IntTemplates;
+  CLASS_TYPE FClass;
+  INT_CLASS IClass;
+  int ClassId;
+  int ProtoId;
+  int ConfigId;
+
+  IntTemplates = NewIntTemplates();
+
+  for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
+    FClass = &(FloatProtos[ClassId]);
+    // Warn on empty classes; the space character is legitimately empty.
+    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
+        strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
+      tprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
+              target_unicharset.id_to_unichar(ClassId));
+    }
+    assert(UnusedClassIdIn(IntTemplates, ClassId));
+    IClass = NewIntClass(FClass->NumProtos, FClass->NumConfigs);
+    // Intern the font set in fontset_table_: if an identical set already
+    // exists, reuse its id and free our copy; otherwise push_back hands
+    // ownership of fs.configs to the table.
+    FontSet fs;
+    fs.size = FClass->font_set.size();
+    fs.configs = new int[fs.size];
+    for (int i = 0; i < fs.size; ++i) {
+      fs.configs[i] = FClass->font_set.get(i);
+    }
+    if (this->fontset_table_.contains(fs)) {
+      IClass->font_set_id = this->fontset_table_.get_id(fs);
+      delete[] fs.configs;
+    } else {
+      IClass->font_set_id = this->fontset_table_.push_back(fs);
+    }
+    AddIntClass(IntTemplates, ClassId, IClass);
+
+    // Convert each float proto and index it in both pruners.
+    for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
+      AddIntProto(IClass);
+      ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
+      AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
+                            classify_learning_debug_level >= 2);
+      AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
+    }
+
+    // Convert each configuration's proto membership bit vector.
+    for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
+      AddIntConfig(IClass);
+      ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
+    }
+  }
+  return (IntTemplates);
+} /* CreateIntTemplates */
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * This routine renders the specified feature into a
+ * global display list.
+ *
+ * Globals:
+ * - FeatureShapes global display list for features
+ * @param Feature pico-feature to be displayed
+ * @param Evidence best evidence for this feature (0-1)
+ */
+void DisplayIntFeature(const INT_FEATURE_STRUCT *Feature, float Evidence) {
+  // The color encodes the strength of the evidence for this feature.
+  const ScrollView::Color color = GetMatchColorFor(Evidence);
+  RenderIntFeature(IntMatchWindow, Feature, color);
+  if (FeatureDisplayWindow != nullptr) {
+    RenderIntFeature(FeatureDisplayWindow, Feature, color);
+  }
+} /* DisplayIntFeature */
+
+/**
+ * This routine renders the specified proto into a
+ * global display list.
+ *
+ * Globals:
+ * - ProtoShapes global display list for protos
+ * @param Class class to take proto from
+ * @param ProtoId id of proto in Class to be displayed
+ * @param Evidence total evidence for proto (0-1)
+ */
+void DisplayIntProto(INT_CLASS Class, PROTO_ID ProtoId, float Evidence) {
+  // The color encodes the total evidence for this proto.
+  const ScrollView::Color color = GetMatchColorFor(Evidence);
+  RenderIntProto(IntMatchWindow, Class, ProtoId, color);
+  if (ProtoDisplayWindow != nullptr) {
+    RenderIntProto(ProtoDisplayWindow, Class, ProtoId, color);
+  }
+} /* DisplayIntProto */
+#endif
+
+/**
+ * This routine creates a new integer class data structure
+ * and returns it. Sufficient space is allocated
+ * to handle the specified number of protos and configs.
+ * @param MaxNumProtos number of protos to allocate space for
+ * @param MaxNumConfigs number of configs to allocate space for
+ * @return New class created.
+ * @note Globals: none
+ */
+INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs) {
+  INT_CLASS Class;
+  PROTO_SET ProtoSet;
+  int i;
+
+  assert(MaxNumConfigs <= MAX_NUM_CONFIGS);
+
+  Class = static_cast<INT_CLASS>(malloc(sizeof(INT_CLASS_STRUCT)));
+  // Round the proto capacity up to whole proto sets.
+  Class->NumProtoSets = ((MaxNumProtos + PROTOS_PER_PROTO_SET - 1) /
+                         PROTOS_PER_PROTO_SET);
+
+  assert(Class->NumProtoSets <= MAX_NUM_PROTO_SETS);
+
+  Class->NumProtos = 0;
+  Class->NumConfigs = 0;
+
+  for (i = 0; i < Class->NumProtoSets; i++) {
+    /* allocate space for a proto set, install in class, and initialize */
+    ProtoSet = static_cast<PROTO_SET>(malloc(sizeof(PROTO_SET_STRUCT)));
+    memset(ProtoSet, 0, sizeof(*ProtoSet));
+    Class->ProtoSets[i] = ProtoSet;
+  }
+  /* allocate space for the proto lengths and install in class */
+  if (MaxNumIntProtosIn (Class) > 0) {
+    Class->ProtoLengths =
+        static_cast<uint8_t *>(malloc(MaxNumIntProtosIn (Class) * sizeof (uint8_t)));
+    memset(Class->ProtoLengths, 0,
+           MaxNumIntProtosIn(Class) * sizeof(*Class->ProtoLengths));
+  } else {
+    Class->ProtoLengths = nullptr;
+  }
+  memset(Class->ConfigLengths, 0, sizeof(Class->ConfigLengths));
+
+  // NOTE(review): font_set_id is not initialized here; callers such as
+  // CreateIntTemplates set it — confirm no path reads it first.
+  return (Class);
+
+} /* NewIntClass */
+
+static void free_int_class(INT_CLASS int_class) {
+  // Release every proto set, then the length array, then the class.
+  for (int i = 0; i < int_class->NumProtoSets; i++) {
+    free(int_class->ProtoSets[i]);
+  }
+  free(int_class->ProtoLengths);  // free(nullptr) is a no-op.
+  free(int_class);
+}
+
+/**
+ * This routine allocates a new set of integer templates
+ * initialized to hold 0 classes.
+ * @return The integer templates created.
+ * @note Globals: none
+ */
+INT_TEMPLATES NewIntTemplates() {
+  auto T = static_cast<INT_TEMPLATES>(malloc(sizeof(INT_TEMPLATES_STRUCT)));
+  T->NumClasses = 0;
+  T->NumClassPruners = 0;
+
+  // Start with an empty class table.
+  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
+    ClassForClassId (T, i) = nullptr;
+  }
+
+  return T;
+} /* NewIntTemplates */
+
+
+/*---------------------------------------------------------------------------*/
+void free_int_templates(INT_TEMPLATES templates) {
+  // Classes were malloc'd, pruners were new'd: free vs delete matters.
+  for (int i = 0; i < templates->NumClasses; i++) {
+    free_int_class(templates->Class[i]);
+  }
+  for (int i = 0; i < templates->NumClassPruners; i++) {
+    delete templates->ClassPruners[i];
+  }
+  free(templates);
+}
+
+/**
+ * This routine reads a set of integer templates from
+ * File. File must already be open and must be in the
+ * correct binary format.
+ * @param fp open file to read templates from
+ * @return Pointer to integer templates read from File.
+ * @note Globals: none
+ */
+INT_TEMPLATES Classify::ReadIntTemplates(TFile *fp) {
+ int i, j, w, x, y, z;
+ int unicharset_size;
+ int version_id = 0;
+ INT_TEMPLATES Templates;
+ CLASS_PRUNER_STRUCT* Pruner;
+ INT_CLASS Class;
+ uint8_t *Lengths;
+ PROTO_SET ProtoSet;
+
+ /* variables for conversion from older inttemp formats */
+ int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
+ CLASS_ID class_id, max_class_id;
+ auto *IndexFor = new int16_t[MAX_NUM_CLASSES];
+ auto *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
+ auto **TempClassPruner =
+ new CLASS_PRUNER_STRUCT*[MAX_NUM_CLASS_PRUNERS];
+ uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
+ (1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
+ uint32_t Mask, NewMask, ClassBits;
+ int MaxNumConfigs = MAX_NUM_CONFIGS;
+ int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
+
+ /* first read the high level template struct */
+ Templates = NewIntTemplates();
+ // Read Templates in parts for 64 bit compatibility.
+ if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1)
+ tprintf("Bad read of inttemp!\n");
+ if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
+ 1) != 1 ||
+ fp->FReadEndian(&Templates->NumClassPruners,
+ sizeof(Templates->NumClassPruners), 1) != 1)
+ tprintf("Bad read of inttemp!\n");
+ if (Templates->NumClasses < 0) {
+ // This file has a version id!
+ version_id = -Templates->NumClasses;
+ if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
+ 1) != 1)
+ tprintf("Bad read of inttemp!\n");
+ }
+
+ if (version_id < 3) {
+ MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
+ WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
+ }
+
+ if (version_id < 2) {
+ if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size) !=
+ unicharset_size) {
+ tprintf("Bad read of inttemp!\n");
+ }
+ if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]),
+ Templates->NumClasses) != Templates->NumClasses) {
+ tprintf("Bad read of inttemp!\n");
+ }
+ }
+
+ /* then read in the class pruners */
+ const int kNumBuckets =
+ NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;
+ for (i = 0; i < Templates->NumClassPruners; i++) {
+ Pruner = new CLASS_PRUNER_STRUCT;
+ if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) !=
+ kNumBuckets) {
+ tprintf("Bad read of inttemp!\n");
+ }
+ if (version_id < 2) {
+ TempClassPruner[i] = Pruner;
+ } else {
+ Templates->ClassPruners[i] = Pruner;
+ }
+ }
+
+ /* fix class pruners if they came from an old version of inttemp */
+ if (version_id < 2) {
+ // Allocate enough class pruners to cover all the class ids.
+ max_class_id = 0;
+ for (i = 0; i < Templates->NumClasses; i++)
+ if (ClassIdFor[i] > max_class_id)
+ max_class_id = ClassIdFor[i];
+ for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
+ Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
+ memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
+ }
+ // Convert class pruners from the old format (indexed by class index)
+ // to the new format (indexed by class id).
+ last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
+ for (i = 0; i < Templates->NumClassPruners; i++) {
+ for (x = 0; x < NUM_CP_BUCKETS; x++)
+ for (y = 0; y < NUM_CP_BUCKETS; y++)
+ for (z = 0; z < NUM_CP_BUCKETS; z++)
+ for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
+ if (TempClassPruner[i]->p[x][y][z][w] == 0)
+ continue;
+ for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
+ bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
+ if (bit_number > last_cp_bit_number)
+ break; // the rest of the bits in this word are not used
+ class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
+ // Single out NUM_BITS_PER_CLASS bits relating to class_id.
+ Mask = SetBitsForMask << b;
+ ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
+ // Move these bits to the new position in which they should
+ // appear (indexed corresponding to the class_id).
+ new_i = CPrunerIdFor(class_id);
+ new_w = CPrunerWordIndexFor(class_id);
+ new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
+ if (new_b > b) {
+ ClassBits <<= (new_b - b);
+ } else {
+ ClassBits >>= (b - new_b);
+ }
+ // Copy bits relating to class_id to the correct position
+ // in Templates->ClassPruner.
+ NewMask = SetBitsForMask << new_b;
+ Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
+ Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
+ }
+ }
+ }
+ for (i = 0; i < Templates->NumClassPruners; i++) {
+ delete TempClassPruner[i];
+ }
+ }
+
+ /* then read in each class */
+ for (i = 0; i < Templates->NumClasses; i++) {
+ /* first read in the high level struct for the class */
+ Class = static_cast<INT_CLASS>(malloc (sizeof (INT_CLASS_STRUCT)));
+ if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
+ fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
+ fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1)
+ tprintf("Bad read of inttemp!\n");
+ if (version_id == 0) {
+ // Only version 0 writes 5 pointless pointers to the file.
+ for (j = 0; j < 5; ++j) {
+ int32_t junk;
+ if (fp->FRead(&junk, sizeof(junk), 1) != 1)
+ tprintf("Bad read of inttemp!\n");
+ }
+ }
+ int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
+ ASSERT_HOST(num_configs <= MaxNumConfigs);
+ if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) !=
+ num_configs) {
+ tprintf("Bad read of inttemp!\n");
+ }
+ if (version_id < 2) {
+ ClassForClassId (Templates, ClassIdFor[i]) = Class;
+ } else {
+ ClassForClassId (Templates, i) = Class;
+ }
+
+ /* then read in the proto lengths */
+ Lengths = nullptr;
+ if (MaxNumIntProtosIn (Class) > 0) {
+ Lengths = static_cast<uint8_t *>(malloc(sizeof(uint8_t) * MaxNumIntProtosIn(Class)));
+ if (fp->FRead(Lengths, sizeof(uint8_t), MaxNumIntProtosIn(Class)) !=
+ MaxNumIntProtosIn(Class))
+ tprintf("Bad read of inttemp!\n");
+ }
+ Class->ProtoLengths = Lengths;
+
+ /* then read in the proto sets */
+ for (j = 0; j < Class->NumProtoSets; j++) {
+ ProtoSet = static_cast<PROTO_SET>(malloc(sizeof(PROTO_SET_STRUCT)));
+ int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
+ if (fp->FReadEndian(&ProtoSet->ProtoPruner,
+ sizeof(ProtoSet->ProtoPruner[0][0][0]),
+ num_buckets) != num_buckets)
+ tprintf("Bad read of inttemp!\n");
+ for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
+ if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A),
+ 1) != 1 ||
+ fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B),
+ 1) != 1 ||
+ fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C),
+ 1) != 1 ||
+ fp->FRead(&ProtoSet->Protos[x].Angle,
+ sizeof(ProtoSet->Protos[x].Angle), 1) != 1)
+ tprintf("Bad read of inttemp!\n");
+ if (fp->FReadEndian(&ProtoSet->Protos[x].Configs,
+ sizeof(ProtoSet->Protos[x].Configs[0]),
+ WerdsPerConfigVec) != WerdsPerConfigVec)
+ tprintf("Bad read of inttemp!\n");
+ }
+ Class->ProtoSets[j] = ProtoSet;
+ }
+ if (version_id < 4) {
+ Class->font_set_id = -1;
+ } else {
+ fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1);
+ }
+ }
+
+ if (version_id < 2) {
+ /* add an empty nullptr class with class id 0 */
+ assert(UnusedClassIdIn (Templates, 0));
+ ClassForClassId (Templates, 0) = NewIntClass (1, 1);
+ ClassForClassId (Templates, 0)->font_set_id = -1;
+ Templates->NumClasses++;
+ /* make sure the classes are contiguous */
+ for (i = 0; i < MAX_NUM_CLASSES; i++) {
+ if (i < Templates->NumClasses) {
+ if (ClassForClassId (Templates, i) == nullptr) {
+ fprintf(stderr, "Non-contiguous class ids in inttemp\n");
+ exit(1);
+ }
+ } else {
+ if (ClassForClassId (Templates, i) != nullptr) {
+ fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n",
+ i, Templates->NumClasses);
+ exit(1);
+ }
+ }
+ }
+ }
+ if (version_id >= 4) {
+ using namespace std::placeholders; // for _1, _2
+ this->fontinfo_table_.read(fp, std::bind(read_info, _1, _2));
+ if (version_id >= 5) {
+ this->fontinfo_table_.read(fp,
+ std::bind(read_spacing_info, _1, _2));
+ }
+ this->fontset_table_.read(fp, std::bind(read_set, _1, _2));
+ }
+
+ // Clean up.
+ delete[] IndexFor;
+ delete[] ClassIdFor;
+ delete[] TempClassPruner;
+
+ return (Templates);
+} /* ReadIntTemplates */
+
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * This routine sends the shapes in the global display
+ * lists to the match debugger window.
+ *
+ * Globals:
+ * - FeatureShapes display list containing feature matches
+ * - ProtoShapes display list containing proto matches
+ */
+void Classify::ShowMatchDisplay() {
+  // Ensure the match window exists before it is used unconditionally below;
+  // the proto/feature windows are optional and therefore null-checked.
+  InitIntMatchWindowIfReqd();
+  if (ProtoDisplayWindow) {
+    ProtoDisplayWindow->Clear();
+  }
+  if (FeatureDisplayWindow) {
+    FeatureDisplayWindow->Clear();
+  }
+  ClearFeatureSpaceWindow(
+      static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
+      IntMatchWindow);
+  // Zoom every active window to the full feature-space extent.
+  IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
+                                  INT_MAX_X, INT_MAX_Y);
+  if (ProtoDisplayWindow) {
+    ProtoDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
+                                        INT_MAX_X, INT_MAX_Y);
+  }
+  if (FeatureDisplayWindow) {
+    FeatureDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
+                                          INT_MAX_X, INT_MAX_Y);
+  }
+} /* ShowMatchDisplay */
+
+/// Clears the given window and draws the featurespace guides for the
+/// appropriate normalization method.
+void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView* window) {
+  window->Clear();
+
+  window->Pen(ScrollView::GREY);
+  // Draw the feature space limit rectangle.
+  window->Rectangle(0, 0, INT_MAX_X, INT_MAX_Y);
+  if (norm_method == baseline) {
+    // Baseline normalization: draw horizontal guide lines at the descender,
+    // baseline, x-height and cap-height levels.
+    window->SetCursor(0, INT_DESCENDER);
+    window->DrawTo(INT_MAX_X, INT_DESCENDER);
+    window->SetCursor(0, INT_BASELINE);
+    window->DrawTo(INT_MAX_X, INT_BASELINE);
+    window->SetCursor(0, INT_XHEIGHT);
+    window->DrawTo(INT_MAX_X, INT_XHEIGHT);
+    window->SetCursor(0, INT_CAPHEIGHT);
+    window->DrawTo(INT_MAX_X, INT_CAPHEIGHT);
+  } else {
+    // Character normalization: draw the centered box bounding the
+    // normalized character area instead of baseline guides.
+    window->Rectangle(INT_XCENTER - INT_XRADIUS, INT_YCENTER - INT_YRADIUS,
+                      INT_XCENTER + INT_XRADIUS, INT_YCENTER + INT_YRADIUS);
+  }
+}
+#endif
+
+/**
+ * This routine writes Templates to File. The format
+ * is an efficient binary format. File must already be open
+ * for writing.
+ * @param File open file to write templates to
+ * @param Templates templates to save into File
+ * @param target_unicharset the UNICHARSET to use
+ */
+void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
+                                 const UNICHARSET& target_unicharset) {
+  int i, j;
+  INT_CLASS Class;
+  int unicharset_size = target_unicharset.size();
+  int version_id = -5; // When negated by the reader -1 becomes +1 etc.
+
+  // A mismatch here is not fatal, but indicates the templates were built
+  // against a different unicharset than the one being saved with.
+  if (Templates->NumClasses != unicharset_size) {
+    tprintf("Warning: executing WriteIntTemplates() with %d classes in"
+            " Templates, while target_unicharset size is %d\n",
+            Templates->NumClasses, unicharset_size);
+  }
+
+  /* first write the high level template struct */
+  fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
+  fwrite(&version_id, sizeof(version_id), 1, File);
+  fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners),
+         1, File);
+  fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
+
+  /* then write out the class pruners */
+  for (i = 0; i < Templates->NumClassPruners; i++)
+    fwrite(Templates->ClassPruners[i],
+           sizeof(CLASS_PRUNER_STRUCT), 1, File);
+
+  /* then write out each class */
+  for (i = 0; i < Templates->NumClasses; i++) {
+    Class = Templates->Class[i];
+
+    /* first write out the high level struct for the class */
+    fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
+    fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
+    // Invariant: the class's config count must equal the size of its font set.
+    ASSERT_HOST(Class->NumConfigs == this->fontset_table_.get(Class->font_set_id).size);
+    fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
+    // Version 4+ format: only the NumConfigs used entries are written,
+    // not the full MAX_NUM_CONFIGS array.
+    for (j = 0; j < Class->NumConfigs; ++j) {
+      fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
+    }
+
+    /* then write out the proto lengths */
+    if (MaxNumIntProtosIn (Class) > 0) {
+      fwrite(Class->ProtoLengths, sizeof(uint8_t),
+             MaxNumIntProtosIn(Class), File);
+    }
+
+    /* then write out the proto sets */
+    for (j = 0; j < Class->NumProtoSets; j++)
+      fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
+
+    /* then write the fonts info */
+    fwrite(&Class->font_set_id, sizeof(int), 1, File);
+  }
+
+  /* Write the fonts info tables */
+  using namespace std::placeholders; // for _1, _2
+  this->fontinfo_table_.write(File, std::bind(write_info, _1, _2));
+  this->fontinfo_table_.write(File,
+                              std::bind(write_spacing_info, _1, _2));
+  this->fontset_table_.write(File, std::bind(write_set, _1, _2));
+} /* WriteIntTemplates */
+
+/*-----------------------------------------------------------------------------
+ Private Code
+-----------------------------------------------------------------------------*/
+/**
+ * This routine returns the parameter value which
+ * corresponds to the beginning of the specified bucket.
+ * The bucket number should have been generated using the
+ * BucketFor() function with parameters Offset and NumBuckets.
+ * @param Bucket bucket whose start is to be computed
+ * @param Offset offset used to map params to buckets
+ * @param NumBuckets total number of buckets
+ * @return Param value corresponding to start position of Bucket.
+ * @note Globals: none
+ */
+float BucketStart(int Bucket, float Offset, int NumBuckets) {
+  // Buckets uniformly partition the shifted param range [-Offset, 1-Offset).
+  return ((static_cast<float>(Bucket) / NumBuckets) - Offset);
+
+} /* BucketStart */
+
+/**
+ * This routine returns the parameter value which
+ * corresponds to the end of the specified bucket.
+ * The bucket number should have been generated using the
+ * BucketFor() function with parameters Offset and NumBuckets.
+ * @param Bucket bucket whose end is to be computed
+ * @param Offset offset used to map params to buckets
+ * @param NumBuckets total number of buckets
+ * @return Param value corresponding to end position of Bucket.
+ * @note Globals: none
+ */
+float BucketEnd(int Bucket, float Offset, int NumBuckets) {
+  // End of bucket i is the start of bucket i+1 (see BucketStart above).
+  return ((static_cast<float>(Bucket + 1) / NumBuckets) - Offset);
+} /* BucketEnd */
+
+/**
+ * This routine fills in the section of a class pruner
+ * corresponding to a single x value for a single proto of
+ * a class.
+ * @param FillSpec specifies which bits to fill in pruner
+ * @param Pruner class pruner to be filled
+ * @param ClassMask indicates which bits to change in each word
+ * @param ClassCount indicates what to change bits to
+ * @param WordIndex indicates which word to change
+ */
+void DoFill(FILL_SPEC *FillSpec,
+            CLASS_PRUNER_STRUCT* Pruner,
+            uint32_t ClassMask,
+            uint32_t ClassCount,
+            uint32_t WordIndex) {
+  int X, Y, Angle;
+  uint32_t OldWord;
+
+  // Clip the X bucket to the pruner table bounds.
+  X = FillSpec->X;
+  if (X < 0)
+    X = 0;
+  if (X >= NUM_CP_BUCKETS)
+    X = NUM_CP_BUCKETS - 1;
+
+  // Clip the Y bucket range to the pruner table bounds (modifies FillSpec).
+  if (FillSpec->YStart < 0)
+    FillSpec->YStart = 0;
+  if (FillSpec->YEnd >= NUM_CP_BUCKETS)
+    FillSpec->YEnd = NUM_CP_BUCKETS - 1;
+
+  // The angle dimension is circular, so iterate with wrap-around until
+  // AngleEnd has been processed (inclusive).
+  for (Y = FillSpec->YStart; Y <= FillSpec->YEnd; Y++)
+    for (Angle = FillSpec->AngleStart; ;
+         CircularIncrement(Angle, NUM_CP_BUCKETS)) {
+      OldWord = Pruner->p[X][Y][Angle][WordIndex];
+      // Only ever raise the stored count for this class, never lower it.
+      if (ClassCount > (OldWord & ClassMask)) {
+        OldWord &= ~ClassMask;
+        OldWord |= ClassCount;
+        Pruner->p[X][Y][Angle][WordIndex] = OldWord;
+      }
+      if (Angle == FillSpec->AngleEnd)
+        break;
+    }
+} /* DoFill */
+
+/**
+ * Return true if the specified table filler is done, i.e.
+ * if it has no more lines to fill.
+ * @param Filler table filler to check if done
+ * @return true if no more lines to fill, false otherwise.
+ * @note Globals: none
+ */
+bool FillerDone(TABLE_FILLER* Filler) {
+  FILL_SWITCH *Next;
+
+  Next = &(Filler->Switch[Filler->NextSwitch]);
+
+  // Done once the current X has moved past the final (LastSwitch) entry.
+  return Filler->X > Next->X && Next->Type == LastSwitch;
+
+} /* FillerDone */
+
+/**
+ * This routine sets Bit in each bit vector whose
+ * bucket lies within the range Center +- Spread. The fill
+ * is done for a circular dimension, i.e. bucket 0 is adjacent
+ * to the last bucket. It is assumed that Center and Spread
+ * are expressed in a circular coordinate system whose range
+ * is 0 to 1.
+ * @param ParamTable table of bit vectors, one per param bucket
+ * @param Bit bit position in vectors to be filled
+ * @param Center center of filled area
+ * @param Spread spread of filled area
+ * @param debug debug flag
+ */
+void FillPPCircularBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR],
+                        int Bit, float Center, float Spread, bool debug) {
+  int i, FirstBucket, LastBucket;
+
+  // Clamp to a half circle so the fill cannot wrap past itself and
+  // cover the whole table more than once.
+  if (Spread > 0.5)
+    Spread = 0.5;
+
+  // Wrap a negative start bucket around to the top of the circular range.
+  FirstBucket = static_cast<int>(std::floor((Center - Spread) * NUM_PP_BUCKETS));
+  if (FirstBucket < 0)
+    FirstBucket += NUM_PP_BUCKETS;
+
+  // Wrap an overflowing end bucket around to the bottom of the range.
+  LastBucket = static_cast<int>(std::floor((Center + Spread) * NUM_PP_BUCKETS));
+  if (LastBucket >= NUM_PP_BUCKETS)
+    LastBucket -= NUM_PP_BUCKETS;
+  if (debug) tprintf("Circular fill from %d to %d", FirstBucket, LastBucket);
+  for (i = FirstBucket; true; CircularIncrement (i, NUM_PP_BUCKETS)) {
+    SET_BIT (ParamTable[i], Bit);
+
+    /* exit loop after we have set the bit for the last bucket */
+    if (i == LastBucket)
+      break;
+  }
+
+} /* FillPPCircularBits */
+
+/**
+ * This routine sets Bit in each bit vector whose
+ * bucket lies within the range Center +- Spread. The fill
+ * is done for a linear dimension, i.e. there is no wrap-around
+ * for this dimension. It is assumed that Center and Spread
+ * are expressed in a linear coordinate system whose range
+ * is approximately 0 to 1. Values outside this range will
+ * be clipped.
+ * @param ParamTable table of bit vectors, one per param bucket
+ * @param Bit bit number being filled
+ * @param Center center of filled area
+ * @param Spread spread of filled area
+ * @param debug debug flag
+ */
+void FillPPLinearBits(uint32_t ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR],
+                      int Bit, float Center, float Spread, bool debug) {
+  int i, FirstBucket, LastBucket;
+
+  // Unlike the circular fill, out-of-range buckets are clipped to the
+  // table edges rather than wrapped.
+  FirstBucket = static_cast<int>(std::floor((Center - Spread) * NUM_PP_BUCKETS));
+  if (FirstBucket < 0)
+    FirstBucket = 0;
+
+  LastBucket = static_cast<int>(std::floor((Center + Spread) * NUM_PP_BUCKETS));
+  if (LastBucket >= NUM_PP_BUCKETS)
+    LastBucket = NUM_PP_BUCKETS - 1;
+
+  if (debug) tprintf("Linear fill from %d to %d", FirstBucket, LastBucket);
+  for (i = FirstBucket; i <= LastBucket; i++)
+    SET_BIT (ParamTable[i], Bit);
+
+} /* FillPPLinearBits */
+
+
+/*---------------------------------------------------------------------------*/
+#ifndef GRAPHICS_DISABLED
+/**
+ * This routine prompts the user with Prompt and waits
+ * for the user to enter something in the debug window.
+ * @param Prompt prompt to print while waiting for input from window
+ * @param adaptive_on set true if adapted templates should be debugged
+ * @param pretrained_on set true if static (pretrained) templates should be debugged
+ * @param shape_id set to the shape index to debug, or -1 if none
+ * @return Character entered in the debug window.
+ * @note Globals: none
+ */
+CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool* adaptive_on,
+                                   bool* pretrained_on, int* shape_id) {
+  tprintf("%s\n", Prompt);
+  SVEvent* ev;
+  SVEventType ev_type;
+  int unichar_id = INVALID_UNICHAR_ID;
+  // Wait until a click or popup event.
+  do {
+    ev = IntMatchWindow->AwaitEvent(SVET_ANY);
+    ev_type = ev->type;
+    if (ev_type == SVET_POPUP) {
+      if (ev->command_id == IDA_SHAPE_INDEX) {
+        // Debug-by-shape-index: look the shape up in the shape table and
+        // return its first unichar.
+        if (shape_table_ != nullptr) {
+          *shape_id = atoi(ev->parameter);
+          *adaptive_on = false;
+          *pretrained_on = true;
+          if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
+            int font_id;
+            shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
+                                                 &font_id);
+            tprintf("Shape %d, first unichar=%d, font=%d\n",
+                    *shape_id, unichar_id, font_id);
+            return unichar_id;
+          }
+          tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
+        } else {
+          tprintf("No shape table loaded!\n");
+        }
+      } else {
+        // Debug-by-character: the popup parameter is a unichar string.
+        if (unicharset.contains_unichar(ev->parameter)) {
+          unichar_id = unicharset.unichar_to_id(ev->parameter);
+          // Map the menu choice onto the adaptive/pretrained flags;
+          // IDA_BOTH (the remaining case) enables both.
+          if (ev->command_id == IDA_ADAPTIVE) {
+            *adaptive_on = true;
+            *pretrained_on = false;
+            *shape_id = -1;
+          } else if (ev->command_id == IDA_STATIC) {
+            *adaptive_on = false;
+            *pretrained_on = true;
+          } else {
+            *adaptive_on = true;
+            *pretrained_on = true;
+          }
+          if (ev->command_id == IDA_ADAPTIVE || shape_table_ == nullptr) {
+            *shape_id = -1;
+            return unichar_id;
+          }
+          // List every shape containing the chosen unichar so the user can
+          // pick a shape index next time.
+          for (int s = 0; s < shape_table_->NumShapes(); ++s) {
+            if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
+              tprintf("%s\n", shape_table_->DebugStr(s).c_str());
+            }
+          }
+        } else {
+          tprintf("Char class '%s' not found in unicharset",
+                  ev->parameter);
+        }
+      }
+    }
+    delete ev;
+  } while (ev_type != SVET_CLICK);
+  return 0;
+} /* GetClassToDebug */
+
+#endif
+
+/**
+ * This routine copies the appropriate global pad variables
+ * into EndPad, SidePad, and AnglePad. This is a kludge used
+ * to get around the fact that global control variables cannot
+ * be arrays. If the specified level is illegal, the tightest
+ * possible pads are returned.
+ * @param Level "tightness" level to return pads for
+ * @param EndPad place to put end pad for Level
+ * @param SidePad place to put side pad for Level
+ * @param AnglePad place to put angle pad for Level
+ */
+void GetCPPadsForLevel(int Level,
+                       float *EndPad,
+                       float *SidePad,
+                       float *AnglePad) {
+  switch (Level) {
+    case 0:
+      *EndPad = classify_cp_end_pad_loose * GetPicoFeatureLength ();
+      *SidePad = classify_cp_side_pad_loose * GetPicoFeatureLength ();
+      *AnglePad = classify_cp_angle_pad_loose / 360.0;
+      break;
+
+    case 1:
+      *EndPad = classify_cp_end_pad_medium * GetPicoFeatureLength ();
+      *SidePad = classify_cp_side_pad_medium * GetPicoFeatureLength ();
+      *AnglePad = classify_cp_angle_pad_medium / 360.0;
+      break;
+
+    case 2:
+      *EndPad = classify_cp_end_pad_tight * GetPicoFeatureLength ();
+      *SidePad = classify_cp_side_pad_tight * GetPicoFeatureLength ();
+      *AnglePad = classify_cp_angle_pad_tight / 360.0;
+      break;
+
+    // Any other level falls back to the tightest pads (same as case 2).
+    default:
+      *EndPad = classify_cp_end_pad_tight * GetPicoFeatureLength ();
+      *SidePad = classify_cp_side_pad_tight * GetPicoFeatureLength ();
+      *AnglePad = classify_cp_angle_pad_tight / 360.0;
+      break;
+  }
+  // Angle pad is in revolutions (degrees / 360); cap at half a revolution.
+  if (*AnglePad > 0.5)
+    *AnglePad = 0.5;
+
+} /* GetCPPadsForLevel */
+
+/**
+ * @param Evidence evidence value to return color for; must be in [0.0, 1.0]
+ * @return Color which corresponds to specified Evidence value.
+ * @note Globals: none
+ */
+ScrollView::Color GetMatchColorFor(float Evidence) {
+  assert (Evidence >= 0.0);
+  assert (Evidence <= 1.0);
+
+  // Stronger evidence maps to "hotter" colors: white >= 0.9, green >= 0.75,
+  // red >= 0.5, blue below that.
+  if (Evidence >= 0.90)
+    return ScrollView::WHITE;
+  else if (Evidence >= 0.75)
+    return ScrollView::GREEN;
+  else if (Evidence >= 0.50)
+    return ScrollView::RED;
+  else
+    return ScrollView::BLUE;
+} /* GetMatchColorFor */
+
+/**
+ * This routine returns (in Fill) the specification of
+ * the next line to be filled from Filler. FillerDone() should
+ * always be called before GetNextFill() to ensure that we
+ * do not run past the end of the fill table.
+ * @param Filler filler to get next fill spec from
+ * @param Fill place to put spec for next fill
+ */
+void GetNextFill(TABLE_FILLER *Filler, FILL_SPEC *Fill) {
+  FILL_SWITCH *Next;
+
+  /* compute the fill assuming no switches will be encountered */
+  Fill->AngleStart = Filler->AngleStart;
+  Fill->AngleEnd = Filler->AngleEnd;
+  Fill->X = Filler->X;
+  // Filler Y values are kept in 8.8 fixed point (see the NB * 256 scaling
+  // in InitTableFiller); shift down to get the integer bucket index.
+  Fill->YStart = Filler->YStart >> 8;
+  Fill->YEnd = Filler->YEnd >> 8;
+
+  /* update the fill info and the filler for ALL switches at this X value */
+  Next = &(Filler->Switch[Filler->NextSwitch]);
+  while (Filler->X >= Next->X) {
+    Fill->X = Filler->X = Next->X;
+    if (Next->Type == StartSwitch) {
+      Fill->YStart = Next->Y;
+      Filler->StartDelta = Next->Delta;
+      Filler->YStart = Next->YInit;
+    }
+    else if (Next->Type == EndSwitch) {
+      Fill->YEnd = Next->Y;
+      Filler->EndDelta = Next->Delta;
+      Filler->YEnd = Next->YInit;
+    }
+    else { /* Type must be LastSwitch */
+      break;
+    }
+    Filler->NextSwitch++;
+    Next = &(Filler->Switch[Filler->NextSwitch]);
+  }
+
+  /* prepare the filler for the next call to this routine */
+  // Advance one X bucket and track the slanted edges via the fixed-point
+  // per-X deltas.
+  Filler->X++;
+  Filler->YStart += Filler->StartDelta;
+  Filler->YEnd += Filler->EndDelta;
+
+} /* GetNextFill */
+
+/**
+ * This routine computes a data structure (Filler)
+ * which can be used to fill in a rectangle surrounding
+ * the specified Proto. Results are returned in Filler.
+ *
+ * @param EndPad, SidePad, AnglePad padding to add to proto
+ * @param Proto proto to create a filler for
+ * @param Filler place to put table filler
+ */
+void InitTableFiller (float EndPad, float SidePad,
+                      float AnglePad, PROTO Proto, TABLE_FILLER * Filler)
+#define XS X_SHIFT
+#define YS Y_SHIFT
+#define AS ANGLE_SHIFT
+#define NB NUM_CP_BUCKETS
+{
+  float Angle;
+  float X, Y, HalfLength;
+  float Cos, Sin;
+  float XAdjust, YAdjust;
+  FPOINT Start, Switch1, Switch2, End;
+  int S1 = 0;
+  int S2 = 1;
+
+  Angle = Proto->Angle;
+  X = Proto->X;
+  Y = Proto->Y;
+  HalfLength = Proto->Length / 2.0;
+
+  Filler->AngleStart = CircBucketFor(Angle - AnglePad, AS, NB);
+  Filler->AngleEnd = CircBucketFor(Angle + AnglePad, AS, NB);
+  Filler->NextSwitch = 0;
+
+  if (fabs (Angle - 0.0) < HV_TOLERANCE || fabs (Angle - 0.5) < HV_TOLERANCE) {
+    /* horizontal proto - handle as special case */
+    // Axis-aligned: the fill region is a plain rectangle, so no Start/End
+    // switches are needed - just a single LastSwitch terminator.
+    Filler->X = Bucket8For(X - HalfLength - EndPad, XS, NB);
+    Filler->YStart = Bucket16For(Y - SidePad, YS, NB * 256);
+    Filler->YEnd = Bucket16For(Y + SidePad, YS, NB * 256);
+    Filler->StartDelta = 0;
+    Filler->EndDelta = 0;
+    Filler->Switch[0].Type = LastSwitch;
+    Filler->Switch[0].X = Bucket8For(X + HalfLength + EndPad, XS, NB);
+  } else if (fabs(Angle - 0.25) < HV_TOLERANCE ||
+             fabs(Angle - 0.75) < HV_TOLERANCE) {
+    /* vertical proto - handle as special case */
+    Filler->X = Bucket8For(X - SidePad, XS, NB);
+    Filler->YStart = Bucket16For(Y - HalfLength - EndPad, YS, NB * 256);
+    Filler->YEnd = Bucket16For(Y + HalfLength + EndPad, YS, NB * 256);
+    Filler->StartDelta = 0;
+    Filler->EndDelta = 0;
+    Filler->Switch[0].Type = LastSwitch;
+    Filler->Switch[0].X = Bucket8For(X + SidePad, XS, NB);
+  } else {
+    /* diagonal proto */
+
+    if ((Angle > 0.0 && Angle < 0.25) || (Angle > 0.5 && Angle < 0.75)) {
+      /* rising diagonal proto */
+      // Angle is in revolutions (0..1); convert to radians.
+      Angle *= 2.0 * M_PI;
+      Cos = fabs(cos(Angle));
+      Sin = fabs(sin(Angle));
+
+      /* compute the positions of the corners of the acceptance region */
+      // Start/End are opposite corners; Switch1/Switch2 are the other two,
+      // mirrored through the proto center (X, Y).
+      Start.x = X - (HalfLength + EndPad) * Cos - SidePad * Sin;
+      Start.y = Y - (HalfLength + EndPad) * Sin + SidePad * Cos;
+      End.x = 2.0 * X - Start.x;
+      End.y = 2.0 * Y - Start.y;
+      Switch1.x = X - (HalfLength + EndPad) * Cos + SidePad * Sin;
+      Switch1.y = Y - (HalfLength + EndPad) * Sin - SidePad * Cos;
+      Switch2.x = 2.0 * X - Switch1.x;
+      Switch2.y = 2.0 * Y - Switch1.y;
+
+      // Order the two switch entries by increasing x.
+      if (Switch1.x > Switch2.x) {
+        S1 = 1;
+        S2 = 0;
+      }
+
+      /* translate into bucket positions and deltas */
+      // Deltas are per-X-bucket slopes in 8.8 fixed point (* 256).
+      // NOTE(review): unlike the falling-diagonal branch below, these deltas
+      // are not guarded with ClipToRange/IntCastRounded, so a near-axis
+      // angle could overflow int16_t here - confirm whether clipping is
+      // needed for this branch too.
+      Filler->X = Bucket8For(Start.x, XS, NB);
+      Filler->StartDelta = -static_cast<int16_t>((Cos / Sin) * 256);
+      Filler->EndDelta = static_cast<int16_t>((Sin / Cos) * 256);
+
+      XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x;
+      YAdjust = XAdjust * Cos / Sin;
+      Filler->YStart = Bucket16For(Start.y - YAdjust, YS, NB * 256);
+      YAdjust = XAdjust * Sin / Cos;
+      Filler->YEnd = Bucket16For(Start.y + YAdjust, YS, NB * 256);
+
+      Filler->Switch[S1].Type = StartSwitch;
+      Filler->Switch[S1].X = Bucket8For(Switch1.x, XS, NB);
+      Filler->Switch[S1].Y = Bucket8For(Switch1.y, YS, NB);
+      XAdjust = Switch1.x - BucketStart(Filler->Switch[S1].X, XS, NB);
+      YAdjust = XAdjust * Sin / Cos;
+      Filler->Switch[S1].YInit = Bucket16For(Switch1.y - YAdjust, YS, NB * 256);
+      Filler->Switch[S1].Delta = Filler->EndDelta;
+
+      Filler->Switch[S2].Type = EndSwitch;
+      Filler->Switch[S2].X = Bucket8For(Switch2.x, XS, NB);
+      Filler->Switch[S2].Y = Bucket8For(Switch2.y, YS, NB);
+      XAdjust = Switch2.x - BucketStart(Filler->Switch[S2].X, XS, NB);
+      YAdjust = XAdjust * Cos / Sin;
+      Filler->Switch[S2].YInit = Bucket16For(Switch2.y + YAdjust, YS, NB * 256);
+      Filler->Switch[S2].Delta = Filler->StartDelta;
+
+      Filler->Switch[2].Type = LastSwitch;
+      Filler->Switch[2].X = Bucket8For(End.x, XS, NB);
+    } else {
+      /* falling diagonal proto */
+      Angle *= 2.0 * M_PI;
+      Cos = fabs(cos(Angle));
+      Sin = fabs(sin(Angle));
+
+      /* compute the positions of the corners of the acceptance region */
+      Start.x = X - (HalfLength + EndPad) * Cos - SidePad * Sin;
+      Start.y = Y + (HalfLength + EndPad) * Sin - SidePad * Cos;
+      End.x = 2.0 * X - Start.x;
+      End.y = 2.0 * Y - Start.y;
+      Switch1.x = X - (HalfLength + EndPad) * Cos + SidePad * Sin;
+      Switch1.y = Y + (HalfLength + EndPad) * Sin + SidePad * Cos;
+      Switch2.x = 2.0 * X - Switch1.x;
+      Switch2.y = 2.0 * Y - Switch1.y;
+
+      // Order the two switch entries by increasing x.
+      if (Switch1.x > Switch2.x) {
+        S1 = 1;
+        S2 = 0;
+      }
+
+      /* translate into bucket positions and deltas */
+      // Deltas are per-X-bucket slopes in 8.8 fixed point, clipped so the
+      // rounded value always fits in int16_t.
+      Filler->X = Bucket8For(Start.x, XS, NB);
+      Filler->StartDelta = static_cast<int16_t>(ClipToRange<int>(
+          -IntCastRounded((Sin / Cos) * 256), INT16_MIN, INT16_MAX));
+      Filler->EndDelta = static_cast<int16_t>(ClipToRange<int>(
+          IntCastRounded((Cos / Sin) * 256), INT16_MIN, INT16_MAX));
+
+      XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x;
+      YAdjust = XAdjust * Sin / Cos;
+      Filler->YStart = Bucket16For(Start.y - YAdjust, YS, NB * 256);
+      YAdjust = XAdjust * Cos / Sin;
+      Filler->YEnd = Bucket16For(Start.y + YAdjust, YS, NB * 256);
+
+      Filler->Switch[S1].Type = EndSwitch;
+      Filler->Switch[S1].X = Bucket8For(Switch1.x, XS, NB);
+      Filler->Switch[S1].Y = Bucket8For(Switch1.y, YS, NB);
+      XAdjust = Switch1.x - BucketStart(Filler->Switch[S1].X, XS, NB);
+      YAdjust = XAdjust * Sin / Cos;
+      Filler->Switch[S1].YInit = Bucket16For(Switch1.y + YAdjust, YS, NB * 256);
+      Filler->Switch[S1].Delta = Filler->StartDelta;
+
+      Filler->Switch[S2].Type = StartSwitch;
+      Filler->Switch[S2].X = Bucket8For(Switch2.x, XS, NB);
+      Filler->Switch[S2].Y = Bucket8For(Switch2.y, YS, NB);
+      XAdjust = Switch2.x - BucketStart(Filler->Switch[S2].X, XS, NB);
+      YAdjust = XAdjust * Cos / Sin;
+      Filler->Switch[S2].YInit = Bucket16For(Switch2.y - YAdjust, YS, NB * 256);
+      Filler->Switch[S2].Delta = Filler->EndDelta;
+
+      Filler->Switch[2].Type = LastSwitch;
+      Filler->Switch[2].X = Bucket8For(End.x, XS, NB);
+    }
+  }
+} /* InitTableFiller */
+
+
+/*---------------------------------------------------------------------------*/
+#ifndef GRAPHICS_DISABLED
+/**
+ * This routine renders the specified feature into ShapeList.
+ * @param window to add feature rendering to
+ * @param Feature feature to be rendered
+ * @param color color to use for feature rendering
+ * @return New shape list with rendering of Feature added.
+ * @note Globals: none
+ */
+void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT* Feature,
+                      ScrollView::Color color) {
+  float X, Y, Dx, Dy, Length;
+
+  window->Pen(color);
+  assert(Feature != nullptr);
+  assert(color != 0);
+
+  X = Feature->X;
+  Y = Feature->Y;
+  // Draw the feature as a short directed segment of fixed display length.
+  Length = GetPicoFeatureLength() * 0.7 * INT_CHAR_NORM_RANGE;
+  // The -PI has no significant effect here, but the value of Theta is computed
+  // using BinaryAnglePlusPi in intfx.cpp.
+  // Theta is an 8-bit binary angle (0..255 covers a full revolution).
+  Dx = (Length / 2.0) * cos((Feature->Theta / 256.0) * 2.0 * M_PI - M_PI);
+  Dy = (Length / 2.0) * sin((Feature->Theta / 256.0) * 2.0 * M_PI - M_PI);
+
+  window->SetCursor(X, Y);
+  window->DrawTo(X + Dx, Y + Dy);
+} /* RenderIntFeature */
+
+/**
+ * This routine extracts the parameters of the specified
+ * proto from the class description and adds a rendering of
+ * the proto onto the ShapeList.
+ *
+ * @param window ScrollView instance
+ * @param Class class that proto is contained in
+ * @param ProtoId id of proto to be rendered
+ * @param color color to render proto in
+ *
+ * Globals: none
+ *
+ * @return New shape list with a rendering of one proto added.
+ */
+void RenderIntProto(ScrollView *window,
+                    INT_CLASS Class,
+                    PROTO_ID ProtoId,
+                    ScrollView::Color color) {
+  PROTO_SET ProtoSet;
+  INT_PROTO Proto;
+  int ProtoSetIndex;
+  int ProtoWordIndex;
+  float Length;
+  int Xmin, Xmax, Ymin, Ymax;
+  float X, Y, Dx, Dy;
+  uint32_t ProtoMask;
+  int Bucket;
+
+  assert(ProtoId >= 0);
+  assert(Class != nullptr);
+  assert(ProtoId < Class->NumProtos);
+  assert(color != 0);
+  window->Pen(color);
+
+  // Locate the proto within its proto set.
+  ProtoSet = Class->ProtoSets[SetForProto(ProtoId)];
+  ProtoSetIndex = IndexForProto(ProtoId);
+  Proto = &(ProtoSet->Protos[ProtoSetIndex]);
+  Length = (Class->ProtoLengths[ProtoId] *
+            GetPicoFeatureLength() * INT_CHAR_NORM_RANGE);
+  ProtoMask = PPrunerMaskFor(ProtoId);
+  ProtoWordIndex = PPrunerWordIndexFor(ProtoId);
+
+  // find the x and y extent of the proto from the proto pruning table
+  Xmin = Ymin = NUM_PP_BUCKETS;
+  Xmax = Ymax = 0;
+  for (Bucket = 0; Bucket < NUM_PP_BUCKETS; Bucket++) {
+    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex]) {
+      UpdateRange(Bucket, &Xmin, &Xmax);
+    }
+
+    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex]) {
+      UpdateRange(Bucket, &Ymin, &Ymax);
+    }
+  }
+  // Center the rendered segment on the midpoint of the pruner extent.
+  X = (Xmin + Xmax + 1) / 2.0 * PROTO_PRUNER_SCALE;
+  Y = (Ymin + Ymax + 1) / 2.0 * PROTO_PRUNER_SCALE;
+  // The -PI has no significant effect here, but the value of Theta is computed
+  // using BinaryAnglePlusPi in intfx.cpp.
+  // Angle is an 8-bit binary angle (0..255 covers a full revolution).
+  Dx = (Length / 2.0) * cos((Proto->Angle / 256.0) * 2.0 * M_PI - M_PI);
+  Dy = (Length / 2.0) * sin((Proto->Angle / 256.0) * 2.0 * M_PI - M_PI);
+
+  window->SetCursor(X - Dx, Y - Dy);
+  window->DrawTo(X + Dx, Y + Dy);
+} /* RenderIntProto */
+#endif
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * Initializes the int matcher window if it is not already
+ * initialized.
+ */
+void InitIntMatchWindowIfReqd() {
+  if (IntMatchWindow == nullptr) {
+    IntMatchWindow = CreateFeatureSpaceWindow("IntMatchWindow", 50, 200);
+    // Attach the debug popup menu; the command ids are dispatched in
+    // GetClassToDebug().
+    auto* popup_menu = new SVMenuNode();
+
+    popup_menu->AddChild("Debug Adapted classes", IDA_ADAPTIVE,
+                         "x", "Class to debug");
+    popup_menu->AddChild("Debug Static classes", IDA_STATIC,
+                         "x", "Class to debug");
+    popup_menu->AddChild("Debug Both", IDA_BOTH,
+                         "x", "Class to debug");
+    popup_menu->AddChild("Debug Shape Index", IDA_SHAPE_INDEX,
+                         "0", "Index to debug");
+    popup_menu->BuildMenu(IntMatchWindow, false);
+  }
+}
+
+/**
+ * Initializes the proto display window if it is not already
+ * initialized.
+ */
+void InitProtoDisplayWindowIfReqd() {
+  if (ProtoDisplayWindow == nullptr) {
+    // Lazily created; positioned to the right of the int match window.
+    ProtoDisplayWindow = CreateFeatureSpaceWindow("ProtoDisplayWindow",
+                                                  550, 200);
+  }
+}
+
+/**
+ * Initializes the feature display window if it is not already
+ * initialized.
+ */
+void InitFeatureDisplayWindowIfReqd() {
+  if (FeatureDisplayWindow == nullptr) {
+    // Lazily created; positioned below the int match window.
+    FeatureDisplayWindow = CreateFeatureSpaceWindow("FeatureDisplayWindow",
+                                                    50, 700);
+  }
+}
+
+/// Creates a window of the appropriate size for displaying elements
+/// in feature space.
+ScrollView* CreateFeatureSpaceWindow(const char* name, int xpos, int ypos) {
+  // 520x520 window with the origin at (260, 260), y-axis pointing up.
+  return new ScrollView(name, xpos, ypos, 520, 520, 260, 260, true);
+}
+#endif // !GRAPHICS_DISABLED
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/intproto.h b/tesseract/src/classify/intproto.h
new file mode 100644
index 00000000..77bf2376
--- /dev/null
+++ b/tesseract/src/classify/intproto.h
@@ -0,0 +1,265 @@
+/******************************************************************************
+ ** Filename: intproto.h
+ ** Purpose: Definition of data structures for integer protos.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#ifndef INTPROTO_H
+#define INTPROTO_H
+
+/**----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------**/
+#include "matchdefs.h"
+#include "mfoutline.h"
+#include "protos.h"
+#include "scrollview.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+class FCOORD;
+
+/* define order of params in pruners */
+#define PRUNER_X 0
+#define PRUNER_Y 1
+#define PRUNER_ANGLE 2
+
+/* definition of coordinate system offsets for each table parameter */
+#define ANGLE_SHIFT (0.0)
+#define X_SHIFT (0.5)
+#define Y_SHIFT (0.5)
+
+#define MAX_PROTO_INDEX 24
+#define BITS_PER_WERD static_cast<int>(8 * sizeof(uint32_t))
+/* Script detection: increase this number to 128 */
+#define MAX_NUM_CONFIGS 64
+#define MAX_NUM_PROTOS 512
+#define PROTOS_PER_PROTO_SET 64
+#define MAX_NUM_PROTO_SETS (MAX_NUM_PROTOS / PROTOS_PER_PROTO_SET)
+#define NUM_PP_PARAMS 3
+#define NUM_PP_BUCKETS 64
+#define NUM_CP_BUCKETS 24
+#define CLASSES_PER_CP 32
+#define NUM_BITS_PER_CLASS 2
+#define CLASS_PRUNER_CLASS_MASK (~(~0u << NUM_BITS_PER_CLASS))
+#define CLASSES_PER_CP_WERD (CLASSES_PER_CP / NUM_BITS_PER_CLASS)
+#define PROTOS_PER_PP_WERD BITS_PER_WERD
+#define BITS_PER_CP_VECTOR (CLASSES_PER_CP * NUM_BITS_PER_CLASS)
+#define MAX_NUM_CLASS_PRUNERS \
+ ((MAX_NUM_CLASSES + CLASSES_PER_CP - 1) / CLASSES_PER_CP)
+#define WERDS_PER_CP_VECTOR (BITS_PER_CP_VECTOR / BITS_PER_WERD)
+#define WERDS_PER_PP_VECTOR \
+ ((PROTOS_PER_PROTO_SET + BITS_PER_WERD - 1) / BITS_PER_WERD)
+#define WERDS_PER_PP (NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR)
+#define WERDS_PER_CP \
+ (NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR)
+#define WERDS_PER_CONFIG_VEC \
+ ((MAX_NUM_CONFIGS + BITS_PER_WERD - 1) / BITS_PER_WERD)
+
/* The first 3 dimensions of the CLASS_PRUNER_STRUCT are the
 * 3 axes of the quantized feature space.
 * The position of the bits recorded for each class in the
 * 4th dimension is determined by using CPrunerWordIndexFor(c),
 * where c is the corresponding class id. */
struct CLASS_PRUNER_STRUCT {
  // Packed per-class bit fields, NUM_BITS_PER_CLASS bits each; see the
  // CPruner* macros below for the word/bit layout.
  uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS]
            [WERDS_PER_CP_VECTOR];
};
+
+typedef struct {
+ int8_t A;
+ uint8_t B;
+ int8_t C;
+ uint8_t Angle;
+ uint32_t Configs[WERDS_PER_CONFIG_VEC];
+}
+
+INT_PROTO_STRUCT,
+ *INT_PROTO;
+
+typedef uint32_t PROTO_PRUNER[NUM_PP_PARAMS][NUM_PP_BUCKETS]
+ [WERDS_PER_PP_VECTOR];
+
+typedef struct {
+ PROTO_PRUNER ProtoPruner;
+ INT_PROTO_STRUCT Protos[PROTOS_PER_PROTO_SET];
+}
+
+PROTO_SET_STRUCT,
+ *PROTO_SET;
+
+typedef uint32_t CONFIG_PRUNER[NUM_PP_PARAMS][NUM_PP_BUCKETS][4];
+
+typedef struct {
+ uint16_t NumProtos;
+ uint8_t NumProtoSets;
+ uint8_t NumConfigs;
+ PROTO_SET ProtoSets[MAX_NUM_PROTO_SETS];
+ uint8_t* ProtoLengths;
+ uint16_t ConfigLengths[MAX_NUM_CONFIGS];
+ int font_set_id; // FontSet id, see above
+}
+
+INT_CLASS_STRUCT,
+ *INT_CLASS;
+
+typedef struct {
+ int NumClasses;
+ int NumClassPruners;
+ INT_CLASS Class[MAX_NUM_CLASSES];
+ CLASS_PRUNER_STRUCT* ClassPruners[MAX_NUM_CLASS_PRUNERS];
+}
+
+INT_TEMPLATES_STRUCT,
+ *INT_TEMPLATES;
+
+/* definitions of integer features*/
+#define MAX_NUM_INT_FEATURES 512
+#define INT_CHAR_NORM_RANGE 256
+
struct INT_FEATURE_STRUCT {
  INT_FEATURE_STRUCT() : X(0), Y(0), Theta(0), CP_misses(0) {}
  // Builds a feature from an FCOORD for position with all the necessary
  // clipping and rounding.
  INT_FEATURE_STRUCT(const FCOORD& pos, uint8_t theta);
  // Builds a feature from ints with all the necessary clipping and casting.
  INT_FEATURE_STRUCT(int x, int y, int theta);

  uint8_t X;         // quantized x position
  uint8_t Y;         // quantized y position
  uint8_t Theta;     // quantized direction
  int8_t CP_misses;  // NOTE(review): presumably a class-pruner miss count -
                     // confirm against intmatcher.cpp.

  // Dumps the feature as "(X,Y):Theta" to the debug output.
  void print() const {
    tprintf("(%d,%d):%d\n", X, Y, Theta);
  }
};

// Non-owning pointer to a single feature.
using INT_FEATURE = INT_FEATURE_STRUCT*;

// Fixed-capacity buffer for the features of one blob.
typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];

// Targets of the int-matcher debug popup menu (see intproto.cpp).
enum IntmatcherDebugAction {
  IDA_ADAPTIVE,
  IDA_STATIC,
  IDA_SHAPE_INDEX,
  IDA_BOTH
};
+
+/**----------------------------------------------------------------------------
+ Macros
+----------------------------------------------------------------------------**/
+
+#define MaxNumIntProtosIn(C) (C->NumProtoSets * PROTOS_PER_PROTO_SET)
+#define SetForProto(P) (P / PROTOS_PER_PROTO_SET)
+#define IndexForProto(P) (P % PROTOS_PER_PROTO_SET)
+#define ProtoForProtoId(C, P) \
+ (&((C->ProtoSets[SetForProto(P)])->Protos[IndexForProto(P)]))
+#define PPrunerWordIndexFor(I) \
+ (((I) % PROTOS_PER_PROTO_SET) / PROTOS_PER_PP_WERD)
+#define PPrunerBitIndexFor(I) ((I) % PROTOS_PER_PP_WERD)
+#define PPrunerMaskFor(I) (1 << PPrunerBitIndexFor(I))
+
+#define MaxNumClassesIn(T) (T->NumClassPruners * CLASSES_PER_CP)
+#define LegalClassId(c) ((c) >= 0 && (c) < MAX_NUM_CLASSES)
+#define UnusedClassIdIn(T, c) ((T)->Class[c] == nullptr)
+#define ClassForClassId(T, c) ((T)->Class[c])
+#define ClassPrunersFor(T) ((T)->ClassPruner)
+#define CPrunerIdFor(c) ((c) / CLASSES_PER_CP)
+#define CPrunerFor(T, c) ((T)->ClassPruners[CPrunerIdFor(c)])
+#define CPrunerWordIndexFor(c) (((c) % CLASSES_PER_CP) / CLASSES_PER_CP_WERD)
+#define CPrunerBitIndexFor(c) (((c) % CLASSES_PER_CP) % CLASSES_PER_CP_WERD)
+#define CPrunerMaskFor(L, c) \
+ (((L) + 1) << CPrunerBitIndexFor(c) * NUM_BITS_PER_CLASS)
+
+/* DEBUG macros*/
+#define PRINT_MATCH_SUMMARY 0x001
+#define DISPLAY_FEATURE_MATCHES 0x002
+#define DISPLAY_PROTO_MATCHES 0x004
+#define PRINT_FEATURE_MATCHES 0x008
+#define PRINT_PROTO_MATCHES 0x010
+#define CLIP_MATCH_EVIDENCE 0x020
+
+#define MatchDebuggingOn(D) (D)
+#define PrintMatchSummaryOn(D) ((D)&PRINT_MATCH_SUMMARY)
+#define DisplayFeatureMatchesOn(D) ((D)&DISPLAY_FEATURE_MATCHES)
+#define DisplayProtoMatchesOn(D) ((D)&DISPLAY_PROTO_MATCHES)
+#define PrintFeatureMatchesOn(D) ((D)&PRINT_FEATURE_MATCHES)
+#define PrintProtoMatchesOn(D) ((D)&PRINT_PROTO_MATCHES)
+#define ClipMatchEvidenceOn(D) ((D)&CLIP_MATCH_EVIDENCE)
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class);
+
+int AddIntConfig(INT_CLASS Class);
+
+int AddIntProto(INT_CLASS Class);
+
+void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId,
+ INT_TEMPLATES Templates);
+
+void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class,
+ bool debug);
+
+uint8_t Bucket8For(float param, float offset, int num_buckets);
+uint16_t Bucket16For(float param, float offset, int num_buckets);
+
+uint8_t CircBucketFor(float param, float offset, int num_buckets);
+
+void UpdateMatchDisplay();
+
+void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class);
+
+void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, float Evidence);
+
+void DisplayIntProto(INT_CLASS Class, PROTO_ID ProtoId, float Evidence);
+
+INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs);
+
+INT_TEMPLATES NewIntTemplates();
+
+TESS_API
+void free_int_templates(INT_TEMPLATES templates);
+
+void ShowMatchDisplay();
+
+// Clears the given window and draws the featurespace guides for the
+// appropriate normalization method.
+TESS_API
+void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView* window);
+
+/*----------------------------------------------------------------------------*/
+#ifndef GRAPHICS_DISABLED
+TESS_API
+void RenderIntFeature(ScrollView* window, const INT_FEATURE_STRUCT* Feature,
+ ScrollView::Color color);
+
+void InitIntMatchWindowIfReqd();
+
+void InitProtoDisplayWindowIfReqd();
+
+void InitFeatureDisplayWindowIfReqd();
+
+// Creates a window of the appropriate size for displaying elements
+// in feature space.
+TESS_API
+ScrollView* CreateFeatureSpaceWindow(const char* name, int xpos, int ypos);
+#endif // !GRAPHICS_DISABLED
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/kdtree.cpp b/tesseract/src/classify/kdtree.cpp
new file mode 100644
index 00000000..d8ff700d
--- /dev/null
+++ b/tesseract/src/classify/kdtree.cpp
@@ -0,0 +1,541 @@
+/******************************************************************************
+ ** Filename: kdtree.cpp
+ ** Purpose: Routines for managing K-D search trees
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+/*-----------------------------------------------------------------------------
+ Include Files and Type Defines
+-----------------------------------------------------------------------------*/
#include "kdtree.h"

#include <algorithm>
#include <cfloat> // for FLT_MAX
#include <cmath>
#include <cstdio>
#include <vector> // for std::vector
+
+namespace tesseract {
+
+#define Magnitude(X) ((X) < 0 ? -(X) : (X))
+#define NodeFound(N,K,D) (((N)->Key == (K)) && ((N)->Data == (D)))
+
+/*-----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+-----------------------------------------------------------------------------*/
+#define MINSEARCH -FLT_MAX
+#define MAXSEARCH FLT_MAX
+
+// Helper function to find the next essential dimension in a cycle.
+static int NextLevel(KDTREE *tree, int level) {
+ do {
+ ++level;
+ if (level >= tree->KeySize)
+ level = 0;
+ } while (tree->KeyDesc[level].NonEssential);
+ return level;
+}
+
+//-----------------------------------------------------------------------------
+/** Store the k smallest-keyed key-value pairs. */
+template<typename Key, typename Value>
+class MinK {
+ public:
+ MinK(Key max_key, int k);
+ ~MinK();
+
+ struct Element {
+ Element() {}
+ Element(const Key& k, const Value& v) : key(k), value(v) {}
+
+ Key key;
+ Value value;
+ };
+
+ bool insert(Key k, Value v);
+ const Key& max_insertable_key();
+
+ int elements_count() { return elements_count_; }
+ const Element* elements() { return elements_; }
+
+ private:
+ const Key max_key_; ///< the maximum possible Key
+ Element *elements_; ///< unsorted array of elements
+ int elements_count_; ///< the number of results collected so far
+ int k_; ///< the number of results we want from the search
+ int max_index_; ///< the index of the result with the largest key
+};
+
+template<typename Key, typename Value>
+MinK<Key, Value>::MinK(Key max_key, int k) :
+ max_key_(max_key), elements_count_(0), k_(k < 1 ? 1 : k), max_index_(0) {
+ elements_ = new Element[k_];
+}
+
+template<typename Key, typename Value>
+MinK<Key, Value>::~MinK() {
+ delete []elements_;
+}
+
+template<typename Key, typename Value>
+const Key& MinK<Key, Value>::max_insertable_key() {
+ if (elements_count_ < k_)
+ return max_key_;
+ return elements_[max_index_].key;
+}
+
+template<typename Key, typename Value>
+bool MinK<Key, Value>::insert(Key key, Value value) {
+ if (elements_count_ < k_) {
+ elements_[elements_count_++] = Element(key, value);
+ if (key > elements_[max_index_].key)
+ max_index_ = elements_count_ - 1;
+ return true;
+ } else if (key < elements_[max_index_].key) {
+ // evict the largest element.
+ elements_[max_index_] = Element(key, value);
+ // recompute max_index_
+ for (int i = 0; i < elements_count_; i++) {
+ if (elements_[i].key > elements_[max_index_].key)
+ max_index_ = i;
+ }
+ return true;
+ }
+ return false;
+}
+
+
+//-----------------------------------------------------------------------------
+/** Helper class for searching for the k closest points to query_point in tree.
+ */
+class KDTreeSearch {
+ public:
+ KDTreeSearch(KDTREE* tree, float *query_point, int k_closest);
+ ~KDTreeSearch();
+
+ /** Return the k nearest points' data. */
+ void Search(int *result_count, float *distances, void **results);
+
+ private:
+ void SearchRec(int Level, KDNODE *SubTree);
+ bool BoxIntersectsSearch(float *lower, float *upper);
+
+ KDTREE *tree_;
+ float *query_point_;
+ float *sb_min_; ///< search box minimum
+ float *sb_max_; ///< search box maximum
+ MinK<float, void *> results_;
+};
+
KDTreeSearch::KDTreeSearch(KDTREE *tree, float *query_point, int k_closest)
    : tree_(tree), query_point_(query_point), results_(MAXSEARCH, k_closest) {
  // Search box bounds, one entry per tree dimension.
  sb_min_ = new float[tree->KeySize];
  sb_max_ = new float[tree->KeySize];
}

KDTreeSearch::~KDTreeSearch() {
  delete[] sb_min_;
  delete[] sb_max_;
}
+
+/// Locate the k_closest points to query_point_, and return their distances and
+/// data into the given buffers.
+void KDTreeSearch::Search(int *result_count,
+ float *distances,
+ void **results) {
+ if (tree_->Root.Left == nullptr) {
+ *result_count = 0;
+ } else {
+ for (int i = 0; i < tree_->KeySize; i++) {
+ sb_min_[i] = tree_->KeyDesc[i].Min;
+ sb_max_[i] = tree_->KeyDesc[i].Max;
+ }
+ SearchRec(0, tree_->Root.Left);
+ int count = results_.elements_count();
+ *result_count = count;
+ for (int j = 0; j < count; j++) {
+ // Pre-cast to float64 as key is a template type and we have no control
+ // over its actual type.
+ distances[j] = static_cast<float>(sqrt(static_cast<double>(results_.elements()[j].key)));
+ results[j] = results_.elements()[j].value;
+ }
+ }
+}
+
+/*-----------------------------------------------------------------------------
+ Public Code
+-----------------------------------------------------------------------------*/
+/// @return a new KDTREE based on the specified parameters.
+/// @param KeySize # of dimensions in the K-D tree
+/// @param KeyDesc array of params to describe key dimensions
+KDTREE *MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[]) {
+ auto *KDTree = static_cast<KDTREE *>(malloc(
+ sizeof(KDTREE) + (KeySize - 1) * sizeof(PARAM_DESC)));
+ for (int i = 0; i < KeySize; i++) {
+ KDTree->KeyDesc[i].NonEssential = KeyDesc[i].NonEssential;
+ KDTree->KeyDesc[i].Circular = KeyDesc[i].Circular;
+ if (KeyDesc[i].Circular) {
+ KDTree->KeyDesc[i].Min = KeyDesc[i].Min;
+ KDTree->KeyDesc[i].Max = KeyDesc[i].Max;
+ KDTree->KeyDesc[i].Range = KeyDesc[i].Max - KeyDesc[i].Min;
+ KDTree->KeyDesc[i].HalfRange = KDTree->KeyDesc[i].Range / 2;
+ KDTree->KeyDesc[i].MidRange = (KeyDesc[i].Max + KeyDesc[i].Min) / 2;
+ } else {
+ KDTree->KeyDesc[i].Min = MINSEARCH;
+ KDTree->KeyDesc[i].Max = MAXSEARCH;
+ }
+ }
+ KDTree->KeySize = KeySize;
+ KDTree->Root.Left = nullptr;
+ KDTree->Root.Right = nullptr;
+ return KDTree;
+}
+
+
+/**
+ * This routine stores Data in the K-D tree specified by Tree
+ * using Key as an access key.
+ *
+ * @param Tree K-D tree in which data is to be stored
+ * @param Key ptr to key by which data can be retrieved
+ * @param Data ptr to data to be stored in the tree
+ */
+void KDStore(KDTREE *Tree, float *Key, void *Data) {
+ int Level;
+ KDNODE *Node;
+ KDNODE **PtrToNode;
+
+ PtrToNode = &(Tree->Root.Left);
+ Node = *PtrToNode;
+ Level = NextLevel(Tree, -1);
+ while (Node != nullptr) {
+ if (Key[Level] < Node->BranchPoint) {
+ PtrToNode = &(Node->Left);
+ if (Key[Level] > Node->LeftBranch)
+ Node->LeftBranch = Key[Level];
+ }
+ else {
+ PtrToNode = &(Node->Right);
+ if (Key[Level] < Node->RightBranch)
+ Node->RightBranch = Key[Level];
+ }
+ Level = NextLevel(Tree, Level);
+ Node = *PtrToNode;
+ }
+
+ *PtrToNode = MakeKDNode(Tree, Key, Data, Level);
+} /* KDStore */
+
+/**
+ * This routine deletes a node from Tree. The node to be
+ * deleted is specified by the Key for the node and the Data
+ * contents of the node. These two pointers must be identical
+ * to the pointers that were used for the node when it was
+ * originally stored in the tree. A node will be deleted from
+ * the tree only if its key and data pointers are identical
+ * to Key and Data respectively. The tree is re-formed by removing
+ * the affected subtree and inserting all elements but the root.
+ *
+ * @param Tree K-D tree to delete node from
+ * @param Key key of node to be deleted
+ * @param Data data contents of node to be deleted
+ */
+void
+KDDelete (KDTREE * Tree, float Key[], void *Data) {
+ int Level;
+ KDNODE *Current;
+ KDNODE *Father;
+
+ /* initialize search at root of tree */
+ Father = &(Tree->Root);
+ Current = Father->Left;
+ Level = NextLevel(Tree, -1);
+
+ /* search tree for node to be deleted */
+ while ((Current != nullptr) && (!NodeFound (Current, Key, Data))) {
+ Father = Current;
+ if (Key[Level] < Current->BranchPoint)
+ Current = Current->Left;
+ else
+ Current = Current->Right;
+
+ Level = NextLevel(Tree, Level);
+ }
+
+ if (Current != nullptr) { /* if node to be deleted was found */
+ if (Current == Father->Left) {
+ Father->Left = nullptr;
+ Father->LeftBranch = Tree->KeyDesc[Level].Min;
+ } else {
+ Father->Right = nullptr;
+ Father->RightBranch = Tree->KeyDesc[Level].Max;
+ }
+
+ InsertNodes(Tree, Current->Left);
+ InsertNodes(Tree, Current->Right);
+ FreeSubTree(Current);
+ }
+} /* KDDelete */
+
+/**
+ * This routine searches the K-D tree specified by Tree and
+ * finds the QuerySize nearest neighbors of Query. All neighbors
+ * must be within MaxDistance of Query. The data contents of
+ * the nearest neighbors
+ * are placed in NBuffer and their distances from Query are
+ * placed in DBuffer.
+ * @param Tree ptr to K-D tree to be searched
+ * @param Query ptr to query key (point in D-space)
+ * @param QuerySize number of nearest neighbors to be found
+ * @param MaxDistance all neighbors must be within this distance
+ * @param NBuffer ptr to QuerySize buffer to hold nearest neighbors
+ * @param DBuffer ptr to QuerySize buffer to hold distances
+ * from nearest neighbor to query point
+ * @param NumberOfResults [out] Number of nearest neighbors actually found
+ */
+void KDNearestNeighborSearch(
+ KDTREE *Tree, float Query[], int QuerySize, float MaxDistance,
+ int *NumberOfResults, void **NBuffer, float DBuffer[]) {
+ KDTreeSearch search(Tree, Query, QuerySize);
+ search.Search(NumberOfResults, DBuffer, NBuffer);
+}
+
+
+/*---------------------------------------------------------------------------*/
+/** Walk a given Tree with action. */
+void KDWalk(KDTREE *Tree, void_proc action, void *context) {
+ if (Tree->Root.Left != nullptr)
+ Walk(Tree, action, context, Tree->Root.Left, NextLevel(Tree, -1));
+}
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine frees all memory which is allocated to the
+ * specified KD-tree. This includes the data structure for
+ * the kd-tree itself plus the data structures for each node
+ * in the tree. It does not include the Key and Data items
+ * which are pointed to by the nodes. This memory is left
+ * untouched.
+ * @param Tree tree data structure to be released
+ */
+void FreeKDTree(KDTREE *Tree) {
+ FreeSubTree(Tree->Root.Left);
+ free(Tree);
+} /* FreeKDTree */
+
+
+/*-----------------------------------------------------------------------------
+ Private Code
+-----------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine allocates memory for a new K-D tree node
+ * and places the specified Key and Data into it. The
+ * left and right subtree pointers for the node are
+ * initialized to empty subtrees.
+ * @param tree The tree to create the node for
+ * @param Key Access key for new node in KD tree
+ * @param Data ptr to data to be stored in new node
+ * @param Index index of Key to branch on
+ * @return pointer to new K-D tree node
+ */
+KDNODE *MakeKDNode(KDTREE *tree, float Key[], void *Data, int Index) {
+ KDNODE *NewNode;
+
+ NewNode = static_cast<KDNODE *>(malloc (sizeof (KDNODE)));
+
+ NewNode->Key = Key;
+ NewNode->Data = Data;
+ NewNode->BranchPoint = Key[Index];
+ NewNode->LeftBranch = tree->KeyDesc[Index].Min;
+ NewNode->RightBranch = tree->KeyDesc[Index].Max;
+ NewNode->Left = nullptr;
+ NewNode->Right = nullptr;
+
+ return NewNode;
+} /* MakeKDNode */
+
+
+/*---------------------------------------------------------------------------*/
+void FreeKDNode(KDNODE *Node) { free(Node); }
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Recursively accumulate the k_closest points to query_point_ into results_.
+ * @param Level level in tree of sub-tree to be searched
+ * @param SubTree sub-tree to be searched
+ */
+void KDTreeSearch::SearchRec(int level, KDNODE *sub_tree) {
+ if (level >= tree_->KeySize)
+ level = 0;
+
+ if (!BoxIntersectsSearch(sb_min_, sb_max_))
+ return;
+
+ results_.insert(DistanceSquared(tree_->KeySize, tree_->KeyDesc, query_point_,
+ sub_tree->Key),
+ sub_tree->Data);
+
+ if (query_point_[level] < sub_tree->BranchPoint) {
+ if (sub_tree->Left != nullptr) {
+ float tmp = sb_max_[level];
+ sb_max_[level] = sub_tree->LeftBranch;
+ SearchRec(NextLevel(tree_, level), sub_tree->Left);
+ sb_max_[level] = tmp;
+ }
+ if (sub_tree->Right != nullptr) {
+ float tmp = sb_min_[level];
+ sb_min_[level] = sub_tree->RightBranch;
+ SearchRec(NextLevel(tree_, level), sub_tree->Right);
+ sb_min_[level] = tmp;
+ }
+ } else {
+ if (sub_tree->Right != nullptr) {
+ float tmp = sb_min_[level];
+ sb_min_[level] = sub_tree->RightBranch;
+ SearchRec(NextLevel(tree_, level), sub_tree->Right);
+ sb_min_[level] = tmp;
+ }
+ if (sub_tree->Left != nullptr) {
+ float tmp = sb_max_[level];
+ sb_max_[level] = sub_tree->LeftBranch;
+ SearchRec(NextLevel(tree_, level), sub_tree->Left);
+ sb_max_[level] = tmp;
+ }
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ *Returns the Euclidean distance squared between p1 and p2 for all essential
+ * dimensions.
+ * @param k keys are in k-space
+ * @param dim dimension descriptions (essential, circular, etc)
+ * @param p1,p2 two different points in K-D space
+ */
+float DistanceSquared(int k, PARAM_DESC *dim, float p1[], float p2[]) {
+ float total_distance = 0;
+
+ for (; k > 0; k--, p1++, p2++, dim++) {
+ if (dim->NonEssential)
+ continue;
+
+ float dimension_distance = *p1 - *p2;
+
+ /* if this dimension is circular - check wraparound distance */
+ if (dim->Circular) {
+ dimension_distance = Magnitude(dimension_distance);
+ float wrap_distance = dim->Max - dim->Min - dimension_distance;
+ dimension_distance = std::min(dimension_distance, wrap_distance);
+ }
+
+ total_distance += dimension_distance * dimension_distance;
+ }
+ return total_distance;
+}
+
+float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[]) {
+ return sqrt(DistanceSquared(k, dim, p1, p2));
+}
+
+/*---------------------------------------------------------------------------*/
+/// Return whether the query region (the smallest known circle about
+/// query_point_ containing results->k_ points) intersects the box specified
+/// between lower and upper. For circular dimensions, we also check the point
+/// one wrap distance away from the query.
+bool KDTreeSearch::BoxIntersectsSearch(float *lower, float *upper) {
+ float *query = query_point_;
+ // Compute the sum in higher precision.
+ double total_distance = 0.0;
+ double radius_squared = static_cast<double>(results_.max_insertable_key()) *
+ results_.max_insertable_key();
+ PARAM_DESC *dim = tree_->KeyDesc;
+
+ for (int i = tree_->KeySize; i > 0; i--, dim++, query++, lower++, upper++) {
+ if (dim->NonEssential)
+ continue;
+
+ float dimension_distance;
+ if (*query < *lower)
+ dimension_distance = *lower - *query;
+ else if (*query > *upper)
+ dimension_distance = *query - *upper;
+ else
+ dimension_distance = 0;
+
+ /* if this dimension is circular - check wraparound distance */
+ if (dim->Circular) {
+ float wrap_distance = FLT_MAX;
+ if (*query < *lower)
+ wrap_distance = *query + dim->Max - dim->Min - *upper;
+ else if (*query > *upper)
+ wrap_distance = *lower - (*query - (dim->Max - dim->Min));
+ dimension_distance = std::min(dimension_distance, wrap_distance);
+ }
+
+ total_distance +=
+ static_cast<double>(dimension_distance) * dimension_distance;
+ if (total_distance >= radius_squared)
+ return false;
+ }
+ return true;
+}
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Walk a tree, calling action once on each node.
+ *
+ * Operation:
+ * This routine walks through the specified sub_tree and invokes action
+ * action at each node as follows:
+ * action(context, data, level)
+ * data the data contents of the node being visited,
+ * level is the level of the node in the tree with the root being level 0.
+ * @param tree root of the tree being walked.
+ * @param action action to be performed at every node
+ * @param context action's context
+ * @param sub_tree ptr to root of subtree to be walked
+ * @param level current level in the tree for this node
+ */
+void Walk(KDTREE *tree, void_proc action, void *context,
+ KDNODE *sub_tree, int32_t level) {
+ (*action)(context, sub_tree->Data, level);
+ if (sub_tree->Left != nullptr)
+ Walk(tree, action, context, sub_tree->Left, NextLevel(tree, level));
+ if (sub_tree->Right != nullptr)
+ Walk(tree, action, context, sub_tree->Right, NextLevel(tree, level));
+}
+
+/** Given a subtree nodes, insert all of its elements into tree. */
+void InsertNodes(KDTREE *tree, KDNODE *nodes) {
+ if (nodes == nullptr)
+ return;
+
+ KDStore(tree, nodes->Key, nodes->Data);
+ InsertNodes(tree, nodes->Left);
+ InsertNodes(tree, nodes->Right);
+}
+
+/** Free all of the nodes of a sub tree. */
+void FreeSubTree(KDNODE *sub_tree) {
+ if (sub_tree != nullptr) {
+ FreeSubTree(sub_tree->Left);
+ FreeSubTree(sub_tree->Right);
+ free(sub_tree);
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/kdtree.h b/tesseract/src/classify/kdtree.h
new file mode 100644
index 00000000..b8512191
--- /dev/null
+++ b/tesseract/src/classify/kdtree.h
@@ -0,0 +1,98 @@
+/******************************************************************************
+ ** Filename: kdtree.h
+ ** Purpose: Definition of K-D tree access routines.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#ifndef KDTREE_H
+#define KDTREE_H
+
+#include "ocrfeatures.h"
+
+namespace tesseract {
+
+using void_proc = void (*)(...);
+
+/**
+NOTE: All circular parameters of all keys must be in the range
+
+Min <= Param < Max
+
+where Min and Max are specified in the KeyDesc parameter passed to
+MakeKDTree. All KD routines assume that this is true and will not operate
+correctly if circular parameters outside the specified range are used.
+*/
+
+struct KDNODE {
+ float* Key; /**< search key */
+ void* Data; /**< data that corresponds to key */
+ float BranchPoint; /**< needed to make deletes work efficiently */
+ float LeftBranch; /**< used to optimize search pruning */
+ float RightBranch; /**< used to optimize search pruning */
+ struct KDNODE* Left; /**< ptrs for KD tree structure */
+ struct KDNODE* Right;
+};
+
+struct KDTREE {
+ int16_t KeySize; /* number of dimensions in the tree */
+ KDNODE Root; /* Root.Left points to actual root node */
+ PARAM_DESC KeyDesc[1]; /* description of each dimension */
+};
+
+/*----------------------------------------------------------------------------
+ Macros
+-----------------------------------------------------------------------------*/
+#define RootOf(T) ((T)->Root.Left->Data)
+
+/*-----------------------------------------------------------------------------
+ Public Function Prototypes
+-----------------------------------------------------------------------------*/
+KDTREE* MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[]);
+
+void KDStore(KDTREE* Tree, float* Key, void* Data);
+
+void KDDelete(KDTREE* Tree, float Key[], void* Data);
+
+void KDNearestNeighborSearch(KDTREE* Tree, float Query[], int QuerySize,
+ float MaxDistance, int* NumberOfResults,
+ void** NBuffer, float DBuffer[]);
+
+void KDWalk(KDTREE* Tree, void_proc Action, void* context);
+
+void FreeKDTree(KDTREE* Tree);
+
+/*-----------------------------------------------------------------------------
+ Private Function Prototypes
+-----------------------------------------------------------------------------*/
+KDNODE* MakeKDNode(KDTREE* tree, float Key[], void* Data, int Index);
+
+void FreeKDNode(KDNODE* Node);
+
+float DistanceSquared(int k, PARAM_DESC* dim, float p1[], float p2[]);
+
+TESS_API
+float ComputeDistance(int k, PARAM_DESC* dim, float p1[], float p2[]);
+
+int QueryInSearch(KDTREE* tree);
+
+void Walk(KDTREE* tree, void_proc action, void* context, KDNODE* SubTree,
+ int32_t Level);
+
+void InsertNodes(KDTREE* tree, KDNODE* nodes);
+
+void FreeSubTree(KDNODE* SubTree);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/mf.cpp b/tesseract/src/classify/mf.cpp
new file mode 100644
index 00000000..d6232eee
--- /dev/null
+++ b/tesseract/src/classify/mf.cpp
@@ -0,0 +1,82 @@
+/******************************************************************************
+ ** Filename: mf.c
+ ** Purpose: Micro-feature interface to flexible feature extractor.
+ ** Author: Dan Johnson
+ ** History: Thu May 24 09:08:38 1990, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+/*----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------*/
+#include "mf.h"
+
+#include "featdefs.h"
+#include "mfdefs.h"
+#include "mfx.h"
+
+#include <cmath>
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+/**
+ * Call the old micro-feature extractor and then copy
+ * the features into the new format. Then deallocate the
+ * old micro-features.
+ * @param Blob blob to extract micro-features from
+ * @param cn_denorm control parameter to feature extractor.
+ * @return Micro-features for Blob.
+ */
+FEATURE_SET ExtractMicros(TBLOB* Blob, const DENORM& cn_denorm) {
+ int NumFeatures;
+ MICROFEATURES Features, OldFeatures;
+ FEATURE_SET FeatureSet;
+ FEATURE Feature;
+ MICROFEATURE OldFeature;
+
+ // Run the old extractor; a null result means no features could be made.
+ OldFeatures = BlobMicroFeatures(Blob, cn_denorm);
+ if (OldFeatures == nullptr)
+ return nullptr;
+ NumFeatures = count (OldFeatures);
+ FeatureSet = NewFeatureSet (NumFeatures);
+
+ // Copy each old-format micro-feature into a new FEATURE in the set.
+ Features = OldFeatures;
+ iterate(Features) {
+ OldFeature = reinterpret_cast<MICROFEATURE>first_node (Features);
+ Feature = NewFeature (&MicroFeatureDesc);
+ Feature->Params[MFDirection] = OldFeature[ORIENTATION];
+ Feature->Params[MFXPosition] = OldFeature[XPOSITION];
+ Feature->Params[MFYPosition] = OldFeature[YPOSITION];
+ Feature->Params[MFLength] = OldFeature[MFLENGTH];
+
+ // Bulge features are deprecated and should not be used. Set to 0.
+ Feature->Params[MFBulge1] = 0.0f;
+ Feature->Params[MFBulge2] = 0.0f;
+
+#ifndef _WIN32
+ // Assert that feature parameters are well defined.
+ int i;
+ for (i = 0; i < Feature->Type->NumParams; i++) {
+ ASSERT_HOST(!std::isnan(Feature->Params[i]));
+ }
+#endif
+
+ AddFeature(FeatureSet, Feature);
+ }
+ // The old-format list is no longer needed once copied.
+ FreeMicroFeatures(OldFeatures);
+ return FeatureSet;
+} /* ExtractMicros */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/mf.h b/tesseract/src/classify/mf.h
new file mode 100644
index 00000000..b1113ce5
--- /dev/null
+++ b/tesseract/src/classify/mf.h
@@ -0,0 +1,40 @@
+/******************************************************************************
+ ** Filename: mf.h
+ ** Purpose: Micro-feature interface to flexible feature extractor.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef MF_H
+#define MF_H
+
+#include "ocrfeatures.h"
+#include "blobs.h"
+
+namespace tesseract {
+
+// Indices into FEATURE::Params for a micro-feature.
+typedef enum {
+ MFXPosition, MFYPosition,
+ MFLength, MFDirection, MFBulge1, MFBulge2, // bulges are deprecated (always 0)
+ MFCount // For array sizes.
+} MF_PARAM_NAME;
+/*----------------------------------------------------------------------------
+ Private Function Prototypes
+-----------------------------------------------------------------------------*/
+FEATURE_SET ExtractMicros(TBLOB* Blob, const DENORM& cn_denorm);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/mfdefs.cpp b/tesseract/src/classify/mfdefs.cpp
new file mode 100644
index 00000000..3442fdfc
--- /dev/null
+++ b/tesseract/src/classify/mfdefs.cpp
@@ -0,0 +1,46 @@
+/******************************************************************************
+ ** Filename: mfdefs.cpp
+ ** Purpose: Basic routines for manipulating micro-features
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "mfdefs.h"
+
+#include <cmath>
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------**/
+
+/**
+ * This routine allocates and returns a new micro-feature
+ * data structure.
+ * @return New MICROFEATURE
+ */
+MICROFEATURE NewMicroFeature() {
+ // malloc (not new): FreeMicroFeatures releases the blocks with free().
+ return (static_cast<MICROFEATURE>(malloc (sizeof (MFBLOCK))));
+} /* NewMicroFeature */
+
+/**
+ * This routine deallocates all of the memory consumed by
+ * a list of micro-features.
+ * @param MicroFeatures list of micro-features to be freed
+ */
+void FreeMicroFeatures(MICROFEATURES MicroFeatures) {
+ // Each node's payload was allocated with malloc in NewMicroFeature.
+ destroy_nodes(MicroFeatures, free);
+} /* FreeMicroFeatures */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/mfdefs.h b/tesseract/src/classify/mfdefs.h
new file mode 100644
index 00000000..90d5374b
--- /dev/null
+++ b/tesseract/src/classify/mfdefs.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+ ** Filename: mfdefs.h
+ ** Purpose: Definition of micro-features
+ ** Author: Dan Johnson
+ ** History: Mon Jan 22 08:42:13 1990, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef MFDEFS_H
+#define MFDEFS_H
+
+/**----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------**/
+#include "matchdefs.h"
+#include "oldlist.h"
+
+namespace tesseract {
+
+/* definition of a list of micro-features */
+using MICROFEATURES = LIST;
+
+/* definition of structure of micro-features */
+#define MFSIZE 6
+typedef float MFBLOCK[MFSIZE];
+using MICROFEATURE = float*;
+
+/* definitions of individual micro-feature parameters */
+#define XPOSITION 0
+#define YPOSITION 1
+#define MFLENGTH 2
+#define ORIENTATION 3
+#define FIRSTBULGE 4
+#define SECONDBULGE 5
+
+/**----------------------------------------------------------------------------
+ Macros
+----------------------------------------------------------------------------**/
+
+/* macros for accessing micro-feature lists */
+#define NextFeatureOf(L) ((MICROFEATURE)first_node(L))
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+MICROFEATURE NewMicroFeature();
+
+void FreeMicroFeatures(MICROFEATURES MicroFeatures);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/mfoutline.cpp b/tesseract/src/classify/mfoutline.cpp
new file mode 100644
index 00000000..450c7acc
--- /dev/null
+++ b/tesseract/src/classify/mfoutline.cpp
@@ -0,0 +1,446 @@
+/******************************************************************************
+ ** Filename: mfoutline.c
+ ** Purpose: Interface to outline struct used for extracting features
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "mfoutline.h"
+
+#include "clusttool.h" //If removed you get caught in a loop somewhere
+#include "blobs.h"
+#include "mfx.h"
+#include "params.h"
+#include "classify.h"
+
+#include <cmath>
+#include <cstdio>
+
+namespace tesseract {
+
+/*---------------------------------------------------------------------------*/
+/** Convert a blob into a list of MFOUTLINEs (float-based microfeature format).
+ */
+LIST ConvertBlob(TBLOB *blob) {
+ LIST outlines = NIL_LIST;
+ // A null blob yields an empty list; otherwise convert every outline.
+ return (blob == nullptr)
+ ? NIL_LIST
+ : ConvertOutlines(blob->outlines, outlines, outer);
+}
+
+
+/*---------------------------------------------------------------------------*/
+/** Convert a TESSLINE into the float-based MFOUTLINE micro-feature format. */
+MFOUTLINE ConvertOutline(TESSLINE *outline) {
+ MFEDGEPT *NewPoint;
+ MFOUTLINE MFOutline = NIL_LIST;
+ EDGEPT *EdgePoint;
+ EDGEPT *StartPoint;
+ EDGEPT *NextPoint;
+
+ if (outline == nullptr || outline->loop == nullptr)
+ return MFOutline;
+
+ // Walk the circular EDGEPT loop exactly once, copying each point.
+ StartPoint = outline->loop;
+ EdgePoint = StartPoint;
+ do {
+ NextPoint = EdgePoint->next;
+
+ /* filter out duplicate points */
+ if (EdgePoint->pos.x != NextPoint->pos.x ||
+ EdgePoint->pos.y != NextPoint->pos.y) {
+ NewPoint = NewEdgePoint();
+ NewPoint->ClearMark();
+ NewPoint->Hidden = EdgePoint->IsHidden();
+ NewPoint->Point.x = EdgePoint->pos.x;
+ NewPoint->Point.y = EdgePoint->pos.y;
+ MFOutline = push(MFOutline, NewPoint);
+ }
+ EdgePoint = NextPoint;
+ } while (EdgePoint != StartPoint);
+
+ // Mirror the source loop's shape: close the new list into a circle.
+ if (MFOutline != nullptr)
+ MakeOutlineCircular(MFOutline);
+ return MFOutline;
+}
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Convert a tree of outlines to a list of MFOUTLINEs (lists of MFEDGEPTs).
+ *
+ * @param outline first outline to be converted
+ * @param mf_outlines list to add converted outlines to
+ * @param outline_type are the outlines outer or holes?
+ */
+LIST ConvertOutlines(TESSLINE *outline,
+ LIST mf_outlines,
+ OUTLINETYPE outline_type) {
+ MFOUTLINE mf_outline;
+
+ // Convert each outline in the chain; empty conversions are dropped.
+ while (outline != nullptr) {
+ mf_outline = ConvertOutline(outline);
+ if (mf_outline != nullptr)
+ mf_outlines = push(mf_outlines, mf_outline);
+ outline = outline->next;
+ }
+ return mf_outlines;
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine searches through the specified outline, computes
+ * a slope for each vector in the outline, and marks each
+ * vector as having one of the following directions:
+ * N, S, E, W, NE, NW, SE, SW
+ * This information is then stored in the outline and the
+ * outline is returned.
+ * @param Outline micro-feature outline to analyze
+ * @param MinSlope controls "snapping" of segments to horizontal
+ * @param MaxSlope controls "snapping" of segments to vertical
+ */
+void FindDirectionChanges(MFOUTLINE Outline,
+ float MinSlope,
+ float MaxSlope) {
+ MFEDGEPT *Current;
+ MFEDGEPT *Last;
+ MFOUTLINE EdgePoint;
+
+ // An empty or single-point outline has no segments to classify.
+ if (DegenerateOutline (Outline))
+ return;
+
+ // Classify the segment from each point to its successor; the result is
+ // stored in the segment's starting point by ComputeDirection.
+ Last = PointAt (Outline);
+ Outline = NextPointAfter (Outline);
+ EdgePoint = Outline;
+ do {
+ Current = PointAt (EdgePoint);
+ ComputeDirection(Last, Current, MinSlope, MaxSlope);
+
+ Last = Current;
+ EdgePoint = NextPointAfter (EdgePoint);
+ }
+ while (EdgePoint != Outline);
+
+} /* FindDirectionChanges */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine deallocates all of the memory consumed by
+ * a micro-feature outline.
+ * @param arg micro-feature outline to be freed
+ */
+void FreeMFOutline(void *arg) { //MFOUTLINE Outline)
+ MFOUTLINE Start;
+ auto Outline = static_cast<MFOUTLINE>(arg);
+
+ /* break the circular outline so we can use std. techniques to deallocate */
+ Start = list_rest (Outline);
+ set_rest(Outline, NIL_LIST);
+ // Free each edge point's payload, then pop its list node.
+ while (Start != nullptr) {
+ free(first_node(Start));
+ Start = pop (Start);
+ }
+
+} /* FreeMFOutline */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Release all memory consumed by the specified list
+ * of outlines.
+ * @param Outlines list of mf-outlines to be freed
+ */
+void FreeOutlines(LIST Outlines) {
+ // FreeMFOutline breaks each circular outline before deallocating it.
+ destroy_nodes(Outlines, FreeMFOutline);
+} /* FreeOutlines */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine searches through the specified outline and finds
+ * the points at which the outline changes direction. These
+ * points are then marked as "extremities". This routine is
+ * used as an alternative to FindExtremities(). It forces the
+ * endpoints of the microfeatures to be at the direction
+ * changes rather than at the midpoint between direction
+ * changes.
+ * @param Outline micro-feature outline to analyze
+ */
+void MarkDirectionChanges(MFOUTLINE Outline) {
+ MFOUTLINE Current;
+ MFOUTLINE Last;
+ MFOUTLINE First;
+
+ if (DegenerateOutline (Outline))
+ return;
+
+ // Visit every direction change once around the circular outline and
+ // mark each such point as an extremity.
+ First = NextDirectionChange (Outline);
+ Last = First;
+ do {
+ Current = NextDirectionChange (Last);
+ PointAt(Current)->MarkPoint();
+ Last = Current;
+ }
+ while (Last != First);
+
+} /* MarkDirectionChanges */
+
+
+/*---------------------------------------------------------------------------*/
+/** Return a new edge point for a micro-feature outline. */
+MFEDGEPT *NewEdgePoint() {
+ // malloc (not new): FreeMFOutline releases edge points with free().
+ return reinterpret_cast<MFEDGEPT *>(malloc(sizeof(MFEDGEPT)));
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine returns the next point in the micro-feature
+ * outline that is an extremity. The search starts after
+ * EdgePoint. The routine assumes that the outline being
+ * searched is not a degenerate outline (i.e. it must have
+ * 2 or more edge points).
+ * @param EdgePoint start search from this point
+ * @return Next extremity in the outline after EdgePoint.
+ * @note Globals: none
+ */
+MFOUTLINE NextExtremity(MFOUTLINE EdgePoint) {
+ // Scan forward on the circular list until a marked extremity is found;
+ // relies on the outline being non-degenerate and containing a mark.
+ EdgePoint = NextPointAfter(EdgePoint);
+ while (!PointAt(EdgePoint)->ExtremityMark)
+ EdgePoint = NextPointAfter(EdgePoint);
+
+ return (EdgePoint);
+
+} /* NextExtremity */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine normalizes the coordinates of the specified
+ * outline so that the outline is deskewed down to the
+ * baseline, translated so that x=0 is at XOrigin, and scaled
+ * so that the height of a character cell from descender to
+ * ascender is 1. Of this height, 0.25 is for the descender,
+ * 0.25 for the ascender, and 0.5 for the x-height. The
+ * y coordinate of the baseline is 0.
+ * @param Outline outline to be normalized
+ * @param XOrigin x-origin of text
+ */
+void NormalizeOutline(MFOUTLINE Outline,
+ float XOrigin) {
+ if (Outline == NIL_LIST)
+ return;
+
+ // Shift y to the baseline and x to XOrigin, then scale both axes by
+ // MF_SCALE_FACTOR, for every point on the circular outline.
+ MFOUTLINE EdgePoint = Outline;
+ do {
+ MFEDGEPT *Current = PointAt(EdgePoint);
+ Current->Point.y = MF_SCALE_FACTOR *
+ (Current->Point.y - kBlnBaselineOffset);
+ Current->Point.x = MF_SCALE_FACTOR * (Current->Point.x - XOrigin);
+ EdgePoint = NextPointAfter(EdgePoint);
+ } while (EdgePoint != Outline);
+} /* NormalizeOutline */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine normalizes every outline in Outlines
+ * according to the currently selected normalization method.
+ * It also returns the scale factors that it used to do this
+ * scaling. The scale factors returned represent the x and
+ * y sizes in the normalized coordinate system that correspond
+ * to 1 pixel in the original coordinate system.
+ * Outlines are changed and XScale and YScale are updated.
+ *
+ * Globals:
+ * - classify_norm_method method being used for normalization
+ * - classify_char_norm_range map radius of gyration to this value
+ * @param Outlines list of outlines to be normalized
+ * @param XScale x-direction scale factor used by routine
+ * @param YScale y-direction scale factor used by routine
+ */
+void Classify::NormalizeOutlines(LIST Outlines,
+ float *XScale,
+ float *YScale) {
+ MFOUTLINE Outline;
+
+ switch (classify_norm_method) {
+ case character:
+ // Character normalization is handled elsewhere (CharNormalizeOutline).
+ ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
+ break;
+
+ case baseline:
+ // Baseline-normalize every outline; both scale factors are uniform.
+ iterate(Outlines) {
+ Outline = static_cast<MFOUTLINE>first_node(Outlines);
+ NormalizeOutline(Outline, 0.0);
+ }
+ *XScale = *YScale = MF_SCALE_FACTOR;
+ break;
+ }
+} /* NormalizeOutlines */
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+/**
+ * Change the direction of every vector in the specified
+ * outline segment to Direction. The segment to be changed
+ * starts at Start and ends at End. Note that the previous
+ * direction of End must also be changed to reflect the
+ * change in direction of the point before it.
+ * @param Start defines start of segment of outline to be modified
+ * @param End defines end of segment of outline to be modified
+ * @param Direction new direction to assign to segment
+ */
+void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction) {
+ MFOUTLINE Current;
+
+ // Overwrite the outgoing direction of every point in [Start, End).
+ for (Current = Start; Current != End; Current = NextPointAfter (Current))
+ PointAt (Current)->Direction = Direction;
+
+ // End keeps its own Direction, but its incoming direction has changed.
+ PointAt (End)->PreviousDirection = Direction;
+
+} /* ChangeDirection */
+
+/**
+ * This routine normalizes each point in Outline by
+ * translating it to the specified center and scaling it
+ * anisotropically according to the given scale factors.
+ * @param Outline outline to be character normalized
+ * @param cn_denorm
+ */
+void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm) {
+ MFOUTLINE First, Current;
+ MFEDGEPT *CurrentPoint;
+
+ if (Outline == NIL_LIST)
+ return;
+
+ First = Outline;
+ Current = First;
+ do {
+ CurrentPoint = PointAt(Current);
+ FCOORD pos(CurrentPoint->Point.x, CurrentPoint->Point.y);
+ cn_denorm.LocalNormTransform(pos, &pos);
+ // Center the [0, 255] integer feature range on zero, then scale into
+ // the clusterer feature range of [-0.5, 0.5].
+ CurrentPoint->Point.x = (pos.x() - UINT8_MAX / 2) * MF_SCALE_FACTOR;
+ CurrentPoint->Point.y = (pos.y() - UINT8_MAX / 2) * MF_SCALE_FACTOR;
+
+ Current = NextPointAfter(Current);
+ }
+ while (Current != First);
+
+} /* CharNormalizeOutline */
+
+/**
+ * This routine computes the slope from Start to Finish and
+ * and then computes the approximate direction of the line
+ * segment from Start to Finish. The direction is quantized
+ * into 8 buckets:
+ * N, S, E, W, NE, NW, SE, SW
+ * Both the slope and the direction are then stored into
+ * the appropriate fields of the Start edge point. The
+ * direction is also stored into the PreviousDirection field
+ * of the Finish edge point.
+ * @param Start starting point to compute direction from
+ * @param Finish finishing point to compute direction to
+ * @param MinSlope slope below which lines are horizontal
+ * @param MaxSlope slope above which lines are vertical
+ */
+void ComputeDirection(MFEDGEPT *Start,
+ MFEDGEPT *Finish,
+ float MinSlope,
+ float MaxSlope) {
+ FVECTOR Delta;
+
+ Delta.x = Finish->Point.x - Start->Point.x;
+ Delta.y = Finish->Point.y - Start->Point.y;
+ // Vertical segment: slope is +/-infinity, direction is due north/south.
+ if (Delta.x == 0) {
+ if (Delta.y < 0) {
+ Start->Slope = -FLT_MAX;
+ Start->Direction = south;
+ } else {
+ Start->Slope = FLT_MAX;
+ Start->Direction = north;
+ }
+ } else {
+ Start->Slope = Delta.y / Delta.x;
+ // Quantize by half-plane (sign of Delta) and by slope against the
+ // MinSlope/MaxSlope thresholds: below MinSlope snaps to horizontal,
+ // above MaxSlope snaps to vertical, in between is diagonal.
+ if (Delta.x > 0) {
+ if (Delta.y > 0) {
+ if (Start->Slope > MinSlope) {
+ if (Start->Slope < MaxSlope) {
+ Start->Direction = northeast;
+ } else {
+ Start->Direction = north;
+ }
+ } else {
+ Start->Direction = east;
+ }
+ }
+ else if (Start->Slope < -MinSlope) {
+ if (Start->Slope > -MaxSlope) {
+ Start->Direction = southeast;
+ } else {
+ Start->Direction = south;
+ }
+ } else {
+ Start->Direction = east;
+ }
+ } else if (Delta.y > 0) { // western half-plane, heading up
+ if (Start->Slope < -MinSlope) {
+ if (Start->Slope > -MaxSlope) {
+ Start->Direction = northwest;
+ } else {
+ Start->Direction = north;
+ }
+ } else {
+ Start->Direction = west;
+ }
+ } else if (Start->Slope > MinSlope) { // western half-plane, heading down
+ if (Start->Slope < MaxSlope) {
+ Start->Direction = southwest;
+ } else {
+ Start->Direction = south;
+ }
+ } else {
+ Start->Direction = west;
+ }
+ }
+ Finish->PreviousDirection = Start->Direction;
+}
+
+/**
+ * This routine returns the next point in the micro-feature
+ * outline that has a direction different than EdgePoint. The
+ * routine assumes that the outline being searched is not a
+ * degenerate outline (i.e. it must have 2 or more edge points).
+ * @param EdgePoint start search from this point
+ * @return Point of next direction change in micro-feature outline.
+ * @note Globals: none
+ */
+MFOUTLINE NextDirectionChange(MFOUTLINE EdgePoint) {
+ DIRECTION InitialDirection;
+
+ InitialDirection = PointAt (EdgePoint)->Direction;
+
+ // Advance until the direction differs from InitialDirection. The loop
+ // also stops early at hidden points (presumably so features never span
+ // hidden segments -- confirm against callers).
+ MFOUTLINE next_pt = nullptr;
+ do {
+ EdgePoint = NextPointAfter(EdgePoint);
+ next_pt = NextPointAfter(EdgePoint);
+ } while (PointAt(EdgePoint)->Direction == InitialDirection &&
+ !PointAt(EdgePoint)->Hidden &&
+ next_pt != nullptr && !PointAt(next_pt)->Hidden);
+
+ return (EdgePoint);
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/mfoutline.h b/tesseract/src/classify/mfoutline.h
new file mode 100644
index 00000000..6da42855
--- /dev/null
+++ b/tesseract/src/classify/mfoutline.h
@@ -0,0 +1,135 @@
+/******************************************************************************
+ ** Filename: mfoutline.h
+ ** Purpose: Interface spec for fx outline structures
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef MFOUTLINE_H
+#define MFOUTLINE_H
+
+#include "blobs.h"
+#include "fpoint.h"
+#include "oldlist.h"
+#include "params.h"
+
+namespace tesseract {
+
+using MFOUTLINE = LIST;
+
+enum DIRECTION : uint8_t {
+ north,
+ south,
+ east,
+ west,
+ northeast,
+ northwest,
+ southeast,
+ southwest
+};
+
+struct MFEDGEPT {
+ // Inline functions for manipulating micro-feature outline edge points.
+
+ void ClearMark() {
+ ExtremityMark = false;
+ }
+
+ void MarkPoint() {
+ ExtremityMark = true;
+ }
+
+ FPOINT Point; // coordinates of the edge point
+ float Slope; // slope of the segment leaving this point
+ bool Hidden; // copied from EDGEPT::IsHidden() in ConvertOutline
+ bool ExtremityMark; // true if marked as a direction-change extremity
+ DIRECTION Direction; // quantized direction of segment leaving this point
+ DIRECTION PreviousDirection; // direction of segment entering this point
+};
+
+enum OUTLINETYPE { outer, hole };
+
+enum NORM_METHOD { baseline, character };
+
+/**----------------------------------------------------------------------------
+ Macros
+----------------------------------------------------------------------------**/
+#define AverageOf(A, B) (((A) + (B)) / 2)
+
+// Constant for computing the scale factor to use to normalize characters.
+const float MF_SCALE_FACTOR = 0.5f / kBlnXHeight;
+
+// Inline functions for manipulating micro-feature outlines.
+
+// True for an empty outline or a single self-linked point: no segments.
+static inline bool DegenerateOutline(MFOUTLINE Outline) {
+ return (Outline == NIL_LIST) || (Outline == list_rest(Outline));
+}
+
+// Returns the edge point stored in the current list node.
+static inline MFEDGEPT* PointAt(MFOUTLINE Outline) {
+ return reinterpret_cast<MFEDGEPT*>first_node(Outline);
+}
+
+// Steps to the next node of the (circular) outline list.
+static inline MFOUTLINE NextPointAfter(MFOUTLINE Outline) {
+ return list_rest(Outline);
+}
+
+// Links the last node back to the first, making the list circular.
+static inline void MakeOutlineCircular(MFOUTLINE Outline) {
+ set_rest(last(Outline), Outline);
+}
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+void ComputeBlobCenter(TBLOB* Blob, TPOINT* BlobCenter);
+
+LIST ConvertBlob(TBLOB* Blob);
+
+MFOUTLINE ConvertOutline(TESSLINE* Outline);
+
+LIST ConvertOutlines(TESSLINE* Outline, LIST ConvertedOutlines,
+ OUTLINETYPE OutlineType);
+
+void FilterEdgeNoise(MFOUTLINE Outline, float NoiseSegmentLength);
+
+void FindDirectionChanges(MFOUTLINE Outline, float MinSlope, float MaxSlope);
+
+// Parameter name fixed from "agr" to "arg" to match the definition.
+void FreeMFOutline(void* arg); // MFOUTLINE Outline);
+
+void FreeOutlines(LIST Outlines);
+
+void MarkDirectionChanges(MFOUTLINE Outline);
+
+MFEDGEPT* NewEdgePoint();
+
+MFOUTLINE NextExtremity(MFOUTLINE EdgePoint);
+
+void NormalizeOutline(MFOUTLINE Outline, float XOrigin);
+
+/*----------------------------------------------------------------------------
+ Private Function Prototypes
+-----------------------------------------------------------------------------*/
+void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction);
+
+// Normalizes the Outline in-place using cn_denorm's local transformation,
+// then converts from the integer feature range [0,255] to the clusterer
+// feature range of [-0.5, 0.5].
+void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm);
+
+void ComputeDirection(MFEDGEPT* Start, MFEDGEPT* Finish, float MinSlope,
+ float MaxSlope);
+
+MFOUTLINE NextDirectionChange(MFOUTLINE EdgePoint);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/mfx.cpp b/tesseract/src/classify/mfx.cpp
new file mode 100644
index 00000000..49b7f0e0
--- /dev/null
+++ b/tesseract/src/classify/mfx.cpp
@@ -0,0 +1,152 @@
+/******************************************************************************
+ ** Filename: mfx.c
+ ** Purpose: Micro feature extraction routines
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#include "mfx.h"
+
+#include "mfdefs.h"
+#include "mfoutline.h"
+#include "clusttool.h" //NEEDED
+#include "intfx.h"
+#include "normalis.h"
+#include "params.h"
+
+namespace tesseract {
+
+/* old numbers corresponded to 10.0 degrees and 80.0 degrees */
+double_VAR(classify_min_slope, 0.414213562,
+ "Slope below which lines are called horizontal");
+double_VAR(classify_max_slope, 2.414213562,
+ "Slope above which lines are called vertical");
+
+/*----------------------------------------------------------------------------
+ Private Function Prototypes
+-----------------------------------------------------------------------------*/
+
+MICROFEATURES ConvertToMicroFeatures(MFOUTLINE Outline,
+ MICROFEATURES MicroFeatures);
+
+MICROFEATURE ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End);
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+
+/**
+ * This routine extracts micro-features from the specified
+ * blob and returns a list of the micro-features. All
+ * micro-features are normalized according to the specified
+ * line statistics.
+ * @param Blob blob to extract micro-features from
+ * @param cn_denorm control parameter to feature extractor
+ * @return List of micro-features extracted from the blob.
+ */
+MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm) {
+ MICROFEATURES MicroFeatures = NIL_LIST;
+ LIST Outlines;
+ LIST RemainingOutlines;
+ MFOUTLINE Outline;
+
+ if (Blob != nullptr) {
+ Outlines = ConvertBlob(Blob);
+
+ // Pass 1: character-normalize every outline.
+ RemainingOutlines = Outlines;
+ iterate(RemainingOutlines) {
+ Outline = static_cast<MFOUTLINE>first_node (RemainingOutlines);
+ CharNormalizeOutline(Outline, cn_denorm);
+ }
+
+ // Pass 2: classify segment directions, mark the direction changes as
+ // feature endpoints, and build micro-features from the marked spans.
+ RemainingOutlines = Outlines;
+ iterate(RemainingOutlines) {
+ Outline = static_cast<MFOUTLINE>first_node(RemainingOutlines);
+ FindDirectionChanges(Outline, classify_min_slope, classify_max_slope);
+ MarkDirectionChanges(Outline);
+ MicroFeatures = ConvertToMicroFeatures(Outline, MicroFeatures);
+ }
+ // The intermediate outline representation is no longer needed.
+ FreeOutlines(Outlines);
+ }
+ return MicroFeatures;
+} /* BlobMicroFeatures */
+
+/*---------------------------------------------------------------------------
+ Private Code
+---------------------------------------------------------------------------*/
+
+/**
+ * Convert Outline to MicroFeatures
+ * @param Outline outline to extract micro-features from
+ * @param MicroFeatures list of micro-features to add to
+ * @return List of micro-features with new features added to front.
+ * @note Globals: none
+ */
+MICROFEATURES ConvertToMicroFeatures(MFOUTLINE Outline,
+ MICROFEATURES MicroFeatures) {
+ MFOUTLINE Current;
+ MFOUTLINE Last;
+ MFOUTLINE First;
+ MICROFEATURE NewFeature;
+
+ if (DegenerateOutline (Outline))
+ return (MicroFeatures);
+
+ // Walk extremity-to-extremity around the outline; each span between
+ // consecutive extremities becomes one micro-feature. Spans ending at
+ // a hidden point are skipped.
+ First = NextExtremity (Outline);
+ Last = First;
+ do {
+ Current = NextExtremity (Last);
+ if (!PointAt(Current)->Hidden) {
+ NewFeature = ExtractMicroFeature (Last, Current);
+ if (NewFeature != nullptr)
+ MicroFeatures = push (MicroFeatures, NewFeature);
+ }
+ Last = Current;
+ }
+ while (Last != First);
+
+ return (MicroFeatures);
+} /* ConvertToMicroFeatures */
+
+/**
+ * This routine computes the feature parameters which describe
+ * the micro-feature that starts and Start and ends at End.
+ * A new micro-feature is allocated, filled with the feature
+ * parameters, and returned. The routine assumes that
+ * Start and End are not the same point. If they are the
+ * same point, nullptr is returned, a warning message is
+ * printed, and the current outline is dumped to stdout.
+ * @param Start starting point of micro-feature
+ * @param End ending point of micro-feature
+ * @return New micro-feature or nullptr if the feature was rejected.
+ * @note Globals: none
+ */
+MICROFEATURE ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End) {
+ MICROFEATURE NewFeature;
+ MFEDGEPT *P1, *P2;
+
+ P1 = PointAt(Start);
+ P2 = PointAt(End);
+
+ // Position is the segment midpoint; length and orientation are taken
+ // from the vector P1 -> P2.
+ NewFeature = NewMicroFeature ();
+ NewFeature[XPOSITION] = AverageOf(P1->Point.x, P2->Point.x);
+ NewFeature[YPOSITION] = AverageOf(P1->Point.y, P2->Point.y);
+ NewFeature[MFLENGTH] = DistanceBetween(P1->Point, P2->Point);
+ NewFeature[ORIENTATION] = NormalizedAngleFrom(&P1->Point, &P2->Point, 1.0);
+ NewFeature[FIRSTBULGE] = 0.0f; // deprecated
+ NewFeature[SECONDBULGE] = 0.0f; // deprecated
+
+ return NewFeature;
+} /* ExtractMicroFeature */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/mfx.h b/tesseract/src/classify/mfx.h
new file mode 100644
index 00000000..818e6917
--- /dev/null
+++ b/tesseract/src/classify/mfx.h
@@ -0,0 +1,46 @@
+/******************************************************************************
+ ** Filename: mfx.h
+ ** Purpose: Definition of micro-feature extraction routines
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef MFX_H
+#define MFX_H
+
+#include "mfdefs.h"
+#include "params.h"
+
+namespace tesseract {
+
+class DENORM;
+struct TBLOB;
+
+/*----------------------------------------------------------------------------
+ Variables
+----------------------------------------------------------------------------**/
+
+/* old numbers corresponded to 10.0 degrees and 80.0 degrees */
+extern double_VAR_H(classify_min_slope, 0.414213562,
+ "Slope below which lines are called horizontal");
+extern double_VAR_H(classify_max_slope, 2.414213562,
+ "Slope above which lines are called vertical");
+
+/*----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/normfeat.cpp b/tesseract/src/classify/normfeat.cpp
new file mode 100644
index 00000000..6aa13cc2
--- /dev/null
+++ b/tesseract/src/classify/normfeat.cpp
@@ -0,0 +1,73 @@
+/******************************************************************************
+ ** Filename: normfeat.c
+ ** Purpose: Definition of char normalization features.
+ ** Author: Dan Johnson
+ ** History: 12/14/90, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "normfeat.h"
+
+#include "intfx.h"
+#include "featdefs.h"
+#include "mfoutline.h"
+
+namespace tesseract {
+
+/** Return the length of the outline in baseline normalized form. */
+float ActualOutlineLength(FEATURE Feature) {
+  // The stored length was divided by LENGTH_COMPRESSION; undo that here.
+  return Feature->Params[CharNormLength] * LENGTH_COMPRESSION;
+}
+
+/**
+ * Return the character normalization feature for a blob.
+ *
+ * The features returned are in a scale where the x-height has been
+ * normalized to live in the region y = [-0.25 .. 0.25]. Example ranges
+ * for English below are based on the Linux font collection on 2009-12-04:
+ *
+ * - Params[CharNormY]
+ * - The y coordinate of the grapheme's centroid.
+ * - English: [-0.27, 0.71]
+ *
+ * - Params[CharNormLength]
+ * - The length of the grapheme's outline (tiny segments discarded),
+ * divided by 10.0=LENGTH_COMPRESSION.
+ * - English: [0.16, 0.85]
+ *
+ * - Params[CharNormRx]
+ * - The radius of gyration about the x axis, as measured from CharNormY.
+ * - English: [0.011, 0.34]
+ *
+ * - Params[CharNormRy]
+ * - The radius of gyration about the y axis, as measured from
+ * the x center of the grapheme's bounding box.
+ * - English: [0.011, 0.31]
+ */
+FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT& fx_info) {
+  FEATURE feature = NewFeature(&CharNormDesc);
+
+  // All parameters are mapped into the classifier's normalized space
+  // by MF_SCALE_FACTOR; Y is measured relative to the baseline.
+  feature->Params[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;
+  feature->Params[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;
+  feature->Params[CharNormY] =
+      MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);
+  feature->Params[CharNormLength] =
+      MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;
+
+  FEATURE_SET feature_set = NewFeatureSet(1);
+  AddFeature(feature_set, feature);
+  return feature_set;
+} /* ExtractCharNormFeatures */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/normfeat.h b/tesseract/src/classify/normfeat.h
new file mode 100644
index 00000000..6293cab9
--- /dev/null
+++ b/tesseract/src/classify/normfeat.h
@@ -0,0 +1,40 @@
+/******************************************************************************
+ ** Filename: normfeat.h
+ ** Purpose: Definition of character normalization features.
+ ** Author: Dan Johnson
+ ** History: 12/14/90, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *****************************************************************************/
+
+#ifndef NORMFEAT_H
+#define NORMFEAT_H
+
+#include "ocrfeatures.h"
+
+namespace tesseract {
+
+#define LENGTH_COMPRESSION (10.0)
+
+struct INT_FX_RESULT_STRUCT;
+
+typedef enum {
+ CharNormY, CharNormLength, CharNormRx, CharNormRy
+} NORM_PARAM_NAME;
+
+float ActualOutlineLength(FEATURE Feature);
+
+FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info);
+
+}
+
+#endif
diff --git a/tesseract/src/classify/normmatch.cpp b/tesseract/src/classify/normmatch.cpp
new file mode 100644
index 00000000..32bd2876
--- /dev/null
+++ b/tesseract/src/classify/normmatch.cpp
@@ -0,0 +1,231 @@
+/******************************************************************************
+ ** Filename: normmatch.c
+ ** Purpose: Simple matcher based on character normalization features.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+/*----------------------------------------------------------------------------
+ Include Files and Type Defines
+----------------------------------------------------------------------------*/
+#include "normmatch.h"
+
+#include "classify.h"
+#include "clusttool.h"
+#include "helpers.h"
+#include "normfeat.h"
+#include "unicharset.h"
+#include "params.h"
+
+#include <cfloat>  // for FLT_MAX
+#include <cmath>
+#include <cstdio>
+#include <sstream> // for std::istringstream
+
+namespace tesseract {
+
+struct NORM_PROTOS
+{
+  int NumParams;          // number of parameters in each prototype
+  PARAM_DESC *ParamDesc;  // array of NumParams parameter descriptions
+  LIST* Protos;           // one prototype list per class id
+  int NumProtos;          // number of classes (size of Protos array)
+};
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+
+/**
+ * @name NormEvidenceOf
+ *
+ * Return the new type of evidence number corresponding to this
+ * normalization adjustment. The equation that represents the transform is:
+ * 1 / (1 + (NormAdj / midpoint) ^ curl)
+ */
+static double NormEvidenceOf(double NormAdj) {
+  // Scale so the sigmoid midpoint falls at NormAdj == midpoint,
+  // where this function returns exactly 0.5.
+  NormAdj /= classify_norm_adj_midpoint;
+
+  // Special-case the common integer curl values to avoid a pow() call.
+  if (classify_norm_adj_curl == 3) {
+    NormAdj = NormAdj * NormAdj * NormAdj;
+  } else if (classify_norm_adj_curl == 2) {
+    NormAdj = NormAdj * NormAdj;
+  } else {
+    NormAdj = pow(NormAdj, classify_norm_adj_curl);
+  }
+  return (1.0 / (1.0 + NormAdj));
+}
+
+/*----------------------------------------------------------------------------
+ Variables
+----------------------------------------------------------------------------*/
+
+/** control knobs used to control the normalization adjustment process */
+// Distance at which NormEvidenceOf() falls to 0.5.
+double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
+// Exponent of the sigmoid used in NormEvidenceOf().
+double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
+/** Weight of width variance against height and vertical position. */
+const double kWidthErrorWeighting = 0.125;
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+/**
+ * This routine compares Features against each character
+ * normalization proto for ClassId and returns the match
+ * rating of the best match.
+ * @param ClassId id of class to match against
+ * @param feature character normalization feature
+ * @param DebugMatch controls dump of debug info
+ *
+ * Globals:
+ * #NormProtos character normalization prototypes
+ *
+ * @return Best match rating for Feature against protos of ClassId.
+ */
+float Classify::ComputeNormMatch(CLASS_ID ClassId,
+                                 const FEATURE_STRUCT& feature,
+                                 bool DebugMatch) {
+  LIST Protos;
+  float BestMatch;
+  float Match;
+  float Delta;
+  PROTOTYPE *Proto;
+
+  // Class ids beyond the proto table are treated as noise.
+  if (ClassId >= NormProtos->NumProtos) {
+    ClassId = NO_CLASS;
+  }
+
+  /* handle requests for classification as noise */
+  if (ClassId == NO_CLASS) {
+    /* kludge - clean up constants and make into control knobs later */
+    Match = (feature.Params[CharNormLength] *
+      feature.Params[CharNormLength] * 500.0 +
+      feature.Params[CharNormRx] *
+      feature.Params[CharNormRx] * 8000.0 +
+      feature.Params[CharNormRy] *
+      feature.Params[CharNormRy] * 8000.0);
+    return (1.0 - NormEvidenceOf(Match));
+  }
+
+  BestMatch = FLT_MAX;
+  Protos = NormProtos->Protos[ClassId];
+
+  if (DebugMatch) {
+    tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
+  }
+
+  iterate(Protos) {
+    // Parenthesize the first_node call so the cast is well-formed
+    // whether first_node is a macro or a function.
+    Proto = reinterpret_cast<PROTOTYPE *>(first_node(Protos));
+    // Accumulate a weighted squared distance over the three used params.
+    Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
+    Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
+    if (DebugMatch) {
+      tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
+              Proto->Mean[CharNormY], Delta,
+              Proto->Weight.Elliptical[CharNormY], Match);
+    }
+    Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
+    Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
+    if (DebugMatch) {
+      tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
+              Proto->Mean[CharNormRx], Delta,
+              Proto->Weight.Elliptical[CharNormRx], Match);
+    }
+    // Ry is width! See intfx.cpp.
+    Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
+    if (DebugMatch) {
+      tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
+              Proto->Mean[CharNormRy], Delta,
+              Proto->Weight.Elliptical[CharNormRy]);
+    }
+    // Width errors count for less than height/position errors.
+    Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
+    Delta *= kWidthErrorWeighting;
+    Match += Delta;
+    if (DebugMatch) {
+      tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
+              Match, Match / classify_norm_adj_midpoint,
+              NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
+    }
+
+    if (Match < BestMatch)
+      BestMatch = Match;
+  }
+  // Convert the best (smallest) distance into a penalty in [0, 1).
+  return 1.0 - NormEvidenceOf(BestMatch);
+} /* ComputeNormMatch */
+
+void Classify::FreeNormProtos() {
+  if (NormProtos == nullptr) {
+    return;
+  }
+  // Release each per-class prototype list before the arrays holding them.
+  for (int i = 0; i < NormProtos->NumProtos; i++) {
+    FreeProtoList(&NormProtos->Protos[i]);
+  }
+  free(NormProtos->Protos);
+  free(NormProtos->ParamDesc);
+  free(NormProtos);
+  NormProtos = nullptr;
+}
+
+/**
+ * This routine allocates a new data structure to hold
+ * a set of character normalization protos. It then fills in
+ * the data structure by reading from the specified File.
+ * @param fp open text file to read normalization protos from
+ * Globals: none
+ * @return Character normalization protos.
+ */
+NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
+  NORM_PROTOS *NormProtos;
+  int i;
+  char unichar[2 * UNICHAR_LEN + 1];
+  UNICHAR_ID unichar_id;
+  LIST Protos;
+  int NumProtos;
+
+  /* allocate and initialization data structure */
+  // One (initially empty) prototype list per unichar in the set.
+  NormProtos = static_cast<NORM_PROTOS *>(malloc (sizeof (NORM_PROTOS)));
+  NormProtos->NumProtos = unicharset.size();
+  NormProtos->Protos = static_cast<LIST *>(malloc (NormProtos->NumProtos * sizeof(LIST)));
+  for (i = 0; i < NormProtos->NumProtos; i++)
+    NormProtos->Protos[i] = NIL_LIST;
+
+  /* read file header and save in data structure */
+  NormProtos->NumParams = ReadSampleSize(fp);
+  NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
+
+  /* read protos for each class into a separate list */
+  // Each class starts with a "<unichar> <NumProtos>" line followed by
+  // NumProtos prototype records.
+  const int kMaxLineSize = 100;
+  char line[kMaxLineSize];
+  while (fp->FGets(line, kMaxLineSize) != nullptr) {
+    std::istringstream stream(line);
+    // Use the classic locale so numeric parsing is locale-independent.
+    stream.imbue(std::locale::classic());
+    stream >> unichar >> NumProtos;
+    if (stream.fail()) {
+      // Skip malformed header lines (e.g. blank lines).
+      continue;
+    }
+    if (unicharset.contains_unichar(unichar)) {
+      unichar_id = unicharset.unichar_to_id(unichar);
+      Protos = NormProtos->Protos[unichar_id];
+      for (i = 0; i < NumProtos; i++)
+        Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
+      NormProtos->Protos[unichar_id] = Protos;
+    } else {
+      tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
+              unichar);
+      // Still consume the prototypes so the file stays in sync.
+      for (i = 0; i < NumProtos; i++)
+        FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
+    }
+  }
+  return (NormProtos);
+} /* ReadNormProtos */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/normmatch.h b/tesseract/src/classify/normmatch.h
new file mode 100644
index 00000000..77f66550
--- /dev/null
+++ b/tesseract/src/classify/normmatch.h
@@ -0,0 +1,34 @@
+/******************************************************************************
+ ** Filename: normmatch.h
+ ** Purpose: Simple matcher based on character normalization features.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef NORMMATCH_H
+#define NORMMATCH_H
+
+#include "matchdefs.h"
+#include "ocrfeatures.h"
+#include "params.h"
+
+namespace tesseract {
+
+/* control knobs used to control the normalization adjustment process */
+extern double_VAR_H(classify_norm_adj_midpoint, 32.0,
+ "Norm adjust midpoint ...");
+extern double_VAR_H(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/ocrfeatures.cpp b/tesseract/src/classify/ocrfeatures.cpp
new file mode 100644
index 00000000..b8d646b1
--- /dev/null
+++ b/tesseract/src/classify/ocrfeatures.cpp
@@ -0,0 +1,190 @@
+/******************************************************************************
+ ** Filename: ocrfeatures.cpp
+ ** Purpose: Generic definition of a feature.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "ocrfeatures.h"
+
+#include "scanutils.h"
+#include "strngs.h" // for STRING
+
+#include <cassert>
+#include <cmath>
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+/**
+ * Add a feature to a feature set. If the feature set is
+ * already full, false is returned to indicate that the
+ * feature could not be added to the set; otherwise, true is
+ * returned.
+ * @param FeatureSet set of features to add Feature to
+ * @param Feature feature to be added to FeatureSet
+ * @return true if feature added to set, false if set is already full.
+ */
+bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature) {
+  const bool has_room = FeatureSet->NumFeatures < FeatureSet->MaxNumFeatures;
+  if (has_room) {
+    FeatureSet->Features[FeatureSet->NumFeatures++] = Feature;
+  } else {
+    // AddFeature is a sink either way: a rejected feature is freed
+    // so the caller never has to clean it up.
+    FreeFeature(Feature);
+  }
+  return has_room;
+} /* AddFeature */
+
+/**
+ * Release the memory consumed by the specified feature.
+ * @param Feature feature to be deallocated.
+ */
+void FreeFeature(FEATURE Feature) { free(Feature); } /* FreeFeature */
+
+/**
+ * Release the memory consumed by the specified feature
+ * set. This routine also frees the memory consumed by the
+ * features contained in the set.
+ * @param FeatureSet set of features to be freed
+ */
+void FreeFeatureSet(FEATURE_SET FeatureSet) {
+  if (FeatureSet == nullptr) {
+    return;
+  }
+  // The set owns its features; release them before the set itself.
+  for (int i = 0; i < FeatureSet->NumFeatures; i++) {
+    FreeFeature(FeatureSet->Features[i]);
+  }
+  free(FeatureSet);
+} /* FreeFeatureSet */
+
+/**
+ * Allocate and return a new feature of the specified
+ * type.
+ * @param FeatureDesc description of feature to be created.
+ * @return New #FEATURE.
+ */
+FEATURE NewFeature(const FEATURE_DESC_STRUCT* FeatureDesc) {
+  // Params is declared as float[1], so allocate room for the
+  // remaining NumParams - 1 entries right after the struct.
+  const size_t size =
+      sizeof(FEATURE_STRUCT) + (FeatureDesc->NumParams - 1) * sizeof(float);
+  FEATURE Feature = static_cast<FEATURE>(malloc(size));
+  Feature->Type = FeatureDesc;
+  return Feature;
+} /* NewFeature */
+
+/**
+ * Allocate and return a new feature set large enough to
+ * hold the specified number of features.
+ * @param NumFeatures maximum # of features to be put in feature set
+ * @return New #FEATURE_SET.
+ */
+FEATURE_SET NewFeatureSet(int NumFeatures) {
+  // Features is declared as FEATURE[1]; over-allocate for the rest.
+  const size_t size =
+      sizeof(FEATURE_SET_STRUCT) + (NumFeatures - 1) * sizeof(FEATURE);
+  FEATURE_SET FeatureSet = static_cast<FEATURE_SET>(malloc(size));
+  FeatureSet->MaxNumFeatures = NumFeatures;
+  FeatureSet->NumFeatures = 0;
+  return FeatureSet;
+} /* NewFeatureSet */
+
+/**
+ * Create a new feature of the specified type and read in
+ * the value of its parameters from File. The extra penalty
+ * for the feature is also computed by calling the appropriate
+ * function for the specified feature type. The correct text
+ * representation for a feature is a list of N floats where
+ * N is the number of parameters in the feature.
+ * @param File open text file to read feature from
+ * @param FeatureDesc specifies type of feature to read from File
+ * @return New #FEATURE read from File.
+ */
+static FEATURE ReadFeature(FILE* File, const FEATURE_DESC_STRUCT* FeatureDesc) {
+  FEATURE Feature;
+  int i;
+
+  Feature = NewFeature (FeatureDesc);
+  // One float per parameter; abort on malformed input.
+  for (i = 0; i < Feature->Type->NumParams; i++) {
+    ASSERT_HOST(tfscanf(File, "%f", &(Feature->Params[i])) == 1);
+#ifndef _WIN32
+    // Guard against NaNs leaking in from corrupt training data.
+    assert (!std::isnan(Feature->Params[i]));
+#endif
+  }
+  return Feature;
+}
+
+/**
+ * Create a new feature set of the specified type and read in
+ * the features from File. The correct text representation
+ * for a feature set is an integer which specifies the number (N)
+ * of features in a set followed by a list of N feature
+ * descriptions.
+ * @param File open text file to read new feature set from
+ * @param FeatureDesc specifies type of feature to read from File
+ * @return New feature set read from File.
+ */
+FEATURE_SET ReadFeatureSet(FILE* File, const FEATURE_DESC_STRUCT* FeatureDesc) {
+  // The set is encoded as a count followed by that many features.
+  int NumFeatures;
+  ASSERT_HOST(tfscanf(File, "%d", &NumFeatures) == 1);
+  ASSERT_HOST(NumFeatures >= 0);
+
+  FEATURE_SET set = NewFeatureSet(NumFeatures);
+  int remaining = NumFeatures;
+  while (remaining-- > 0) {
+    AddFeature(set, ReadFeature(File, FeatureDesc));
+  }
+  return set;
+}
+
+/**
+ * Appends a textual representation of Feature to str.
+ * This representation is simply a list of the N parameters
+ * of the feature, terminated with a newline. It is assumed
+ * that the ExtraPenalty field can be reconstructed from the
+ * parameters of the feature. It is also assumed that the
+ * feature type information is specified or assumed elsewhere.
+ * @param Feature feature to write out to str
+ * @param str string to write Feature to
+ */
+static void WriteFeature(FEATURE Feature, STRING* str) {
+  for (int i = 0; i < Feature->Type->NumParams; i++) {
+    // Use _WIN32 (the compiler-defined macro) to match ReadFeature;
+    // plain WIN32 is only defined by some build environments, so the
+    // original "#ifndef WIN32" enabled this assert on Windows too.
+#ifndef _WIN32
+    assert(!std::isnan(Feature->Params[i]));
+#endif
+    str->add_str_double(" ", Feature->Params[i]);
+  }
+  *str += "\n";
+} /* WriteFeature */
+
+/**
+ * Write a textual representation of FeatureSet to File.
+ * This representation is an integer specifying the number of
+ * features in the set, followed by a newline, followed by
+ * text representations for each feature in the set.
+ * @param FeatureSet feature set to write to File
+ * @param str string to write Feature to
+ */
+void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str) {
+  if (FeatureSet == nullptr) {
+    return;
+  }
+  // Emit the count on its own line, then one line per feature.
+  str->add_str_int("", FeatureSet->NumFeatures);
+  *str += "\n";
+  for (int i = 0; i < FeatureSet->NumFeatures; i++) {
+    WriteFeature(FeatureSet->Features[i], str);
+  }
+} /* WriteFeatureSet */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/ocrfeatures.h b/tesseract/src/classify/ocrfeatures.h
new file mode 100644
index 00000000..edf63496
--- /dev/null
+++ b/tesseract/src/classify/ocrfeatures.h
@@ -0,0 +1,122 @@
+/******************************************************************************
+ ** Filename: ocrfeatures.h
+ ** Purpose: Generic definition of a feature.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef FEATURES_H
+#define FEATURES_H
+
+#include "blobs.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
+class DENORM;
+
+#undef Min
+#undef Max
+#define FEAT_NAME_SIZE 80
+
+// A character is described by multiple sets of extracted features. Each
+// set contains a number of features of a particular type, for example, a
+// set of bays, or a set of closures, or a set of microfeatures. Each
+// feature consists of a number of parameters. All features within a
+// feature set contain the same number of parameters. All circular
+// parameters are required to be the first parameters in the feature.
+
+struct PARAM_DESC {
+  bool Circular;     // true if dimension wraps around
+  bool NonEssential; // true if dimension not used in searches
+  float Min;         // low end of range for circular dimensions
+  float Max;         // high end of range for circular dimensions
+  float Range;       // Max - Min
+  float HalfRange;   // (Max - Min)/2
+  float MidRange;    // (Max + Min)/2
+};
+
+struct FEATURE_DESC_STRUCT {
+  uint16_t NumParams;          // total # of params
+  const char* ShortName;       // short name for feature
+  const PARAM_DESC* ParamDesc; // array - one per param
+};
+using FEATURE_DESC = FEATURE_DESC_STRUCT*;
+
+struct FEATURE_STRUCT {
+  const FEATURE_DESC_STRUCT* Type; // points to description of feature type
+  // Flexible-array idiom: NewFeature over-allocates so Params really
+  // holds Type->NumParams entries.
+  float Params[1]; // variable size array - params for feature
+};
+using FEATURE = FEATURE_STRUCT*;
+
+struct FEATURE_SET_STRUCT {
+  uint16_t NumFeatures;    // number of features in set
+  uint16_t MaxNumFeatures; // maximum size of feature set
+  // Flexible-array idiom: NewFeatureSet over-allocates so Features
+  // really holds MaxNumFeatures entries.
+  FEATURE Features[1]; // variable size array of features
+};
+using FEATURE_SET = FEATURE_SET_STRUCT*;
+
+// A generic character description as a char pointer. In reality, it will be
+// a pointer to some data structure. Paired feature extractors/matchers need
+// to agree on the data structure to be used, however, the high level
+// classifier does not need to know the details of this data structure.
+using CHAR_FEATURES = char*;
+
+/*----------------------------------------------------------------------
+ Macros for defining the parameters of a new features
+----------------------------------------------------------------------*/
+#define StartParamDesc(Name) const PARAM_DESC Name[] = {
+#define DefineParam(Circular, NonEssential, Min, Max) \
+ {Circular, \
+ NonEssential, \
+ Min, \
+ Max, \
+ (Max) - (Min), \
+ (((Max) - (Min)) / 2.0), \
+ (((Max) + (Min)) / 2.0)},
+
+#define EndParamDesc };
+
+/*----------------------------------------------------------------------
+Macro for describing a new feature. The parameters of the macro
+are as follows:
+
+DefineFeature (Name, NumLinear, NumCircular, ShortName, ParamName)
+----------------------------------------------------------------------*/
+#define DefineFeature(Name, NL, NC, SN, PN) \
+ const FEATURE_DESC_STRUCT Name = {((NL) + (NC)), SN, PN};
+
+/*----------------------------------------------------------------------
+ Generic routines that work for all feature types
+----------------------------------------------------------------------*/
+bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature);
+
+TESS_API
+void FreeFeature(FEATURE Feature);
+
+TESS_API
+void FreeFeatureSet(FEATURE_SET FeatureSet);
+
+TESS_API
+FEATURE NewFeature(const FEATURE_DESC_STRUCT* FeatureDesc);
+
+FEATURE_SET NewFeatureSet(int NumFeatures);
+
+FEATURE_SET ReadFeatureSet(FILE* File, const FEATURE_DESC_STRUCT* FeatureDesc);
+
+void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/outfeat.cpp b/tesseract/src/classify/outfeat.cpp
new file mode 100644
index 00000000..f4746372
--- /dev/null
+++ b/tesseract/src/classify/outfeat.cpp
@@ -0,0 +1,168 @@
+/******************************************************************************
+ ** Filename: outfeat.c
+ ** Purpose: Definition of outline-features.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "outfeat.h"
+
+#include "classify.h"
+#include "featdefs.h"
+#include "mfoutline.h"
+#include "ocrfeatures.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+
+/**
+ * Convert each segment in the outline to a feature
+ * and return the features.
+ * @param Blob blob to extract pico-features from
+ * @return Outline-features for Blob.
+ * @note Globals: none
+ */
+FEATURE_SET Classify::ExtractOutlineFeatures(TBLOB *Blob) {
+  LIST Outlines;
+  LIST RemainingOutlines;
+  MFOUTLINE Outline;
+  FEATURE_SET FeatureSet;
+  float XScale, YScale;
+
+  FeatureSet = NewFeatureSet(MAX_OUTLINE_FEATURES);
+  // A null blob yields an empty (but valid) feature set.
+  if (Blob == nullptr)
+    return FeatureSet;
+
+  Outlines = ConvertBlob(Blob);
+
+  NormalizeOutlines(Outlines, &XScale, &YScale);
+  RemainingOutlines = Outlines;
+  iterate(RemainingOutlines) {
+    // Parenthesize the first_node call so the cast is well-formed
+    // whether first_node is a macro or a function.
+    Outline = static_cast<MFOUTLINE>(first_node(RemainingOutlines));
+    ConvertToOutlineFeatures(Outline, FeatureSet);
+  }
+  // Baseline normalization fixes y but not x; recenter x afterwards.
+  if (classify_norm_method == baseline)
+    NormalizeOutlineX(FeatureSet);
+  FreeOutlines(Outlines);
+  return FeatureSet;
+} /* ExtractOutlineFeatures */
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine computes the midpoint between Start and
+ * End to obtain the x,y position of the outline-feature. It
+ * also computes the direction from Start to End as the
+ * direction of the outline-feature and the distance from
+ * Start to End as the length of the outline-feature.
+ * This feature is then
+ * inserted into the next feature slot in FeatureSet.
+ * @param Start starting point of outline-feature
+ * @param End ending point of outline-feature
+ * @param FeatureSet set to add outline-feature to
+ */
+void AddOutlineFeatureToSet(FPOINT *Start,
+                            FPOINT *End,
+                            FEATURE_SET FeatureSet) {
+  // Describe the segment by its direction, midpoint and length.
+  FEATURE Feature = NewFeature(&OutlineFeatDesc);
+  Feature->Params[OutlineFeatDir] = NormalizedAngleFrom(Start, End, 1.0);
+  Feature->Params[OutlineFeatX] = AverageOf(Start->x, End->x);
+  Feature->Params[OutlineFeatY] = AverageOf(Start->y, End->y);
+  Feature->Params[OutlineFeatLength] = DistanceBetween(*Start, *End);
+  AddFeature(FeatureSet, Feature);
+} /* AddOutlineFeatureToSet */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine steps converts each section in the specified
+ * outline to a feature described by its x,y position, length
+ * and angle.
+ * Results are returned in FeatureSet.
+ * @param Outline outline to extract outline-features from
+ * @param FeatureSet set of features to add outline-features to
+ */
+void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet) {
+  MFOUTLINE Next;
+  MFOUTLINE First;
+  FPOINT FeatureStart;
+  FPOINT FeatureEnd;
+
+  // A degenerate outline has no usable edges.
+  if (DegenerateOutline (Outline))
+    return;
+
+  // Walk every edge of the (circular) outline exactly once.
+  First = Outline;
+  Next = First;
+  do {
+    FeatureStart = PointAt(Next)->Point;
+    Next = NextPointAfter(Next);
+
+    /* note that an edge is hidden if the ending point of the edge is
+       marked as hidden. This situation happens because the order of
+       the outlines is reversed when they are converted from the old
+       format. In the old format, a hidden edge is marked by the
+       starting point for that edge. */
+    if (!PointAt(Next)->Hidden) {
+      FeatureEnd = PointAt(Next)->Point;
+      AddOutlineFeatureToSet(&FeatureStart, &FeatureEnd, FeatureSet);
+    }
+  }
+  while (Next != First);
+} /* ConvertToOutlineFeatures */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine computes the weighted average x position
+ * over all of the outline-features in FeatureSet and then
+ * renormalizes the outline-features to force this average
+ * to be the x origin (i.e. x=0).
+ * FeatureSet is changed.
+ * @param FeatureSet outline-features to be normalized
+ */
+void NormalizeOutlineX(FEATURE_SET FeatureSet) {
+  int i;
+  FEATURE Feature;
+  float Length;
+  float TotalX = 0.0;
+  float TotalWeight = 0.0;
+  float Origin;
+
+  if (FeatureSet->NumFeatures <= 0)
+    return;
+
+  // Compute the length-weighted mean x position of all features.
+  for (i = 0; i < FeatureSet->NumFeatures; i++) {
+    Feature = FeatureSet->Features[i];
+    Length = Feature->Params[OutlineFeatLength];
+    TotalX += Feature->Params[OutlineFeatX] * Length;
+    TotalWeight += Length;
+  }
+  // Guard against all-zero-length features: there is no meaningful
+  // centroid, and dividing would poison every x with inf/NaN.
+  if (TotalWeight <= 0.0)
+    return;
+  Origin = TotalX / TotalWeight;
+
+  // Shift every feature so the weighted mean x becomes the origin.
+  for (i = 0; i < FeatureSet->NumFeatures; i++) {
+    Feature = FeatureSet->Features[i];
+    Feature->Params[OutlineFeatX] -= Origin;
+  }
+} /* NormalizeOutlineX */
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/outfeat.h b/tesseract/src/classify/outfeat.h
new file mode 100644
index 00000000..eefde2e1
--- /dev/null
+++ b/tesseract/src/classify/outfeat.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+ ** Filename: outfeat.h
+ ** Purpose: Definition of outline features.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef OUTFEAT_H
+#define OUTFEAT_H
+
+#include "ocrfeatures.h"
+#include "fpoint.h"
+#include "mfoutline.h"
+
+namespace tesseract {
+
+typedef enum {
+ OutlineFeatX,
+ OutlineFeatY,
+ OutlineFeatLength,
+ OutlineFeatDir
+} OUTLINE_FEAT_PARAM_NAME;
+
+#define MAX_OUTLINE_FEATURES (100)
+
+/*---------------------------------------------------------------------------
+ Privat Function Prototypes
+----------------------------------------------------------------------------*/
+void AddOutlineFeatureToSet(FPOINT *Start,
+ FPOINT *End,
+ FEATURE_SET FeatureSet);
+
+void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet);
+
+void NormalizeOutlineX(FEATURE_SET FeatureSet);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/picofeat.cpp b/tesseract/src/classify/picofeat.cpp
new file mode 100644
index 00000000..17f5e66d
--- /dev/null
+++ b/tesseract/src/classify/picofeat.cpp
@@ -0,0 +1,264 @@
+/******************************************************************************
+ ** Filename: picofeat.c
+ ** Purpose: Definition of pico-features.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include "picofeat.h"
+
+#include "classify.h"
+#include "featdefs.h"
+#include "fpoint.h"
+#include "mfoutline.h"
+#include "ocrfeatures.h"
+#include "params.h"
+#include "trainingsample.h"
+
+#include <cmath>
+#include <cstdio>
+
+namespace tesseract {
+
+/*---------------------------------------------------------------------------
+ Variables
+----------------------------------------------------------------------------*/
+
+double_VAR(classify_pico_feature_length, 0.05, "Pico Feature Length");
+
+/*---------------------------------------------------------------------------
+ Private Function Prototypes
+----------------------------------------------------------------------------*/
+void ConvertSegmentToPicoFeat(FPOINT *Start,
+ FPOINT *End,
+ FEATURE_SET FeatureSet);
+
+void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet);
+
+void NormalizePicoX(FEATURE_SET FeatureSet);
+
+/*----------------------------------------------------------------------------
+ Public Code
+----------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/**
+ * Operation: Dummy for now.
+ *
+ * Globals:
+ * - classify_norm_method normalization method currently specified
+ * @param Blob blob to extract pico-features from
+ * @return Pico-features for Blob.
+ */
+FEATURE_SET Classify::ExtractPicoFeatures(TBLOB *Blob) {
+ LIST Outlines;
+ LIST RemainingOutlines;
+ MFOUTLINE Outline;
+ FEATURE_SET FeatureSet;
+ float XScale, YScale;
+
+ FeatureSet = NewFeatureSet(MAX_PICO_FEATURES);
+ Outlines = ConvertBlob(Blob);
+ NormalizeOutlines(Outlines, &XScale, &YScale);
+ RemainingOutlines = Outlines;
+ iterate(RemainingOutlines) {
+ Outline = static_cast<MFOUTLINE>first_node (RemainingOutlines);
+ ConvertToPicoFeatures2(Outline, FeatureSet);
+ }
+ if (classify_norm_method == baseline)
+ NormalizePicoX(FeatureSet);
+ FreeOutlines(Outlines);
+ return (FeatureSet);
+
+} /* ExtractPicoFeatures */
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine converts an entire segment of an outline
+ * into a set of pico features which are added to
+ * FeatureSet. The length of the segment is rounded to the
+ * nearest whole number of pico-features. The pico-features
+ * are spaced evenly over the entire segment.
+ * Results are placed in FeatureSet.
+ * Globals:
+ * - classify_pico_feature_length length of a single pico-feature
+ * @param Start starting point of pico-feature
+ * @param End ending point of pico-feature
+ * @param FeatureSet set to add pico-feature to
+ */
+void ConvertSegmentToPicoFeat(FPOINT *Start,
+ FPOINT *End,
+ FEATURE_SET FeatureSet) {
+ FEATURE Feature;
+ float Angle;
+ float Length;
+ int NumFeatures;
+ FPOINT Center;
+ FPOINT Delta;
+ int i;
+
+ Angle = NormalizedAngleFrom (Start, End, 1.0);
+ Length = DistanceBetween (*Start, *End);
+ NumFeatures = static_cast<int>(floor (Length / classify_pico_feature_length + 0.5));
+ if (NumFeatures < 1)
+ NumFeatures = 1;
+
+ /* compute vector for one pico feature */
+ Delta.x = XDelta (*Start, *End) / NumFeatures;
+ Delta.y = YDelta (*Start, *End) / NumFeatures;
+
+ /* compute position of first pico feature */
+ Center.x = Start->x + Delta.x / 2.0;
+ Center.y = Start->y + Delta.y / 2.0;
+
+ /* compute each pico feature in segment and add to feature set */
+ for (i = 0; i < NumFeatures; i++) {
+ Feature = NewFeature (&PicoFeatDesc);
+ Feature->Params[PicoFeatDir] = Angle;
+ Feature->Params[PicoFeatX] = Center.x;
+ Feature->Params[PicoFeatY] = Center.y;
+ AddFeature(FeatureSet, Feature);
+
+ Center.x += Delta.x;
+ Center.y += Delta.y;
+ }
+} /* ConvertSegmentToPicoFeat */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine steps through the specified outline and cuts it
+ * up into pieces of equal length. These pieces become the
+ * desired pico-features. Each segment in the outline
+ * is converted into an integral number of pico-features.
+ * Results are returned in FeatureSet.
+ *
+ * Globals:
+ * - classify_pico_feature_length length of features to be extracted
+ * @param Outline outline to extract micro-features from
+ * @param FeatureSet set of features to add pico-features to
+ */
+void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet) {
+ MFOUTLINE Next;
+ MFOUTLINE First;
+ MFOUTLINE Current;
+
+ if (DegenerateOutline(Outline))
+ return;
+
+ First = Outline;
+ Current = First;
+ Next = NextPointAfter(Current);
+ do {
+ /* note that an edge is hidden if the ending point of the edge is
+ marked as hidden. This situation happens because the order of
+ the outlines is reversed when they are converted from the old
+ format. In the old format, a hidden edge is marked by the
+ starting point for that edge. */
+ if (!(PointAt(Next)->Hidden))
+ ConvertSegmentToPicoFeat (&(PointAt(Current)->Point),
+ &(PointAt(Next)->Point), FeatureSet);
+
+ Current = Next;
+ Next = NextPointAfter(Current);
+ }
+ while (Current != First);
+
+} /* ConvertToPicoFeatures2 */
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * This routine computes the average x position over all
+ * of the pico-features in FeatureSet and then renormalizes
+ * the pico-features to force this average to be the x origin
+ * (i.e. x=0).
+ * FeatureSet is changed.
+ * @param FeatureSet pico-features to be normalized
+ */
+void NormalizePicoX(FEATURE_SET FeatureSet) {
+ int i;
+ FEATURE Feature;
+ float Origin = 0.0;
+
+ for (i = 0; i < FeatureSet->NumFeatures; i++) {
+ Feature = FeatureSet->Features[i];
+ Origin += Feature->Params[PicoFeatX];
+ }
+ Origin /= FeatureSet->NumFeatures;
+
+ for (i = 0; i < FeatureSet->NumFeatures; i++) {
+ Feature = FeatureSet->Features[i];
+ Feature->Params[PicoFeatX] -= Origin;
+ }
+} /* NormalizePicoX */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * @param blob blob to extract features from
+ * @param fx_info
+ * @return Integer character-normalized features for blob.
+ */
+FEATURE_SET Classify::ExtractIntCNFeatures(
+ const TBLOB& blob, const INT_FX_RESULT_STRUCT& fx_info) {
+ INT_FX_RESULT_STRUCT local_fx_info(fx_info);
+ std::vector<INT_FEATURE_STRUCT> bl_features;
+ tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample(
+ blob, false, &local_fx_info, &bl_features);
+ if (sample == nullptr) return nullptr;
+
+ uint32_t num_features = sample->num_features();
+ const INT_FEATURE_STRUCT* features = sample->features();
+ FEATURE_SET feature_set = NewFeatureSet(num_features);
+ for (uint32_t f = 0; f < num_features; ++f) {
+ FEATURE feature = NewFeature(&IntFeatDesc);
+
+ feature->Params[IntX] = features[f].X;
+ feature->Params[IntY] = features[f].Y;
+ feature->Params[IntDir] = features[f].Theta;
+ AddFeature(feature_set, feature);
+ }
+ delete sample;
+
+ return feature_set;
+} /* ExtractIntCNFeatures */
+
+/*---------------------------------------------------------------------------*/
+/**
+ * @param blob blob to extract features from
+ * @param fx_info
+ * @return Geometric (top/bottom/width) features for blob.
+ */
+FEATURE_SET Classify::ExtractIntGeoFeatures(
+ const TBLOB& blob, const INT_FX_RESULT_STRUCT& fx_info) {
+ INT_FX_RESULT_STRUCT local_fx_info(fx_info);
+ std::vector<INT_FEATURE_STRUCT> bl_features;
+ tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample(
+ blob, false, &local_fx_info, &bl_features);
+ if (sample == nullptr) return nullptr;
+
+ FEATURE_SET feature_set = NewFeatureSet(1);
+ FEATURE feature = NewFeature(&IntFeatDesc);
+
+ feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
+ feature->Params[GeoTop] = sample->geo_feature(GeoTop);
+ feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
+ AddFeature(feature_set, feature);
+ delete sample;
+
+ return feature_set;
+} /* ExtractIntGeoFeatures */
+
+} // namespace tesseract.
diff --git a/tesseract/src/classify/picofeat.h b/tesseract/src/classify/picofeat.h
new file mode 100644
index 00000000..d5e7786e
--- /dev/null
+++ b/tesseract/src/classify/picofeat.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+ ** Filename: picofeat.h
+ ** Purpose: Definition of pico features.
+ ** Author: Dan Johnson
+ ** History: 9/4/90, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef PICOFEAT_H
+#define PICOFEAT_H
+
+#include "ocrfeatures.h"
+#include "params.h"
+
+namespace tesseract {
+
+// Enum for the order/type of params in IntFeatDesc.
+enum IntParams {
+ IntX, // x-position (0-255).
+ IntY, // y-position (0-255).
+ IntDir // Direction (0-255, circular).
+};
+
+// Enum for the order/type of params in GeoFeatDesc.
+enum GeoParams {
+ GeoBottom, // Bounding box bottom in baseline space (0-255).
+ GeoTop, // Bounding box top in baseline space (0-255).
+ GeoWidth, // Bounding box width in baseline space (0-255).
+
+ GeoCount // Number of geo features.
+};
+
+typedef enum { PicoFeatY, PicoFeatDir, PicoFeatX } PICO_FEAT_PARAM_NAME;
+
+#define MAX_PICO_FEATURES (1000)
+
+/*---------------------------------------------------------------------------
+ Variables
+----------------------------------------------------------------------------*/
+
+extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length");
+
+/**----------------------------------------------------------------------------
+ Public Function Prototypes
+----------------------------------------------------------------------------**/
+#define GetPicoFeatureLength() (PicoFeatureLength)
+
+/**----------------------------------------------------------------------------
+ Global Data Definitions and Declarations
+----------------------------------------------------------------------------**/
+extern TESS_API float PicoFeatureLength;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/protos.cpp b/tesseract/src/classify/protos.cpp
new file mode 100644
index 00000000..5cbe4b2e
--- /dev/null
+++ b/tesseract/src/classify/protos.cpp
@@ -0,0 +1,178 @@
+/******************************************************************************
+ *
+ * File: protos.cpp (Formerly protos.c)
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+#define _USE_MATH_DEFINES // for M_PI
+
+#include "protos.h"
+
+#include "tprintf.h"
+#include "classify.h"
+#include "params.h"
+#include "intproto.h"
+
+#include <cmath> // for M_PI
+#include <cstdio>
+
+namespace tesseract {
+
+#define PROTO_INCREMENT 32
+#define CONFIG_INCREMENT 16
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s
+----------------------------------------------------------------------*/
+/**
+ * @name AddConfigToClass
+ *
+ * Add a new config to this class. Malloc new space and copy the
+ * old configs if necessary. Return the config id for the new config.
+ *
+ * @param Class The class to add to
+ */
+int AddConfigToClass(CLASS_TYPE Class) {
+ int NewNumConfigs;
+ int NewConfig;
+ int MaxNumProtos;
+ BIT_VECTOR Config;
+
+ MaxNumProtos = Class->MaxNumProtos;
+ ASSERT_HOST(MaxNumProtos <= MAX_NUM_PROTOS);
+
+ if (Class->NumConfigs >= Class->MaxNumConfigs) {
+ /* add configs in CONFIG_INCREMENT chunks at a time */
+ NewNumConfigs = (((Class->MaxNumConfigs + CONFIG_INCREMENT) /
+ CONFIG_INCREMENT) * CONFIG_INCREMENT);
+
+ Class->Configurations =
+ static_cast<CONFIGS>(realloc (Class->Configurations,
+ sizeof (BIT_VECTOR) * NewNumConfigs));
+
+ Class->MaxNumConfigs = NewNumConfigs;
+ }
+ NewConfig = Class->NumConfigs++;
+ Config = NewBitVector(MAX_NUM_PROTOS);
+ Class->Configurations[NewConfig] = Config;
+ zero_all_bits (Config, WordsInVectorOfSize(MAX_NUM_PROTOS));
+
+ return (NewConfig);
+}
+
+
+/**
+ * @name AddProtoToClass
+ *
+ * Add a new proto to this class. Malloc new space and copy the
+ * old protos if necessary. Return the proto id for the new proto.
+ *
+ * @param Class The class to add to
+ */
+int AddProtoToClass(CLASS_TYPE Class) {
+ if (Class->NumProtos >= Class->MaxNumProtos) {
+ /* add protos in PROTO_INCREMENT chunks at a time */
+ int NewNumProtos = (((Class->MaxNumProtos + PROTO_INCREMENT) /
+ PROTO_INCREMENT) * PROTO_INCREMENT);
+
+ Class->Prototypes = static_cast<PROTO>(realloc (Class->Prototypes,
+ sizeof (PROTO_STRUCT) *
+ NewNumProtos));
+
+ Class->MaxNumProtos = NewNumProtos;
+ ASSERT_HOST(NewNumProtos <= MAX_NUM_PROTOS);
+ }
+ int NewProto = Class->NumProtos++;
+ ASSERT_HOST(Class->NumProtos <= MAX_NUM_PROTOS);
+ return (NewProto);
+}
+
+
+/**********************************************************************
+ * FillABC
+ *
+ * Fill in Protos A, B, C fields based on the X, Y, Angle fields.
+ **********************************************************************/
+void FillABC(PROTO Proto) {
+ float Slope, Intercept, Normalizer;
+
+ Slope = tan(Proto->Angle * 2.0 * M_PI);
+ Intercept = Proto->Y - Slope * Proto->X;
+ Normalizer = 1.0 / sqrt (Slope * Slope + 1.0);
+ Proto->A = Slope * Normalizer;
+ Proto->B = -Normalizer;
+ Proto->C = Intercept * Normalizer;
+}
+
+
+/**********************************************************************
+ * FreeClass
+ *
+ * Deallocate the memory consumed by the specified class.
+ **********************************************************************/
+void FreeClass(CLASS_TYPE Class) {
+ if (Class) {
+ FreeClassFields(Class);
+ delete Class;
+ }
+}
+
+
+/**********************************************************************
+ * FreeClassFields
+ *
+ * Deallocate the memory consumed by subfields of the specified class.
+ **********************************************************************/
+void FreeClassFields(CLASS_TYPE Class) {
+ int i;
+
+ if (Class) {
+ if (Class->MaxNumProtos > 0) free(Class->Prototypes);
+ if (Class->MaxNumConfigs > 0) {
+ for (i = 0; i < Class->NumConfigs; i++)
+ FreeBitVector (Class->Configurations[i]);
+ free(Class->Configurations);
+ }
+ }
+}
+
+/**********************************************************************
+ * NewClass
+ *
+ * Allocate a new class with enough memory to hold the specified number
+ * of prototypes and configurations.
+ **********************************************************************/
+CLASS_TYPE NewClass(int NumProtos, int NumConfigs) {
+ CLASS_TYPE Class;
+
+ Class = new CLASS_STRUCT;
+
+ if (NumProtos > 0)
+ Class->Prototypes = static_cast<PROTO>(malloc (NumProtos * sizeof (PROTO_STRUCT)));
+
+ if (NumConfigs > 0)
+ Class->Configurations = static_cast<CONFIGS>(malloc (NumConfigs *
+ sizeof (BIT_VECTOR)));
+ Class->MaxNumProtos = NumProtos;
+ Class->MaxNumConfigs = NumConfigs;
+ Class->NumProtos = 0;
+ Class->NumConfigs = 0;
+ return (Class);
+
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/protos.h b/tesseract/src/classify/protos.h
new file mode 100644
index 00000000..ae35b194
--- /dev/null
+++ b/tesseract/src/classify/protos.h
@@ -0,0 +1,107 @@
+/******************************************************************************
+ *
+ * File: protos.h
+ * Author: Mark Seaman, SW Productivity
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#ifndef PROTOS_H
+#define PROTOS_H
+
+#include "bitvec.h"
+#include "params.h"
+#include "unicity_table.h"
+
+#include <tesseract/unichar.h>
+
+namespace tesseract {
+
+using CONFIGS = BIT_VECTOR*;
+
+typedef struct {
+ float A;
+ float B;
+ float C;
+ float X;
+ float Y;
+ float Angle;
+ float Length;
+} PROTO_STRUCT;
+using PROTO = PROTO_STRUCT*;
+
+struct CLASS_STRUCT {
+ CLASS_STRUCT()
+ : NumProtos(0),
+ MaxNumProtos(0),
+ NumConfigs(0),
+ MaxNumConfigs(0),
+ Prototypes(nullptr),
+ Configurations(nullptr) {}
+ int16_t NumProtos;
+ int16_t MaxNumProtos;
+ int16_t NumConfigs;
+ int16_t MaxNumConfigs;
+ PROTO Prototypes;
+ CONFIGS Configurations;
+ UnicityTable<int> font_set;
+};
+using CLASS_TYPE = CLASS_STRUCT*;
+using CLASSES = CLASS_STRUCT*;
+
+/*----------------------------------------------------------------------
+ M a c r o s
+----------------------------------------------------------------------*/
+/**
+ * AddProtoToConfig
+ *
+ * Set a single proto bit in the specified configuration.
+ */
+
+#define AddProtoToConfig(Pid, Config) (SET_BIT(Config, Pid))
+
+/**
+ * ProtoIn
+ *
+ * Choose the selected prototype in this class record. Return the
+ * pointer to it (type PROTO).
+ */
+
+#define ProtoIn(Class, Pid) (&(Class)->Prototypes[Pid])
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s
+----------------------------------------------------------------------*/
+TESS_API
+int AddConfigToClass(CLASS_TYPE Class);
+
+TESS_API
+int AddProtoToClass(CLASS_TYPE Class);
+
+TESS_API
+void FillABC(PROTO Proto);
+
+TESS_API
+void FreeClass(CLASS_TYPE Class);
+
+TESS_API
+void FreeClassFields(CLASS_TYPE Class);
+
+void InitPrototypes();
+
+TESS_API
+CLASS_TYPE NewClass(int NumProtos, int NumConfigs);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/classify/shapeclassifier.cpp b/tesseract/src/classify/shapeclassifier.cpp
new file mode 100644
index 00000000..b1091a53
--- /dev/null
+++ b/tesseract/src/classify/shapeclassifier.cpp
@@ -0,0 +1,234 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: shapeclassifier.cpp
+// Description: Base interface class for classifiers that return a
+// shape index.
+// Author: Ray Smith
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "shapeclassifier.h"
+
+#include "scrollview.h"
+#include "shapetable.h"
+#include "svmnode.h"
+#include "trainingsample.h"
+#include "tprintf.h"
+
+#include "genericvector.h"
+
+namespace tesseract {
+
+// Classifies the given [training] sample, writing to results.
+// See shapeclassifier.h for a full description.
+// Default implementation calls the ShapeRating version.
+int ShapeClassifier::UnicharClassifySample(
+ const TrainingSample& sample, Pix* page_pix, int debug,
+ UNICHAR_ID keep_this, std::vector<UnicharRating>* results) {
+ results->clear();
+ std::vector<ShapeRating> shape_results;
+ int num_shape_results = ClassifySample(sample, page_pix, debug, keep_this,
+ &shape_results);
+ const ShapeTable* shapes = GetShapeTable();
+ GenericVector<int> unichar_map;
+ unichar_map.init_to_size(shapes->unicharset().size(), -1);
+ for (int r = 0; r < num_shape_results; ++r) {
+ shapes->AddShapeToResults(shape_results[r], &unichar_map, results);
+ }
+ return results->size();
+}
+
+// Classifies the given [training] sample, writing to results.
+// See shapeclassifier.h for a full description.
+// Default implementation aborts.
+int ShapeClassifier::ClassifySample(const TrainingSample& sample, Pix* page_pix,
+ int debug, int keep_this,
+ std::vector<ShapeRating>* results) {
+ ASSERT_HOST("Must implement ClassifySample!" == nullptr);
+ return 0;
+}
+
+// Returns the shape that contains unichar_id that has the best result.
+// If result is not nullptr, it is set with the shape_id and rating.
+// Does not need to be overridden if ClassifySample respects the keep_this
+// rule.
+int ShapeClassifier::BestShapeForUnichar(const TrainingSample& sample,
+ Pix* page_pix, UNICHAR_ID unichar_id,
+ ShapeRating* result) {
+ std::vector<ShapeRating> results;
+ const ShapeTable* shapes = GetShapeTable();
+ int num_results = ClassifySample(sample, page_pix, 0, unichar_id, &results);
+ for (int r = 0; r < num_results; ++r) {
+ if (shapes->GetShape(results[r].shape_id).ContainsUnichar(unichar_id)) {
+ if (result != nullptr)
+ *result = results[r];
+ return results[r].shape_id;
+ }
+ }
+ return -1;
+}
+
+// Provides access to the UNICHARSET that this classifier works with.
+// Only needs to be overridden if GetShapeTable() can return nullptr.
+const UNICHARSET& ShapeClassifier::GetUnicharset() const {
+ return GetShapeTable()->unicharset();
+}
+
+#ifndef GRAPHICS_DISABLED
+
+// Visual debugger classifies the given sample, displays the results and
+// solicits user input to display other classifications. Returns when
+// the user has finished with debugging the sample.
+// Probably doesn't need to be overridden if the subclass provides
+// DisplayClassifyAs.
+void ShapeClassifier::DebugDisplay(const TrainingSample& sample,
+ Pix* page_pix,
+ UNICHAR_ID unichar_id) {
+ static ScrollView* terminator = nullptr;
+ if (terminator == nullptr) {
+ terminator = new ScrollView("XIT", 0, 0, 50, 50, 50, 50, true);
+ }
+ ScrollView* debug_win = CreateFeatureSpaceWindow("ClassifierDebug", 0, 0);
+ // Provide a right-click menu to choose the class.
+ auto* popup_menu = new SVMenuNode();
+ popup_menu->AddChild("Choose class to debug", 0, "x", "Class to debug");
+ popup_menu->BuildMenu(debug_win, false);
+ // Display the features in green.
+ const INT_FEATURE_STRUCT* features = sample.features();
+ uint32_t num_features = sample.num_features();
+ for (uint32_t f = 0; f < num_features; ++f) {
+ RenderIntFeature(debug_win, &features[f], ScrollView::GREEN);
+ }
+ debug_win->Update();
+ std::vector<UnicharRating> results;
+ // Debug classification until the user quits.
+ const UNICHARSET& unicharset = GetUnicharset();
+ SVEvent* ev;
+ SVEventType ev_type;
+ do {
+ PointerVector<ScrollView> windows;
+ if (unichar_id >= 0) {
+ tprintf("Debugging class %d = %s\n",
+ unichar_id, unicharset.id_to_unichar(unichar_id));
+ UnicharClassifySample(sample, page_pix, 1, unichar_id, &results);
+ DisplayClassifyAs(sample, page_pix, unichar_id, 1, &windows);
+ } else {
+ tprintf("Invalid unichar_id: %d\n", unichar_id);
+ UnicharClassifySample(sample, page_pix, 1, -1, &results);
+ }
+ if (unichar_id >= 0) {
+ tprintf("Debugged class %d = %s\n",
+ unichar_id, unicharset.id_to_unichar(unichar_id));
+ }
+ tprintf("Right-click in ClassifierDebug window to choose debug class,");
+ tprintf(" Left-click or close window to quit...\n");
+ UNICHAR_ID old_unichar_id;
+ do {
+ old_unichar_id = unichar_id;
+ ev = debug_win->AwaitEvent(SVET_ANY);
+ ev_type = ev->type;
+ if (ev_type == SVET_POPUP) {
+ if (unicharset.contains_unichar(ev->parameter)) {
+ unichar_id = unicharset.unichar_to_id(ev->parameter);
+ } else {
+ tprintf("Char class '%s' not found in unicharset", ev->parameter);
+ }
+ }
+ delete ev;
+ } while (unichar_id == old_unichar_id &&
+ ev_type != SVET_CLICK && ev_type != SVET_DESTROY);
+ } while (ev_type != SVET_CLICK && ev_type != SVET_DESTROY);
+ delete debug_win;
+}
+
+#endif // !GRAPHICS_DISABLED
+
+// Displays classification as the given shape_id. Creates as many windows
+// as it feels fit, using index as a guide for placement. Adds any created
+// windows to the windows output and returns a new index that may be used
+// by any subsequent classifiers. Caller waits for the user to view and
+// then destroys the windows by clearing the vector.
+int ShapeClassifier::DisplayClassifyAs(
+ const TrainingSample& sample, Pix* page_pix,
+ UNICHAR_ID unichar_id, int index,
+ PointerVector<ScrollView>* windows) {
+ // Does nothing in the default implementation.
+ return index;
+}
+
+// Prints debug information on the results.
+void ShapeClassifier::UnicharPrintResults(
+ const char* context, const std::vector<UnicharRating>& results) const {
+ tprintf("%s\n", context);
+ for (int i = 0; i < results.size(); ++i) {
+ tprintf("%g: c_id=%d=%s", results[i].rating, results[i].unichar_id,
+ GetUnicharset().id_to_unichar(results[i].unichar_id));
+ if (!results[i].fonts.empty()) {
+ tprintf(" Font Vector:");
+ for (int f = 0; f < results[i].fonts.size(); ++f) {
+ tprintf(" %d", results[i].fonts[f].fontinfo_id);
+ }
+ }
+ tprintf("\n");
+ }
+}
+void ShapeClassifier::PrintResults(
+ const char* context, const std::vector<ShapeRating>& results) const {
+ tprintf("%s\n", context);
+ for (int i = 0; i < results.size(); ++i) {
+ tprintf("%g:", results[i].rating);
+ if (results[i].joined)
+ tprintf("[J]");
+ if (results[i].broken)
+ tprintf("[B]");
+ tprintf(" %s\n", GetShapeTable()->DebugStr(results[i].shape_id).c_str());
+ }
+}
+
+// Removes any result that has all its unichars covered by a better choice,
+// regardless of font.
+void ShapeClassifier::FilterDuplicateUnichars(
+ std::vector<ShapeRating>* results) const {
+ std::vector<ShapeRating> filtered_results;
+ // Copy results to filtered results and knock out duplicate unichars.
+ const ShapeTable* shapes = GetShapeTable();
+ for (int r = 0; r < results->size(); ++r) {
+ if (r > 0) {
+ const Shape& shape_r = shapes->GetShape((*results)[r].shape_id);
+ int c;
+ for (c = 0; c < shape_r.size(); ++c) {
+ int unichar_id = shape_r[c].unichar_id;
+ int s;
+ for (s = 0; s < r; ++s) {
+ const Shape& shape_s = shapes->GetShape((*results)[s].shape_id);
+ if (shape_s.ContainsUnichar(unichar_id))
+ break; // We found unichar_id.
+ }
+ if (s == r)
+ break; // We didn't find unichar_id.
+ }
+ if (c == shape_r.size())
+ continue; // We found all the unichar ids in previous answers.
+ }
+ filtered_results.push_back((*results)[r]);
+ }
+ *results = filtered_results;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/classify/shapeclassifier.h b/tesseract/src/classify/shapeclassifier.h
new file mode 100644
index 00000000..776880fc
--- /dev/null
+++ b/tesseract/src/classify/shapeclassifier.h
@@ -0,0 +1,121 @@
+///////////////////////////////////////////////////////////////////////
+// File: shapeclassifier.h
+// Description: Base interface class for classifiers that return a
+// shape index.
+// Author: Ray Smith
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
+#define TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
+
+#include <tesseract/unichar.h>
+
+struct Pix;
+
+namespace tesseract {
+
+class ScrollView;
+class UNICHARSET;
+
+template <typename T> class PointerVector;
+struct ShapeRating;
+class ShapeTable;
+class TrainingSample;
+class TrainingSampleSet;
+struct UnicharRating;
+
+// Interface base class for classifiers that produce ShapeRating results.
+class TESS_API ShapeClassifier {
+ public:
+ virtual ~ShapeClassifier() = default;
+
+ // Classifies the given [training] sample, writing to results.
+ // If page_pix is not nullptr, the overriding function may call
+ // sample.GetSamplePix(padding, page_pix) to get an image of the sample
+ // padded (with real image data) by the given padding to extract features
+ // from the image of the character. Other members of TrainingSample:
+ // features(), micro_features(), cn_feature(), geo_feature() may be used
+ // to get the appropriate tesseract features.
+ // If debug is non-zero, then various degrees of classifier dependent debug
+ // information is provided.
+ // If keep_this (a UNICHAR_ID) is >= 0, then the results should always
+ // contain keep_this, and (if possible) anything of intermediate confidence.
+ // (Used for answering "Why didn't it get that right?" questions.) It must
+ // be a UNICHAR_ID as the callers have no clue how to choose the best shape
+ // that may contain a desired answer.
+ // The return value is the number of classes saved in results.
+ // NOTE that overriding functions MUST clear and sort the results by
+ // descending rating unless the classifier is working with a team of such
+ // classifiers.
+  // NOTE: Neither UnicharClassifySample nor ClassifySample is pure virtual,
+  // but at least one must be overridden by a classifier for it to do anything.
+ virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix,
+ int debug, UNICHAR_ID keep_this,
+ std::vector<UnicharRating>* results);
+
+ protected:
+ virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
+ int debug, UNICHAR_ID keep_this,
+ std::vector<ShapeRating>* results);
+
+ public:
+ // Returns the shape that contains unichar_id that has the best result.
+ // If result is not nullptr, it is set with the shape_id and rating.
+ // Returns -1 if ClassifySample fails to provide any result containing
+ // unichar_id. BestShapeForUnichar does not need to be overridden if
+ // ClassifySample respects the keep_this rule.
+ virtual int BestShapeForUnichar(const TrainingSample& sample, Pix* page_pix,
+ UNICHAR_ID unichar_id, ShapeRating* result);
+
+ // Provides access to the ShapeTable that this classifier works with.
+ virtual const ShapeTable* GetShapeTable() const = 0;
+ // Provides access to the UNICHARSET that this classifier works with.
+ // Must be overridden IFF GetShapeTable() returns nullptr.
+ virtual const UNICHARSET& GetUnicharset() const;
+
+ // Visual debugger classifies the given sample, displays the results and
+ // solicits user input to display other classifications. Returns when
+ // the user has finished with debugging the sample.
+ // Probably doesn't need to be overridden if the subclass provides
+ // DisplayClassifyAs.
+ void DebugDisplay(const TrainingSample& sample, Pix* page_pix,
+ UNICHAR_ID unichar_id);
+
+
+ // Displays classification as the given unichar_id. Creates as many windows
+ // as it feels fit, using index as a guide for placement. Adds any created
+ // windows to the windows output and returns a new index that may be used
+ // by any subsequent classifiers. Caller waits for the user to view and
+ // then destroys the windows by clearing the vector.
+ virtual int DisplayClassifyAs(const TrainingSample& sample, Pix* page_pix,
+ UNICHAR_ID unichar_id, int index,
+ PointerVector<ScrollView>* windows);
+
+ // Prints debug information on the results. context is some introductory/title
+ // message.
+ virtual void UnicharPrintResults(
+ const char* context, const std::vector<UnicharRating>& results) const;
+ virtual void PrintResults(const char* context,
+ const std::vector<ShapeRating>& results) const;
+
+ protected:
+ // Removes any result that has all its unichars covered by a better choice,
+ // regardless of font.
+ void FilterDuplicateUnichars(std::vector<ShapeRating>* results) const;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
diff --git a/tesseract/src/classify/shapetable.cpp b/tesseract/src/classify/shapetable.cpp
new file mode 100644
index 00000000..c68f5d82
--- /dev/null
+++ b/tesseract/src/classify/shapetable.cpp
@@ -0,0 +1,727 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: shapetable.cpp
+// Description: Class to map a classifier shape index to unicharset
+// indices and font indices.
+// Author: Ray Smith
+// Created: Tue Nov 02 15:31:32 PDT 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "shapetable.h"
+
+#include "bitvector.h"
+#include "fontinfo.h"
+#include "intfeaturespace.h"
+#include "strngs.h"
+#include "unicharset.h"
+#include "unicity_table.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+// Helper function to get the index of the first result with the required
+// unichar_id. If the results are sorted by rating, this will also be the
+// best result with the required unichar_id.
+// Returns -1 if the unichar_id is not found.
+int ShapeRating::FirstResultWithUnichar(
+ const GenericVector<ShapeRating>& results,
+ const ShapeTable& shape_table,
+ UNICHAR_ID unichar_id) {
+ for (int r = 0; r < results.size(); ++r) {
+ const int shape_id = results[r].shape_id;
+ const Shape& shape = shape_table.GetShape(shape_id);
+ if (shape.ContainsUnichar(unichar_id)) {
+ return r;
+ }
+ }
+ return -1;
+}
+
+// Helper function to get the index of the first result with the required
+// unichar_id. If the results are sorted by rating, this will also be the
+// best result with the required unichar_id.
+// Returns -1 if the unichar_id is not found.
+int UnicharRating::FirstResultWithUnichar(
+ const GenericVector<UnicharRating>& results,
+ UNICHAR_ID unichar_id) {
+ for (int r = 0; r < results.size(); ++r) {
+ if (results[r].unichar_id == unichar_id)
+ return r;
+ }
+ return -1;
+}
+
+// Writes to the given file. Returns false in case of error.
+bool UnicharAndFonts::Serialize(FILE* fp) const {
+ return tesseract::Serialize(fp, &unichar_id) && font_ids.Serialize(fp);
+}
+
+// Reads from the given file. Returns false in case of error.
+bool UnicharAndFonts::DeSerialize(TFile* fp) {
+ return fp->DeSerialize(&unichar_id) && font_ids.DeSerialize(fp);
+}
+
+// Sort function to sort a pair of UnicharAndFonts by unichar_id.
+int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
+ const auto* p1 = static_cast<const UnicharAndFonts*>(v1);
+ const auto* p2 = static_cast<const UnicharAndFonts*>(v2);
+ return p1->unichar_id - p2->unichar_id;
+}
+
+// Writes to the given file. Returns false in case of error.
+bool Shape::Serialize(FILE* fp) const {
+ uint8_t sorted = unichars_sorted_;
+ return tesseract::Serialize(fp, &sorted) && unichars_.SerializeClasses(fp);
+}
+
+// Reads from the given file. Returns false in case of error.
+bool Shape::DeSerialize(TFile* fp) {
+ uint8_t sorted;
+ if (!fp->DeSerialize(&sorted)) return false;
+ unichars_sorted_ = sorted != 0;
+ return unichars_.DeSerializeClasses(fp);
+}
+
+// Adds a font_id for the given unichar_id. If the unichar_id is not
+// in the shape, it is added.
+void Shape::AddToShape(int unichar_id, int font_id) {
+ for (int c = 0; c < unichars_.size(); ++c) {
+ if (unichars_[c].unichar_id == unichar_id) {
+ // Found the unichar in the shape table.
+ GenericVector<int>& font_list = unichars_[c].font_ids;
+ for (int f = 0; f < font_list.size(); ++f) {
+ if (font_list[f] == font_id)
+ return; // Font is already there.
+ }
+ font_list.push_back(font_id);
+ return;
+ }
+ }
+ // Unichar_id is not in shape, so add it to shape.
+ unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
+ unichars_sorted_ = unichars_.size() <= 1;
+}
+
+// Adds everything in other to this.
+void Shape::AddShape(const Shape& other) {
+ for (int c = 0; c < other.unichars_.size(); ++c) {
+ for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
+ AddToShape(other.unichars_[c].unichar_id,
+ other.unichars_[c].font_ids[f]);
+ }
+ }
+ unichars_sorted_ = unichars_.size() <= 1;
+}
+
+// Returns true if the shape contains the given unichar_id, font_id pair.
+bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
+ for (int c = 0; c < unichars_.size(); ++c) {
+ if (unichars_[c].unichar_id == unichar_id) {
+ // Found the unichar, so look for the font.
+ auto &font_list = unichars_[c].font_ids;
+ for (int f = 0; f < font_list.size(); ++f) {
+ if (font_list[f] == font_id)
+ return true;
+ }
+ return false;
+ }
+ }
+ return false;
+}
+
+// Returns true if the shape contains the given unichar_id, ignoring font.
+bool Shape::ContainsUnichar(int unichar_id) const {
+ for (int c = 0; c < unichars_.size(); ++c) {
+ if (unichars_[c].unichar_id == unichar_id) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the shape contains the given font, ignoring unichar_id.
+bool Shape::ContainsFont(int font_id) const {
+ for (int c = 0; c < unichars_.size(); ++c) {
+ auto &font_list = unichars_[c].font_ids;
+ for (int f = 0; f < font_list.size(); ++f) {
+ if (font_list[f] == font_id)
+ return true;
+ }
+ }
+ return false;
+}
+// Returns true if the shape contains the given font properties, ignoring
+// unichar_id.
+bool Shape::ContainsFontProperties(const FontInfoTable& font_table,
+ uint32_t properties) const {
+ for (int c = 0; c < unichars_.size(); ++c) {
+ auto &font_list = unichars_[c].font_ids;
+ for (int f = 0; f < font_list.size(); ++f) {
+ if (font_table.get(font_list[f]).properties == properties)
+ return true;
+ }
+ }
+ return false;
+}
+// Returns true if the shape contains multiple different font properties,
+// ignoring unichar_id.
+bool Shape::ContainsMultipleFontProperties(
+ const FontInfoTable& font_table) const {
+ uint32_t properties = font_table.get(unichars_[0].font_ids[0]).properties;
+ for (int c = 0; c < unichars_.size(); ++c) {
+ auto &font_list = unichars_[c].font_ids;
+ for (int f = 0; f < font_list.size(); ++f) {
+ if (font_table.get(font_list[f]).properties != properties)
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if this shape is equal to other (ignoring order of unichars
+// and fonts).
+bool Shape::operator==(const Shape& other) const {
+ return IsSubsetOf(other) && other.IsSubsetOf(*this);
+}
+
+// Returns true if this is a subset (including equal) of other.
+bool Shape::IsSubsetOf(const Shape& other) const {
+ for (int c = 0; c < unichars_.size(); ++c) {
+ int unichar_id = unichars_[c].unichar_id;
+ const GenericVector<int>& font_list = unichars_[c].font_ids;
+ for (int f = 0; f < font_list.size(); ++f) {
+ if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
+ return false;
+ }
+ }
+ return true;
+}
+
+// Returns true if the lists of unichar ids are the same in this and other,
+// ignoring fonts.
+// NOT const, as it will sort the unichars on demand.
+bool Shape::IsEqualUnichars(Shape* other) {
+ if (unichars_.size() != other->unichars_.size()) return false;
+ if (!unichars_sorted_) SortUnichars();
+ if (!other->unichars_sorted_) other->SortUnichars();
+ for (int c = 0; c < unichars_.size(); ++c) {
+ if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
+ return false;
+ }
+ return true;
+}
+
+// Sorts the unichars_ vector by unichar.
+void Shape::SortUnichars() {
+ unichars_.sort(UnicharAndFonts::SortByUnicharId);
+ unichars_sorted_ = true;
+}
+
+ShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {
+}
+ShapeTable::ShapeTable(const UNICHARSET& unicharset)
+ : unicharset_(&unicharset), num_fonts_(0) {
+}
+
+// Writes to the given file. Returns false in case of error.
+bool ShapeTable::Serialize(FILE* fp) const {
+ return shape_table_.Serialize(fp);
+}
+
+// Reads from the given file. Returns false in case of error.
+bool ShapeTable::DeSerialize(TFile* fp) {
+ if (!shape_table_.DeSerialize(fp)) return false;
+ num_fonts_ = 0;
+ return true;
+}
+
+// Returns the number of fonts used in this ShapeTable, computing it if
+// necessary.
+int ShapeTable::NumFonts() const {
+ if (num_fonts_ <= 0) {
+ for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
+ const Shape& shape = *shape_table_[shape_id];
+ for (int c = 0; c < shape.size(); ++c) {
+ for (int f = 0; f < shape[c].font_ids.size(); ++f) {
+ if (shape[c].font_ids[f] >= num_fonts_)
+ num_fonts_ = shape[c].font_ids[f] + 1;
+ }
+ }
+ }
+ }
+ return num_fonts_;
+}
+
+// Re-indexes the class_ids in the shapetable according to the given map.
+// Useful in conjunction with set_unicharset.
+void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) {
+ for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
+ Shape* shape = shape_table_[shape_id];
+ for (int c = 0; c < shape->size(); ++c) {
+ shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
+ }
+ }
+}
+
+// Returns a string listing the classes/fonts in a shape.
+STRING ShapeTable::DebugStr(int shape_id) const {
+ if (shape_id < 0 || shape_id >= shape_table_.size())
+ return STRING("INVALID_UNICHAR_ID");
+ const Shape& shape = GetShape(shape_id);
+ STRING result;
+ result.add_str_int("Shape", shape_id);
+ if (shape.size() > 100) {
+ result.add_str_int(" Num unichars=", shape.size());
+ return result;
+ }
+ for (int c = 0; c < shape.size(); ++c) {
+ result.add_str_int(" c_id=", shape[c].unichar_id);
+ result += "=";
+ result += unicharset_->id_to_unichar(shape[c].unichar_id);
+ if (shape.size() < 10) {
+ result.add_str_int(", ", shape[c].font_ids.size());
+ result += " fonts =";
+ int num_fonts = shape[c].font_ids.size();
+ if (num_fonts > 10) {
+ result.add_str_int(" ", shape[c].font_ids[0]);
+ result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]);
+ } else {
+ for (int f = 0; f < num_fonts; ++f) {
+ result.add_str_int(" ", shape[c].font_ids[f]);
+ }
+ }
+ }
+ }
+ return result;
+}
+
+// Returns a debug string summarizing the table.
+STRING ShapeTable::SummaryStr() const {
+ int max_unichars = 0;
+ int num_multi_shapes = 0;
+ int num_master_shapes = 0;
+ for (int s = 0; s < shape_table_.size(); ++s) {
+ if (MasterDestinationIndex(s) != s) continue;
+ ++num_master_shapes;
+ int shape_size = GetShape(s).size();
+ if (shape_size > 1)
+ ++num_multi_shapes;
+ if (shape_size > max_unichars)
+ max_unichars = shape_size;
+ }
+ STRING result;
+ result.add_str_int("Number of shapes = ", num_master_shapes);
+ result.add_str_int(" max unichars = ", max_unichars);
+ result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
+ return result;
+}
+
+
+// Adds a new shape starting with the given unichar_id and font_id.
+// Returns the assigned index.
+int ShapeTable::AddShape(int unichar_id, int font_id) {
+ int index = shape_table_.size();
+ auto* shape = new Shape;
+ shape->AddToShape(unichar_id, font_id);
+ shape_table_.push_back(shape);
+ num_fonts_ = std::max(num_fonts_, font_id + 1);
+ return index;
+}
+
+// Adds a copy of the given shape unless it is already present.
+// Returns the assigned index or index of existing shape if already present.
+int ShapeTable::AddShape(const Shape& other) {
+ int index;
+ for (index = 0; index < shape_table_.size() &&
+ !(other == *shape_table_[index]); ++index)
+ continue;
+ if (index == shape_table_.size()) {
+ auto* shape = new Shape(other);
+ shape_table_.push_back(shape);
+ }
+ num_fonts_ = 0;
+ return index;
+}
+
+// Removes the shape given by the shape index.
+void ShapeTable::DeleteShape(int shape_id) {
+ delete shape_table_[shape_id];
+ shape_table_[shape_id] = nullptr;
+ shape_table_.remove(shape_id);
+}
+
+// Adds a font_id to the given existing shape index for the given
+// unichar_id. If the unichar_id is not in the shape, it is added.
+void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
+ Shape& shape = *shape_table_[shape_id];
+ shape.AddToShape(unichar_id, font_id);
+ num_fonts_ = std::max(num_fonts_, font_id + 1);
+}
+
+// Adds the given shape to the existing shape with the given index.
+void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
+ Shape& shape = *shape_table_[shape_id];
+ shape.AddShape(other);
+ num_fonts_ = 0;
+}
+
+// Returns the id of the shape that contains the given unichar and font.
+// If not found, returns -1.
+// If font_id < 0, the font_id is ignored and the first shape that matches
+// the unichar_id is returned.
+int ShapeTable::FindShape(int unichar_id, int font_id) const {
+ for (int s = 0; s < shape_table_.size(); ++s) {
+ const Shape& shape = GetShape(s);
+ for (int c = 0; c < shape.size(); ++c) {
+ if (shape[c].unichar_id == unichar_id) {
+ if (font_id < 0)
+ return s; // We don't care about the font.
+ for (int f = 0; f < shape[c].font_ids.size(); ++f) {
+ if (shape[c].font_ids[f] == font_id)
+ return s;
+ }
+ }
+ }
+ }
+ return -1;
+}
+
+// Returns the first unichar_id and font_id in the given shape.
+void ShapeTable::GetFirstUnicharAndFont(int shape_id,
+ int* unichar_id, int* font_id) const {
+ const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
+ *unichar_id = unichar_and_fonts.unichar_id;
+ *font_id = unichar_and_fonts.font_ids[0];
+}
+
+// Expands all the classes/fonts in the shape individually to build
+// a ShapeTable.
+int ShapeTable::BuildFromShape(const Shape& shape,
+ const ShapeTable& master_shapes) {
+ BitVector shape_map(master_shapes.NumShapes());
+ for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
+ for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
+ int c = shape[u_ind].unichar_id;
+ int f = shape[u_ind].font_ids[f_ind];
+ int master_id = master_shapes.FindShape(c, f);
+ if (master_id >= 0) {
+ shape_map.SetBit(master_id);
+ } else if (FindShape(c, f) < 0) {
+ AddShape(c, f);
+ }
+ }
+ }
+ int num_masters = 0;
+ for (int s = 0; s < master_shapes.NumShapes(); ++s) {
+ if (shape_map[s]) {
+ AddShape(master_shapes.GetShape(s));
+ ++num_masters;
+ }
+ }
+ return num_masters;
+}
+
+// Returns true if the shapes are already merged.
+bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
+ return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
+}
+
+// Returns true if any shape contains multiple unichars.
+bool ShapeTable::AnyMultipleUnichars() const {
+ int num_shapes = NumShapes();
+ for (int s1 = 0; s1 < num_shapes; ++s1) {
+ if (MasterDestinationIndex(s1) != s1) continue;
+ if (GetShape(s1).size() > 1)
+ return true;
+ }
+ return false;
+}
+
+// Returns the maximum number of unichars over all shapes.
+int ShapeTable::MaxNumUnichars() const {
+ int max_num_unichars = 0;
+ int num_shapes = NumShapes();
+ for (int s = 0; s < num_shapes; ++s) {
+ if (GetShape(s).size() > max_num_unichars)
+ max_num_unichars = GetShape(s).size();
+ }
+ return max_num_unichars;
+}
+
+
+// Merges shapes with a common unichar over the [start, end) interval.
+// Assumes single unichar per shape.
+void ShapeTable::ForceFontMerges(int start, int end) {
+ for (int s1 = start; s1 < end; ++s1) {
+ if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
+ int unichar_id = GetShape(s1)[0].unichar_id;
+ for (int s2 = s1 + 1; s2 < end; ++s2) {
+ if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
+ unichar_id == GetShape(s2)[0].unichar_id) {
+ MergeShapes(s1, s2);
+ }
+ }
+ }
+ }
+ ShapeTable compacted(*unicharset_);
+ compacted.AppendMasterShapes(*this, nullptr);
+ *this = compacted;
+}
+
+// Returns the number of unichars in the master shape.
+int ShapeTable::MasterUnicharCount(int shape_id) const {
+ int master_id = MasterDestinationIndex(shape_id);
+ return GetShape(master_id).size();
+}
+
+// Returns the sum of the font counts in the master shape.
+int ShapeTable::MasterFontCount(int shape_id) const {
+ int master_id = MasterDestinationIndex(shape_id);
+ const Shape& shape = GetShape(master_id);
+ int font_count = 0;
+ for (int c = 0; c < shape.size(); ++c) {
+ font_count += shape[c].font_ids.size();
+ }
+ return font_count;
+}
+
+// Returns the number of unichars that would result from merging the shapes.
+int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
+ // Do it the easy way for now.
+ int master_id1 = MasterDestinationIndex(shape_id1);
+ int master_id2 = MasterDestinationIndex(shape_id2);
+ Shape combined_shape(*shape_table_[master_id1]);
+ combined_shape.AddShape(*shape_table_[master_id2]);
+ return combined_shape.size();
+}
+
+// Merges two shape_ids, leaving shape_id2 marked as merged.
+void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
+ int master_id1 = MasterDestinationIndex(shape_id1);
+ int master_id2 = MasterDestinationIndex(shape_id2);
+ // Point master_id2 (and all merged shapes) to master_id1.
+ shape_table_[master_id2]->set_destination_index(master_id1);
+ // Add all the shapes of master_id2 to master_id1.
+ shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
+}
+
+// Swaps two shape_ids.
+void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
+ Shape* tmp = shape_table_[shape_id1];
+ shape_table_[shape_id1] = shape_table_[shape_id2];
+ shape_table_[shape_id2] = tmp;
+}
+
+// Returns the destination of this shape, (if merged), taking into account
+// the fact that the destination may itself have been merged.
+int ShapeTable::MasterDestinationIndex(int shape_id) const {
+ int dest_id = shape_table_[shape_id]->destination_index();
+ if (dest_id == shape_id || dest_id < 0)
+ return shape_id; // Is master already.
+ int master_id = shape_table_[dest_id]->destination_index();
+ if (master_id == dest_id || master_id < 0)
+ return dest_id; // Dest is the master and shape_id points to it.
+ master_id = MasterDestinationIndex(master_id);
+ return master_id;
+}
+
+// Returns true if the unichars of either shape are a subset of the other's.
+bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
+ const Shape& shape1 = GetShape(shape_id1);
+ const Shape& shape2 = GetShape(shape_id2);
+ int c1, c2;
+ for (c1 = 0; c1 < shape1.size(); ++c1) {
+ int unichar_id1 = shape1[c1].unichar_id;
+ if (!shape2.ContainsUnichar(unichar_id1))
+ break;
+ }
+ for (c2 = 0; c2 < shape2.size(); ++c2) {
+ int unichar_id2 = shape2[c2].unichar_id;
+ if (!shape1.ContainsUnichar(unichar_id2))
+ break;
+ }
+ return c1 == shape1.size() || c2 == shape2.size();
+}
+
+// Returns true if shape is a subset of the merge, or the merge is a subset of shape.
+bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2,
+ int shape_id) const {
+ const Shape& merge1 = GetShape(merge_id1);
+ const Shape& merge2 = GetShape(merge_id2);
+ const Shape& shape = GetShape(shape_id);
+ int cm1, cm2, cs;
+ for (cs = 0; cs < shape.size(); ++cs) {
+ int unichar_id = shape[cs].unichar_id;
+ if (!merge1.ContainsUnichar(unichar_id) &&
+ !merge2.ContainsUnichar(unichar_id))
+ break; // Shape is not a subset of the merge.
+ }
+ for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
+ int unichar_id1 = merge1[cm1].unichar_id;
+ if (!shape.ContainsUnichar(unichar_id1))
+ break; // Merge is not a subset of shape
+ }
+ for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
+ int unichar_id2 = merge2[cm2].unichar_id;
+ if (!shape.ContainsUnichar(unichar_id2))
+ break; // Merge is not a subset of shape
+ }
+ return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
+}
+
+// Returns true if the unichar sets are equal between the shapes.
+bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
+ const Shape& shape1 = GetShape(shape_id1);
+ const Shape& shape2 = GetShape(shape_id2);
+ for (int c1 = 0; c1 < shape1.size(); ++c1) {
+ int unichar_id1 = shape1[c1].unichar_id;
+ if (!shape2.ContainsUnichar(unichar_id1))
+ return false;
+ }
+ for (int c2 = 0; c2 < shape2.size(); ++c2) {
+ int unichar_id2 = shape2[c2].unichar_id;
+ if (!shape1.ContainsUnichar(unichar_id2))
+ return false;
+ }
+ return true;
+}
+
+// Returns true if the unichar sets are equal between the shapes.
+bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2,
+ int shape_id) const {
+ const Shape& merge1 = GetShape(merge_id1);
+ const Shape& merge2 = GetShape(merge_id2);
+ const Shape& shape = GetShape(shape_id);
+ for (int cs = 0; cs < shape.size(); ++cs) {
+ int unichar_id = shape[cs].unichar_id;
+ if (!merge1.ContainsUnichar(unichar_id) &&
+ !merge2.ContainsUnichar(unichar_id))
+ return false; // Shape has a unichar that appears in neither merge.
+ }
+ for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
+ int unichar_id1 = merge1[cm1].unichar_id;
+ if (!shape.ContainsUnichar(unichar_id1))
+ return false; // Merge has a unichar that is not in shape.
+ }
+ for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
+ int unichar_id2 = merge2[cm2].unichar_id;
+ if (!shape.ContainsUnichar(unichar_id2))
+ return false; // Merge has a unichar that is not in shape.
+ }
+ return true;
+}
+
+// Returns true if there is a common unichar between the shapes.
+bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
+ const Shape& shape1 = GetShape(shape_id1);
+ const Shape& shape2 = GetShape(shape_id2);
+ for (int c1 = 0; c1 < shape1.size(); ++c1) {
+ int unichar_id1 = shape1[c1].unichar_id;
+ if (shape2.ContainsUnichar(unichar_id1))
+ return true;
+ }
+ return false;
+}
+
+// Returns true if there is a common font id between the shapes.
+bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
+ const Shape& shape1 = GetShape(shape_id1);
+ const Shape& shape2 = GetShape(shape_id2);
+ for (int c1 = 0; c1 < shape1.size(); ++c1) {
+ const GenericVector<int>& font_list1 = shape1[c1].font_ids;
+ for (int f = 0; f < font_list1.size(); ++f) {
+ if (shape2.ContainsFont(font_list1[f]))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Appends the master shapes from other to this.
+// If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
+void ShapeTable::AppendMasterShapes(const ShapeTable& other,
+ GenericVector<int>* shape_map) {
+ if (shape_map != nullptr)
+ shape_map->init_to_size(other.NumShapes(), -1);
+ for (int s = 0; s < other.shape_table_.size(); ++s) {
+ if (other.shape_table_[s]->destination_index() < 0) {
+ int index = AddShape(*other.shape_table_[s]);
+ if (shape_map != nullptr)
+ (*shape_map)[s] = index;
+ }
+ }
+}
+
+// Returns the number of master shapes remaining after merging.
+int ShapeTable::NumMasterShapes() const {
+ int num_shapes = 0;
+ for (int s = 0; s < shape_table_.size(); ++s) {
+ if (shape_table_[s]->destination_index() < 0)
+ ++num_shapes;
+ }
+ return num_shapes;
+}
+
+
+// Adds the unichars of the given shape_id to the vector of results. Any
+// unichar_id that is already present just has the fonts added to the
+// font set for that result without adding a new entry in the vector.
+// NOTE: it is assumed that the results are given to this function in order
+// of decreasing rating.
+// The unichar_map vector indicates the index of the results entry containing
+// each unichar, or -1 if the unichar is not yet included in results.
+void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating,
+ GenericVector<int>* unichar_map,
+ std::vector<UnicharRating>* results) const {
+ if (shape_rating.joined) {
+ AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map,
+ results);
+ }
+ if (shape_rating.broken) {
+ AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map,
+ results);
+ }
+ const Shape& shape = GetShape(shape_rating.shape_id);
+ for (int u = 0; u < shape.size(); ++u) {
+ int result_index = AddUnicharToResults(shape[u].unichar_id,
+ shape_rating.rating,
+ unichar_map, results);
+ for (int f = 0; f < shape[u].font_ids.size(); ++f) {
+ (*results)[result_index].fonts.push_back(
+ ScoredFont(shape[u].font_ids[f],
+ IntCastRounded(shape_rating.rating * INT16_MAX)));
+ }
+ }
+}
+
+// Adds the given unichar_id to the results if needed, updating unichar_map
+// and returning the index of unichar in results.
+int ShapeTable::AddUnicharToResults(
+ int unichar_id, float rating, GenericVector<int>* unichar_map,
+ std::vector<UnicharRating>* results) const {
+ int result_index = unichar_map->get(unichar_id);
+ if (result_index < 0) {
+ UnicharRating result(unichar_id, rating);
+ result_index = results->size();
+ results->push_back(result);
+ (*unichar_map)[unichar_id] = result_index;
+ }
+ return result_index;
+}
+
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/shapetable.h b/tesseract/src/classify/shapetable.h
new file mode 100644
index 00000000..5a551401
--- /dev/null
+++ b/tesseract/src/classify/shapetable.h
@@ -0,0 +1,379 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: shapetable.h
+// Description: Class to map a classifier shape index to unicharset
+// indices and font indices.
+// Author: Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
+#define TESSERACT_CLASSIFY_SHAPETABLE_H_
+
+#include "bitvector.h"
+#include "fontinfo.h"
+#include "genericheap.h"
+#include "intmatcher.h"
+
+#include "genericvector.h"
+
+namespace tesseract {
+
+class STRING;
+class UNICHARSET;
+class ShapeTable;
+
+// Simple struct to hold a single classifier unichar selection, a corresponding
+// rating, and a list of appropriate fonts.
+struct UnicharRating {
+ UnicharRating()
+ : unichar_id(0), rating(0.0f), adapted(false), config(0),
+ feature_misses(0) {}
+ UnicharRating(int u, float r)
+ : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
+
+ // Print debug info.
+ void Print() const {
+ tprintf("Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%u,"
+ " %zu fonts\n", unichar_id, rating, adapted, config, feature_misses,
+ fonts.size());
+ }
+
+ // Helper function to get the index of the first result with the required
+ // unichar_id. If the results are sorted by rating, this will also be the
+ // best result with the required unichar_id.
+ // Returns -1 if the unichar_id is not found.
+ static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results,
+ UNICHAR_ID unichar_id);
+
+ // Index into some UNICHARSET table indicates the class of the answer.
+ UNICHAR_ID unichar_id;
+ // Rating from classifier with 1.0 perfect and 0.0 impossible.
+ // Call it a probability if you must.
+ float rating;
+ // True if this result is from the adaptive classifier.
+ bool adapted;
+ // Index of best matching font configuration of result.
+ uint8_t config;
+ // Number of features that were total misses - were liked by no classes.
+ uint16_t feature_misses;
+ // Unsorted collection of fontinfo ids and scores. Note that a raw result
+ // from the IntegerMatch will contain config ids, that require transforming
+ // to fontinfo ids via fontsets and (possibly) shapetable.
+ std::vector<ScoredFont> fonts;
+};
+
+// Classifier result from a low-level classification is an index into some
+// ShapeTable and a rating.
+struct ShapeRating {
+ // NOTE(review): the default constructor sets raw = 0.0f, while the
+ // (shape_id, rating) constructor sets raw = 1.0f — presumably intentional;
+ // confirm before relying on the default.
+ ShapeRating()
+ : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f),
+ joined(false), broken(false) {}
+ ShapeRating(int s, float r)
+ : shape_id(s), rating(r), raw(1.0f), font(0.0f),
+ joined(false), broken(false) {}
+
+ // Helper function to get the index of the first result with the required
+ // unichar_id. If the results are sorted by rating, this will also be the
+ // best result with the required unichar_id.
+ // Returns -1 if the unichar_id is not found.
+ static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results,
+ const ShapeTable& shape_table,
+ UNICHAR_ID unichar_id);
+
+ // Index into some shape table indicates the class of the answer.
+ int shape_id;
+ // Rating from classifier with 1.0 perfect and 0.0 impossible.
+ // Call it a probability if you must.
+ float rating;
+ // Subsidiary rating that a classifier may use internally.
+ float raw;
+ // Subsidiary rating that a classifier may use internally.
+ float font;
+ // Flag indicating that the input may be joined.
+ bool joined;
+ // Flag indicating that the input may be broken (a fragment).
+ bool broken;
+};
+
+// Simple struct to hold an entry for a heap-based priority queue of
+// ShapeRating.
+struct ShapeQueueEntry {
+ ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}
+ ShapeQueueEntry(const ShapeRating& rating, int level0)
+ : result(rating), level(level0) {}
+
+ // Sort by decreasing rating and decreasing level for equal rating.
+ // (With GenericHeap this presumably makes the best rating pop first —
+ // confirm against GenericHeap's ordering convention.)
+ bool operator<(const ShapeQueueEntry& other) const {
+ if (result.rating > other.result.rating) return true;
+ if (result.rating == other.result.rating)
+ return level > other.level;
+ return false;
+ }
+
+ // Output from classifier.
+ ShapeRating result;
+ // Which level in the tree did this come from?
+ int level;
+};
+using ShapeQueue = GenericHeap<ShapeQueueEntry>;
+
+// Simple struct to hold a set of fonts associated with a single unichar-id.
+// A vector of UnicharAndFonts makes a shape.
+struct UnicharAndFonts {
+ UnicharAndFonts() : unichar_id(0) {
+ }
+ UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
+ font_ids.push_back(font_id);
+ }
+
+ // Writes to the given file. Returns false in case of error.
+ bool Serialize(FILE* fp) const;
+ // Reads from the given file. Returns false in case of error.
+ bool DeSerialize(TFile* fp);
+
+ // qsort-style comparison function to sort a pair of UnicharAndFonts by
+ // unichar_id.
+ static int SortByUnicharId(const void* v1, const void* v2);
+
+ // Ids of the fonts in which this unichar was seen during training.
+ GenericVector<int32_t> font_ids;
+ int32_t unichar_id;
+};
+
+// A Shape is a collection of unichar-ids and a list of fonts associated with
+// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
+// a classifiable unit, and represents a group of characters or parts of
+// characters that have a similar or identical shape. Shapes/ShapeTables may
+// be organized hierarchically from identical shapes at the leaves to vaguely
+// similar shapes near the root.
+class TESS_API Shape {
+ public:
+ Shape() : destination_index_(-1) {}
+
+ // Writes to the given file. Returns false in case of error.
+ bool Serialize(FILE* fp) const;
+ // Reads from the given file. Returns false in case of error.
+ bool DeSerialize(TFile* fp);
+
+ int destination_index() const {
+ return destination_index_;
+ }
+ void set_destination_index(int index) {
+ destination_index_ = index;
+ }
+ int size() const {
+ return unichars_.size();
+ }
+ // Returns a UnicharAndFonts entry for the given index, which must be
+ // in the range [0, size()).
+ const UnicharAndFonts& operator[](int index) const {
+ return unichars_[index];
+ }
+ // Sets the unichar_id of the given index to the new unichar_id.
+ void SetUnicharId(int index, int unichar_id) {
+ unichars_[index].unichar_id = unichar_id;
+ }
+ // Adds a font_id for the given unichar_id. If the unichar_id is not
+ // in the shape, it is added.
+ void AddToShape(int unichar_id, int font_id);
+ // Adds everything in other to this.
+ void AddShape(const Shape& other);
+ // Returns true if the shape contains the given unichar_id, font_id pair.
+ bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
+ // Returns true if the shape contains the given unichar_id, ignoring font.
+ bool ContainsUnichar(int unichar_id) const;
+ // Returns true if the shape contains the given font, ignoring unichar_id.
+ bool ContainsFont(int font_id) const;
+ // Returns true if the shape contains the given font properties, ignoring
+ // unichar_id.
+ bool ContainsFontProperties(const FontInfoTable& font_table,
+ uint32_t properties) const;
+ // Returns true if the shape contains multiple different font properties,
+ // ignoring unichar_id.
+ bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const;
+ // Returns true if this shape is equal to other (ignoring order of unichars
+ // and fonts).
+ bool operator==(const Shape& other) const;
+ // Returns true if this is a subset (including equal) of other.
+ bool IsSubsetOf(const Shape& other) const;
+ // Returns true if the lists of unichar ids are the same in this and other,
+ // ignoring fonts.
+ // NOT const, as it will sort the unichars on demand.
+ bool IsEqualUnichars(Shape* other);
+
+ private:
+ // Sorts the unichars_ vector by unichar.
+ void SortUnichars();
+
+ // Flag indicates that the unichars are sorted, allowing faster set
+ // operations with another shape.
+ bool unichars_sorted_ = false;
+ // If this Shape is part of a ShapeTable the destination_index_ is the index
+ // of some other shape in the ShapeTable with which this shape is merged.
+ // -1 means "not merged". Kept consistent with the default constructor,
+ // which also initializes it to -1.
+ int destination_index_ = -1;
+ // Array of unichars, each with a set of fonts. Each unichar has at most
+ // one entry in the vector.
+ GenericVector<UnicharAndFonts> unichars_;
+};
+
+// ShapeTable is a class to encapsulate the triple indirection that is
+// used here.
+// ShapeTable is a vector of shapes.
+// Each shape is a vector of UnicharAndFonts representing the set of unichars
+// that the shape represents.
+// Each UnicharAndFonts also lists the fonts of the unichar_id that were
+// mapped to the shape during training.
+class TESS_API ShapeTable {
+ public:
+ ShapeTable();
+ // The UNICHARSET reference supplied here, or in set_unicharset below must
+ // exist for the entire life of the ShapeTable. It is used only by DebugStr.
+ explicit ShapeTable(const UNICHARSET& unicharset);
+
+ // Writes to the given file. Returns false in case of error.
+ bool Serialize(FILE* fp) const;
+ // Reads from the given file. Returns false in case of error.
+ bool DeSerialize(TFile* fp);
+
+ // Accessors.
+ int NumShapes() const {
+ return shape_table_.size();
+ }
+ const UNICHARSET& unicharset() const {
+ return *unicharset_;
+ }
+ // Returns the number of fonts used in this ShapeTable, computing it if
+ // necessary. (Cached in the mutable num_fonts_ member.)
+ int NumFonts() const;
+ // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
+ // entire life of the ShapeTable.
+ void set_unicharset(const UNICHARSET& unicharset) {
+ unicharset_ = &unicharset;
+ }
+ // Re-indexes the class_ids in the shapetable according to the given map.
+ // Useful in conjunction with set_unicharset.
+ void ReMapClassIds(const GenericVector<int>& unicharset_map);
+ // Returns a string listing the classes/fonts in a shape.
+ STRING DebugStr(int shape_id) const;
+ // Returns a debug string summarizing the table.
+ STRING SummaryStr() const;
+
+ // Adds a new shape starting with the given unichar_id and font_id.
+ // Returns the assigned index.
+ int AddShape(int unichar_id, int font_id);
+ // Adds a copy of the given shape unless it is already present.
+ // Returns the assigned index or index of existing shape if already present.
+ int AddShape(const Shape& other);
+ // Removes the shape given by the shape index. All indices above are changed!
+ void DeleteShape(int shape_id);
+ // Adds a font_id to the given existing shape index for the given
+ // unichar_id. If the unichar_id is not in the shape, it is added.
+ void AddToShape(int shape_id, int unichar_id, int font_id);
+ // Adds the given shape to the existing shape with the given index.
+ void AddShapeToShape(int shape_id, const Shape& other);
+ // Returns the id of the shape that contains the given unichar and font.
+ // If not found, returns -1.
+ // If font_id < 0, the font_id is ignored and the first shape that matches
+ // the unichar_id is returned.
+ int FindShape(int unichar_id, int font_id) const;
+ // Returns the first unichar_id and font_id in the given shape.
+ void GetFirstUnicharAndFont(int shape_id,
+ int* unichar_id, int* font_id) const;
+
+ // Accessors for the Shape with the given shape_id.
+ const Shape& GetShape(int shape_id) const {
+ return *shape_table_[shape_id];
+ }
+ Shape* MutableShape(int shape_id) {
+ return shape_table_[shape_id];
+ }
+
+ // Expands all the classes/fonts in the shape individually to build
+ // a ShapeTable.
+ int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
+
+ // Returns true if the shapes are already merged.
+ bool AlreadyMerged(int shape_id1, int shape_id2) const;
+ // Returns true if any shape contains multiple unichars.
+ bool AnyMultipleUnichars() const;
+ // Returns the maximum number of unichars over all shapes.
+ int MaxNumUnichars() const;
+ // Merges shapes with a common unichar over the [start, end) interval.
+ // Assumes single unichar per shape.
+ void ForceFontMerges(int start, int end);
+ // Returns the number of unichars in the master shape.
+ int MasterUnicharCount(int shape_id) const;
+ // Returns the sum of the font counts in the master shape.
+ int MasterFontCount(int shape_id) const;
+ // Returns the number of unichars that would result from merging the shapes.
+ int MergedUnicharCount(int shape_id1, int shape_id2) const;
+ // Merges two shape_ids, leaving shape_id2 marked as merged.
+ void MergeShapes(int shape_id1, int shape_id2);
+ // Swaps two shape_ids.
+ void SwapShapes(int shape_id1, int shape_id2);
+ // Appends the master shapes from other to this.
+ // Used to create a clean ShapeTable from a merged one, or to create a
+ // copy of a ShapeTable.
+ // If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
+ void AppendMasterShapes(const ShapeTable& other,
+ GenericVector<int>* shape_map);
+ // Returns the number of master shapes remaining after merging.
+ int NumMasterShapes() const;
+ // Returns the destination of this shape, (if merged), taking into account
+ // the fact that the destination may itself have been merged.
+ // For a non-merged shape, returns the input shape_id.
+ int MasterDestinationIndex(int shape_id) const;
+
+ // Returns false if the unichars of neither shape are a subset of the other.
+ bool SubsetUnichar(int shape_id1, int shape_id2) const;
+ // Returns false if the unichars of neither shape are a subset of the other.
+ bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
+ // Returns true if the unichar sets are equal between the shapes.
+ bool EqualUnichars(int shape_id1, int shape_id2) const;
+ bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
+ // Returns true if there is a common unichar between the shapes.
+ bool CommonUnichars(int shape_id1, int shape_id2) const;
+ // Returns true if there is a common font id between the shapes.
+ bool CommonFont(int shape_id1, int shape_id2) const;
+
+ // Adds the unichars of the given shape_id to the vector of results. Any
+ // unichar_id that is already present just has the fonts added to the
+ // font set for that result without adding a new entry in the vector.
+ // NOTE: it is assumed that the results are given to this function in order
+ // of decreasing rating.
+ // The unichar_map vector indicates the index of the results entry containing
+ // each unichar, or -1 if the unichar is not yet included in results.
+ void AddShapeToResults(const ShapeRating& shape_rating,
+ GenericVector<int>* unichar_map,
+ std::vector<UnicharRating>* results) const;
+
+ private:
+ // Adds the given unichar_id to the results if needed, updating unichar_map
+ // and returning the index of unichar in results.
+ int AddUnicharToResults(int unichar_id, float rating,
+ GenericVector<int>* unichar_map,
+ std::vector<UnicharRating>* results) const;
+
+ // Pointer to a provided unicharset used only by the Debugstr member.
+ const UNICHARSET* unicharset_;
+ // Vector of pointers to the Shapes in this ShapeTable.
+ PointerVector<Shape> shape_table_;
+
+ // Cached data calculated on demand.
+ mutable int num_fonts_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_
diff --git a/tesseract/src/classify/tessclassifier.cpp b/tesseract/src/classify/tessclassifier.cpp
new file mode 100644
index 00000000..c7819d66
--- /dev/null
+++ b/tesseract/src/classify/tessclassifier.cpp
@@ -0,0 +1,84 @@
+///////////////////////////////////////////////////////////////////////
+// File: tessclassifier.cpp
+// Description: Tesseract implementation of a ShapeClassifier.
+// Author: Ray Smith
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tessclassifier.h"
+
+#include "classify.h"
+#include "trainingsample.h"
+
+namespace tesseract {
+
+// Classifies the given [training] sample, writing to results.
+// See ShapeClassifier for a full description.
+// Returns the number of results written.
+int TessClassifier::UnicharClassifySample(
+ const TrainingSample& sample, Pix* page_pix, int debug,
+ UNICHAR_ID keep_this, std::vector<UnicharRating>* results) {
+ const int old_matcher_level = classify_->matcher_debug_level;
+ const int old_matcher_flags = classify_->matcher_debug_flags;
+ const int old_classify_level = classify_->classify_debug_level;
+ if (debug) {
+ // Explicitly set values of various control parameters to generate debug
+ // output if required, restoring the old values after classifying.
+ // NOTE(review): 25 is presumably a bitmask of matcher debug flags —
+ // confirm against the matcher_debug_flags documentation.
+ classify_->matcher_debug_level.set_value(2);
+ classify_->matcher_debug_flags.set_value(25);
+ classify_->classify_debug_level.set_value(3);
+ }
+ classify_->CharNormTrainingSample(pruner_only_, keep_this, sample, results);
+ if (debug) {
+ // Restore the saved debug parameter values.
+ classify_->matcher_debug_level.set_value(old_matcher_level);
+ classify_->matcher_debug_flags.set_value(old_matcher_flags);
+ classify_->classify_debug_level.set_value(old_classify_level);
+ }
+ return results->size();
+}
+
+// Provides access to the ShapeTable that this classifier works with.
+// May return nullptr if the underlying Classify has no shape table.
+const ShapeTable* TessClassifier::GetShapeTable() const {
+ return classify_->shape_table();
+}
+// Provides access to the UNICHARSET that this classifier works with.
+// Only needs to be overridden if GetShapeTable() can return nullptr.
+const UNICHARSET& TessClassifier::GetUnicharset() const {
+ return classify_->unicharset;
+}
+
+// Displays classification as the given shape_id. Creates as many windows
+// as it feels fit, using index as a guide for placement. Adds any created
+// windows to the windows output and returns a new index that may be used
+// by any subsequent classifiers. Caller waits for the user to view and
+// then destroys the windows by clearing the vector.
+int TessClassifier::DisplayClassifyAs(
+ const TrainingSample& sample, Pix* page_pix, int unichar_id, int index,
+ PointerVector<ScrollView>* windows) {
+ // With a flat shape table the unichar_id doubles as the shape_id.
+ int shape_id = unichar_id;
+ // TODO(rays) Fix this so it works with both flat and real shapetables.
+ // if (GetShapeTable() != nullptr)
+ // shape_id = BestShapeForUnichar(sample, page_pix, unichar_id, nullptr);
+ if (shape_id < 0) return index;
+ if (UnusedClassIdIn(classify_->PreTrainedTemplates, shape_id)) {
+ tprintf("No built-in templates for class/shape %d\n", shape_id);
+ return index;
+ }
+#ifndef GRAPHICS_DISABLED
+ classify_->ShowBestMatchFor(shape_id, sample.features(),
+ sample.num_features());
+#endif
+ return index;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/tessclassifier.h b/tesseract/src/classify/tessclassifier.h
new file mode 100644
index 00000000..a8b3f753
--- /dev/null
+++ b/tesseract/src/classify/tessclassifier.h
@@ -0,0 +1,72 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: tessclassifier.h
+// Description: Tesseract implementation of a ShapeClassifier.
+// Author: Ray Smith
+// Created: Tue Nov 22 14:10:45 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_
+#define THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_
+
+#include "shapeclassifier.h"
+
+namespace tesseract {
+
+class Classify;
+class TrainingSample;
+
+// Tesseract implementation of a ShapeClassifier.
+// Due to limitations in the content of TrainingSample, this currently
+// only works for the static classifier and only works if the ShapeTable
+// in classify is not nullptr.
+class TESS_API TessClassifier : public ShapeClassifier {
+ public:
+ // Does not take ownership of classify, which must outlive this object.
+ TessClassifier(bool pruner_only, tesseract::Classify* classify)
+ : pruner_only_(pruner_only), classify_(classify) {}
+ ~TessClassifier() override = default;
+
+ // Classifies the given [training] sample, writing to results.
+ // See ShapeClassifier for a full description.
+ int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix,
+ int debug, UNICHAR_ID keep_this,
+ std::vector<UnicharRating>* results) override;
+ // Provides access to the ShapeTable that this classifier works with.
+ const ShapeTable* GetShapeTable() const override;
+ // Provides access to the UNICHARSET that this classifier works with.
+ // Only needs to be overridden if GetShapeTable() can return nullptr.
+ const UNICHARSET& GetUnicharset() const override;
+
+ // Displays classification as the given shape_id. Creates as many windows
+ // as it feels fit, using index as a guide for placement. Adds any created
+ // windows to the windows output and returns a new index that may be used
+ // by any subsequent classifiers. Caller waits for the user to view and
+ // then destroys the windows by clearing the vector.
+ int DisplayClassifyAs(const TrainingSample& sample, Pix* page_pix,
+ int unichar_id, int index,
+ PointerVector<ScrollView>* windows) override;
+
+ private:
+ // Indicates that this classifier is to use just the ClassPruner, or the
+ // full classifier if false.
+ bool pruner_only_;
+ // Borrowed pointer to the actual Tesseract classifier.
+ tesseract::Classify* classify_;
+};
+
+} // namespace tesseract
+
+#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_ */
diff --git a/tesseract/src/classify/trainingsample.cpp b/tesseract/src/classify/trainingsample.cpp
new file mode 100644
index 00000000..003fb97b
--- /dev/null
+++ b/tesseract/src/classify/trainingsample.cpp
@@ -0,0 +1,339 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#define _USE_MATH_DEFINES // for M_PI
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "trainingsample.h"
+
+#include "intfeaturespace.h"
+#include "helpers.h"
+#include "normfeat.h"
+#include "shapetable.h"
+
+#include "allheaders.h"
+
+#include <cmath> // for M_PI
+
+namespace tesseract {
+
+ELISTIZE(TrainingSample)
+
+// Center of randomizing operations: RandomizedCopy scales feature
+// coordinates about this value.
+const int kRandomizingCenter = 128;
+
+// Randomizing factors: y-shifts and scale factors combined by
+// RandomizedCopy to perturb training samples.
+const int TrainingSample::kYShiftValues[kSampleYShiftSize] = {
+ 6, 3, -3, -6, 0
+};
+const double TrainingSample::kScaleValues[kSampleScaleSize] = {
+ 1.0625, 0.9375, 1.0
+};
+
+// Releases the owned feature arrays (allocated with new[]).
+TrainingSample::~TrainingSample() {
+ delete [] features_;
+ delete [] micro_features_;
+}
+
+// WARNING! Serialize/DeSerialize do not save/restore the "cache" data
+// members, which is mostly the mapped features, and the weight.
+// It is assumed these can all be reconstructed from what is saved.
+// Writes to the given file. Returns false in case of error.
+// The layout written here must stay in sync with DeSerialize below.
+bool TrainingSample::Serialize(FILE* fp) const {
+ if (fwrite(&class_id_, sizeof(class_id_), 1, fp) != 1) return false;
+ if (fwrite(&font_id_, sizeof(font_id_), 1, fp) != 1) return false;
+ if (fwrite(&page_num_, sizeof(page_num_), 1, fp) != 1) return false;
+ if (!bounding_box_.Serialize(fp)) return false;
+ if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
+ if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
+ return false;
+ if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1)
+ return false;
+ if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_)
+ return false;
+ if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_,
+ fp) != num_micro_features_)
+ return false;
+ // cn_feature_ and geo_feature_ are fixed-size arrays, written last.
+ if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
+ kNumCNParams) return false;
+ if (fwrite(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount)
+ return false;
+ return true;
+}
+
+// Creates from the given file. Returns nullptr in case of error.
+// If swap is true, assumes a big/little-endian swap is needed.
+// On success the caller owns the returned sample.
+TrainingSample* TrainingSample::DeSerializeCreate(bool swap, FILE* fp) {
+ auto* sample = new TrainingSample;
+ if (sample->DeSerialize(swap, fp)) return sample;
+ delete sample;
+ return nullptr;
+}
+
+// Reads from the given file. Returns false in case of error.
+// If swap is true, assumes a big/little-endian swap is needed.
+// NOTE: on failure this object may be left partially modified.
+bool TrainingSample::DeSerialize(bool swap, FILE* fp) {
+ if (fread(&class_id_, sizeof(class_id_), 1, fp) != 1) return false;
+ if (fread(&font_id_, sizeof(font_id_), 1, fp) != 1) return false;
+ if (fread(&page_num_, sizeof(page_num_), 1, fp) != 1) return false;
+ // bounding_box_ handles its own byte swapping.
+ if (!bounding_box_.DeSerialize(swap, fp)) return false;
+ if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
+ if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
+ return false;
+ if (fread(&outline_length_, sizeof(outline_length_), 1, fp) != 1)
+ return false;
+ if (swap) {
+ ReverseN(&class_id_, sizeof(class_id_));
+ ReverseN(&num_features_, sizeof(num_features_));
+ ReverseN(&num_micro_features_, sizeof(num_micro_features_));
+ ReverseN(&outline_length_, sizeof(outline_length_));
+ }
+ // Arbitrarily limit the number of elements to protect against bad data.
+ if (num_features_ > UINT16_MAX) return false;
+ if (num_micro_features_ > UINT16_MAX) return false;
+ delete [] features_;
+ features_ = new INT_FEATURE_STRUCT[num_features_];
+ if (fread(features_, sizeof(*features_), num_features_, fp)
+ != num_features_)
+ return false;
+ delete [] micro_features_;
+ micro_features_ = new MicroFeature[num_micro_features_];
+ if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_,
+ fp) != num_micro_features_)
+ return false;
+ if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
+ kNumCNParams) return false;
+ if (fread(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount)
+ return false;
+ return true;
+}
+
+// Saves the given features into a TrainingSample. The caller owns the
+// returned sample.
+// Note: class_id_, font_id_ and page_num_ are NOT set here; callers must
+// fill them in separately if needed.
+TrainingSample* TrainingSample::CopyFromFeatures(
+ const INT_FX_RESULT_STRUCT& fx_info,
+ const TBOX& bounding_box,
+ const INT_FEATURE_STRUCT* features,
+ int num_features) {
+ auto* sample = new TrainingSample;
+ sample->num_features_ = num_features;
+ sample->features_ = new INT_FEATURE_STRUCT[num_features];
+ sample->outline_length_ = fx_info.Length;
+ memcpy(sample->features_, features, num_features * sizeof(features[0]));
+ sample->geo_feature_[GeoBottom] = bounding_box.bottom();
+ sample->geo_feature_[GeoTop] = bounding_box.top();
+ sample->geo_feature_[GeoWidth] = bounding_box.width();
+
+ // Generate the cn_feature_ from the fx_info.
+ sample->cn_feature_[CharNormY] =
+ MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);
+ sample->cn_feature_[CharNormLength] =
+ MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;
+ sample->cn_feature_[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;
+ sample->cn_feature_[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;
+
+ // The cached indexed/mapped forms are invalid until recomputed.
+ sample->features_are_indexed_ = false;
+ sample->features_are_mapped_ = false;
+ return sample;
+}
+
+// Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.
+// NOTE(review): NewFeature allocates; the caller presumably takes
+// ownership of the returned feature — confirm against NewFeature's docs.
+FEATURE_STRUCT* TrainingSample::GetCNFeature() const {
+ FEATURE feature = NewFeature(&CharNormDesc);
+ for (int i = 0; i < kNumCNParams; ++i)
+ feature->Params[i] = cn_feature_[i];
+ return feature;
+}
+
+// Constructs and returns a copy randomized by the method given by
+// the randomizer index. If index is out of [0, kSampleRandomSize) then
+// an exact copy is returned. The caller owns the returned sample.
+TrainingSample* TrainingSample::RandomizedCopy(int index) const {
+ TrainingSample* sample = Copy();
+ if (index >= 0 && index < kSampleRandomSize) {
+ ++index; // Remove the first combination.
+ // index now selects a (yshift, scale) pair from the two factor tables.
+ const int yshift = kYShiftValues[index / kSampleScaleSize];
+ double scaling = kScaleValues[index % kSampleScaleSize];
+ for (uint32_t i = 0; i < num_features_; ++i) {
+ // Scale each coordinate about kRandomizingCenter, then round and clip
+ // to the valid uint8_t coordinate range.
+ double result = (features_[i].X - kRandomizingCenter) * scaling;
+ result += kRandomizingCenter;
+ sample->features_[i].X = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);
+ result = (features_[i].Y - kRandomizingCenter) * scaling;
+ result += kRandomizingCenter + yshift;
+ sample->features_[i].Y = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);
+ }
+ }
+ return sample;
+}
+
+// Constructs and returns an exact copy. The caller owns the returned sample.
+// Note: bounding_box_, page_num_, outline_length_ and the cached
+// indexed/mapped feature state are NOT copied by this function.
+TrainingSample* TrainingSample::Copy() const {
+ auto* sample = new TrainingSample;
+ sample->class_id_ = class_id_;
+ sample->font_id_ = font_id_;
+ sample->weight_ = weight_;
+ sample->sample_index_ = sample_index_;
+ sample->num_features_ = num_features_;
+ if (num_features_ > 0) {
+ sample->features_ = new INT_FEATURE_STRUCT[num_features_];
+ memcpy(sample->features_, features_, num_features_ * sizeof(features_[0]));
+ }
+ sample->num_micro_features_ = num_micro_features_;
+ if (num_micro_features_ > 0) {
+ sample->micro_features_ = new MicroFeature[num_micro_features_];
+ memcpy(sample->micro_features_, micro_features_,
+ num_micro_features_ * sizeof(micro_features_[0]));
+ }
+ memcpy(sample->cn_feature_, cn_feature_, sizeof(*cn_feature_) * kNumCNParams);
+ memcpy(sample->geo_feature_, geo_feature_, sizeof(*geo_feature_) * GeoCount);
+ return sample;
+}
+
+// Extracts the needed information from the CHAR_DESC_STRUCT.
+// int_feature_type, micro_type, cn_type and geo_type are indices into
+// char_desc->FeatureSets selecting each feature kind. Missing feature sets
+// are reported with tprintf and leave the corresponding members empty or
+// unchanged.
+void TrainingSample::ExtractCharDesc(int int_feature_type,
+ int micro_type,
+ int cn_type,
+ int geo_type,
+ CHAR_DESC_STRUCT* char_desc) {
+ // Extract the INT features.
+ delete[] features_;
+ FEATURE_SET_STRUCT* char_features = char_desc->FeatureSets[int_feature_type];
+ if (char_features == nullptr) {
+ tprintf("Error: no features to train on of type %s\n",
+ kIntFeatureType);
+ num_features_ = 0;
+ features_ = nullptr;
+ } else {
+ num_features_ = char_features->NumFeatures;
+ features_ = new INT_FEATURE_STRUCT[num_features_];
+ for (uint32_t f = 0; f < num_features_; ++f) {
+ features_[f].X =
+ static_cast<uint8_t>(char_features->Features[f]->Params[IntX]);
+ features_[f].Y =
+ static_cast<uint8_t>(char_features->Features[f]->Params[IntY]);
+ features_[f].Theta =
+ static_cast<uint8_t>(char_features->Features[f]->Params[IntDir]);
+ features_[f].CP_misses = 0;
+ }
+ }
+ // Extract the Micro features.
+ delete[] micro_features_;
+ char_features = char_desc->FeatureSets[micro_type];
+ if (char_features == nullptr) {
+ tprintf("Error: no features to train on of type %s\n",
+ kMicroFeatureType);
+ num_micro_features_ = 0;
+ micro_features_ = nullptr;
+ } else {
+ num_micro_features_ = char_features->NumFeatures;
+ micro_features_ = new MicroFeature[num_micro_features_];
+ for (uint32_t f = 0; f < num_micro_features_; ++f) {
+ for (int d = 0; d < MFCount; ++d) {
+ micro_features_[f][d] = char_features->Features[f]->Params[d];
+ }
+ }
+ }
+ // Extract the CN feature. (Exactly one CN feature is expected.)
+ char_features = char_desc->FeatureSets[cn_type];
+ if (char_features == nullptr) {
+ tprintf("Error: no CN feature to train on.\n");
+ } else {
+ ASSERT_HOST(char_features->NumFeatures == 1);
+ cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY];
+ cn_feature_[CharNormLength] =
+ char_features->Features[0]->Params[CharNormLength];
+ cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx];
+ cn_feature_[CharNormRy] = char_features->Features[0]->Params[CharNormRy];
+ }
+ // Extract the Geo feature. (Exactly one Geo feature is expected.)
+ char_features = char_desc->FeatureSets[geo_type];
+ if (char_features == nullptr) {
+ tprintf("Error: no Geo feature to train on.\n");
+ } else {
+ ASSERT_HOST(char_features->NumFeatures == 1);
+ geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom];
+ geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop];
+ geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth];
+ }
+ // Any previously computed indexed/mapped features are now stale.
+ features_are_indexed_ = false;
+ features_are_mapped_ = false;
+}
+
+// Sets the mapped_features_ from the features_ using the provided
+// feature_space to the indexed versions of the features.
+// On return, mapped_features_ holds the sorted indexed features;
+// features_are_indexed_ is set and features_are_mapped_ cleared, since the
+// stored values are indices, not mapped features.
+void TrainingSample::IndexFeatures(const IntFeatureSpace& feature_space) {
+  // (Removed a dead local GenericVector<int> that was never used.)
+  feature_space.IndexAndSortFeatures(features_, num_features_,
+                                     &mapped_features_);
+  features_are_indexed_ = true;
+  features_are_mapped_ = false;
+}
+
+// Returns a pix representing the sample. (Int features only.)
+// Each feature is drawn as a short 6-pixel stroke starting at (X, Y) in the
+// direction given by Theta (a binary angle where 256 == 2*PI).
+// The caller owns the returned Pix.
+Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const {
+  Pix* pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1);
+  for (uint32_t f = 0; f < num_features_; ++f) {
+    int start_x = features_[f].X;
+    // Feature Y is bottom-up; Pix coordinates are top-down, so flip.
+    int start_y = kIntFeatureExtent - features_[f].Y;
+    double dx = cos((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);
+    double dy = -sin((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);
+    for (int i = 0; i <= 5; ++i) {
+      int x = static_cast<int>(start_x + dx * i);
+      int y = static_cast<int>(start_y + dy * i);
+      // Clip against the pix extent by name; the previous magic number 256
+      // only happened to equal kIntFeatureExtent.
+      if (x >= 0 && x < kIntFeatureExtent && y >= 0 && y < kIntFeatureExtent)
+        pixSetPixel(pix, x, y, 1);
+    }
+  }
+  // Label the pix with the character it represents, if a charset is given.
+  if (unicharset != nullptr)
+    pixSetText(pix, unicharset->id_to_unichar(class_id_));
+  return pix;
+}
+
+#ifndef GRAPHICS_DISABLED
+
+// Displays the features in the given window with the given color.
+void TrainingSample::DisplayFeatures(ScrollView::Color color,
+                                     ScrollView* window) const {
+  // Render every int feature of this sample into the window.
+  for (uint32_t index = 0; index < num_features_; ++index) {
+    RenderIntFeature(window, &features_[index], color);
+  }
+}
+
+#endif // !GRAPHICS_DISABLED
+
+// Returns a pix of the original sample image. The pix is padded all round
+// by padding wherever possible.
+// The returned Pix must be pixDestroyed after use.
+// If the input page_pix is nullptr, nullptr is returned.
+Pix* TrainingSample::GetSamplePix(int padding, Pix* page_pix) const {
+  if (page_pix == nullptr)
+    return nullptr;
+  int page_width = pixGetWidth(page_pix);
+  int page_height = pixGetHeight(page_pix);
+  TBOX padded_box = bounding_box();
+  padded_box.pad(padding, padding);
+  // Clip the padded_box to the limits of the page.
+  TBOX page_box(0, 0, page_width, page_height);
+  padded_box &= page_box;
+  // Bug fix: clip out the padded sample box, not page_box -- passing
+  // page_box here returned the whole page instead of the sample region.
+  // TBOX is bottom-up while Box/Pix are top-down, hence the top flip.
+  Box* box = boxCreate(padded_box.left(), page_height - padded_box.top(),
+                       padded_box.width(), padded_box.height());
+  Pix* sample_pix = pixClipRectangle(page_pix, box, nullptr);
+  boxDestroy(&box);
+  return sample_pix;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/classify/trainingsample.h b/tesseract/src/classify/trainingsample.h
new file mode 100644
index 00000000..0ac2cc4f
--- /dev/null
+++ b/tesseract/src/classify/trainingsample.h
@@ -0,0 +1,252 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H_
+#define TESSERACT_TRAINING_TRAININGSAMPLE_H_
+
+#include "elst.h"
+#include "featdefs.h"
+#include "intfx.h"
+#include "intmatcher.h"
+#include "matrix.h"
+#include "mf.h"
+#include "picofeat.h"
+#include "shapetable.h"
+#include "unicharset.h"
+
+struct Pix;
+
+namespace tesseract {
+
+class IntFeatureMap;
+class IntFeatureSpace;
+class ShapeTable;
+
+// Number of elements of cn_feature_.
+static const int kNumCNParams = 4;
+// Number of ways to shift the features when randomizing.
+static const int kSampleYShiftSize = 5;
+// Number of ways to scale the features when randomizing.
+static const int kSampleScaleSize = 3;
+// Total number of different ways to manipulate the features when randomizing.
+// The first and last combinations are removed to avoid an excessive
+// top movement (first) and an identity transformation (last).
+// WARNING: To avoid patterned duplication of samples, be sure to keep
+// kSampleRandomSize prime!
+// Eg with current values (kSampleYShiftSize = 5 and kSampleScaleSize = 3)
+// kSampleRandomSize is 13, which is prime.
+static const int kSampleRandomSize = kSampleYShiftSize * kSampleScaleSize - 2;
+// ASSERT_IS_PRIME(kSampleRandomSize) !!
+
+// A single character sample for training/testing: the unichar it represents,
+// where it came from (font, page, bounding box), and up to four feature
+// representations (INT, micro, CN, Geo) plus cached indexed/mapped features.
+class TESS_API TrainingSample : public ELIST_LINK {
+ public:
+  // Default constructor leaves the sample empty: no features, unit weight,
+  // and all cache/bookkeeping state reset.
+  TrainingSample()
+    : class_id_(INVALID_UNICHAR_ID), font_id_(0), page_num_(0),
+      num_features_(0), num_micro_features_(0), outline_length_(0),
+      features_(nullptr), micro_features_(nullptr), weight_(1.0),
+      max_dist_(0.0), sample_index_(0),
+      features_are_indexed_(false), features_are_mapped_(false),
+      is_error_(false) {
+  }
+  // NOTE(review): defined out of line; expected to release the owned
+  // features_ and micro_features_ arrays -- confirm in the .cpp.
+  ~TrainingSample();
+
+  // Saves the given features into a TrainingSample. The features are copied,
+  // so may be deleted afterwards. Delete the return value after use.
+  static TrainingSample* CopyFromFeatures(const INT_FX_RESULT_STRUCT& fx_info,
+                                          const TBOX& bounding_box,
+                                          const INT_FEATURE_STRUCT* features,
+                                          int num_features);
+  // Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.
+  FEATURE_STRUCT* GetCNFeature() const;
+  // Constructs and returns a copy "randomized" by the method given by
+  // the randomizer index. If index is out of [0, kSampleRandomSize) then
+  // an exact copy is returned.
+  TrainingSample* RandomizedCopy(int index) const;
+  // Constructs and returns an exact copy.
+  TrainingSample* Copy() const;
+
+  // WARNING! Serialize/DeSerialize do not save/restore the "cache" data
+  // members, which is mostly the mapped features, and the weight.
+  // It is assumed these can all be reconstructed from what is saved.
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE* fp) const;
+  // Creates from the given file. Returns nullptr in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  static TrainingSample* DeSerializeCreate(bool swap, FILE* fp);
+  // Reads from the given file. Returns false in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  bool DeSerialize(bool swap, FILE* fp);
+
+  // Extracts the needed information from the CHAR_DESC_STRUCT.
+  // Each *_type argument indexes char_desc->FeatureSets for one of the
+  // four feature representations.
+  void ExtractCharDesc(int feature_type, int micro_type,
+                       int cn_type, int geo_type,
+                       CHAR_DESC_STRUCT* char_desc);
+
+  // Sets the mapped_features_ from the features_ using the provided
+  // feature_space to the indexed versions of the features.
+  void IndexFeatures(const IntFeatureSpace& feature_space);
+
+  // Returns a pix representing the sample. (Int features only.)
+  Pix* RenderToPix(const UNICHARSET* unicharset) const;
+  // Displays the features in the given window with the given color.
+  // Only available when graphics support is compiled in.
+  void DisplayFeatures(ScrollView::Color color, ScrollView* window) const;
+
+  // Returns a pix of the original sample image. The pix is padded all round
+  // by padding wherever possible.
+  // The returned Pix must be pixDestroyed after use.
+  // If the input page_pix is nullptr, nullptr is returned.
+  Pix* GetSamplePix(int padding, Pix* page_pix) const;
+
+  // Accessors.
+  UNICHAR_ID class_id() const {
+    return class_id_;
+  }
+  void set_class_id(int id) {
+    class_id_ = id;
+  }
+  int font_id() const {
+    return font_id_;
+  }
+  void set_font_id(int id) {
+    font_id_ = id;
+  }
+  int page_num() const {
+    return page_num_;
+  }
+  void set_page_num(int page) {
+    page_num_ = page;
+  }
+  const TBOX& bounding_box() const {
+    return bounding_box_;
+  }
+  void set_bounding_box(const TBOX& box) {
+    bounding_box_ = box;
+  }
+  uint32_t num_features() const {
+    return num_features_;
+  }
+  const INT_FEATURE_STRUCT* features() const {
+    return features_;
+  }
+  uint32_t num_micro_features() const {
+    return num_micro_features_;
+  }
+  const MicroFeature* micro_features() const {
+    return micro_features_;
+  }
+  int outline_length() const {
+    return outline_length_;
+  }
+  float cn_feature(int index) const {
+    return cn_feature_[index];
+  }
+  int geo_feature(int index) const {
+    return geo_feature_[index];
+  }
+  double weight() const {
+    return weight_;
+  }
+  void set_weight(double value) {
+    weight_ = value;
+  }
+  double max_dist() const {
+    return max_dist_;
+  }
+  void set_max_dist(double value) {
+    max_dist_ = value;
+  }
+  int sample_index() const {
+    return sample_index_;
+  }
+  void set_sample_index(int value) {
+    sample_index_ = value;
+  }
+  bool features_are_mapped() const {
+    return features_are_mapped_;
+  }
+  // Asserts that MapFeatures/IndexFeatures has been run, respectively;
+  // both views share the mapped_features_ storage.
+  const GenericVector<int>& mapped_features() const {
+    ASSERT_HOST(features_are_mapped_);
+    return mapped_features_;
+  }
+  const GenericVector<int>& indexed_features() const {
+    ASSERT_HOST(features_are_indexed_);
+    return mapped_features_;
+  }
+  bool is_error() const {
+    return is_error_;
+  }
+  void set_is_error(bool value) {
+    is_error_ = value;
+  }
+
+ private:
+  // Unichar id that this sample represents. There obviously must be a
+  // reference UNICHARSET somewhere. Usually in TrainingSampleSet.
+  UNICHAR_ID class_id_;
+  // Font id in which this sample was printed. Refers to a fontinfo_table_ in
+  // MasterTrainer.
+  int font_id_;
+  // Number of page that the sample came from.
+  int page_num_;
+  // Bounding box of sample in original image.
+  TBOX bounding_box_;
+  // Number of INT_FEATURE_STRUCT in features_ array.
+  uint32_t num_features_;
+  // Number of MicroFeature in micro_features_ array.
+  uint32_t num_micro_features_;
+  // Total length of outline in the baseline normalized coordinate space.
+  // See comment in WERD_RES class definition for a discussion of coordinate
+  // spaces.
+  int outline_length_;
+  // Array of features. Owned; heap-allocated with new[].
+  INT_FEATURE_STRUCT* features_;
+  // Array of micro features. Owned; heap-allocated with new[].
+  MicroFeature* micro_features_;
+  // The one and only CN feature. Indexed by NORM_PARAM_NAME enum.
+  float cn_feature_[kNumCNParams];
+  // The one and only geometric feature. (Aims at replacing cn_feature_).
+  // Indexed by GeoParams enum in picofeat.h
+  int geo_feature_[GeoCount];
+
+  // Non-serialized cache data.
+  // Weight used for boosting training.
+  double weight_;
+  // Maximum distance to other samples of same class/font used in computing
+  // the canonical sample.
+  double max_dist_;
+  // Global index of this sample.
+  int sample_index_;
+public:
+  // both are used in training tools
+  // hide after refactoring
+
+  // Indexed/mapped features, as indicated by the bools below.
+  GenericVector<int> mapped_features_;
+  bool features_are_indexed_;
+  bool features_are_mapped_;
+private:
+  // True if the last classification was an error by the current definition.
+  bool is_error_;
+
+  // Randomizing factors.
+  static const int kYShiftValues[kSampleYShiftSize];
+  static const double kScaleValues[kSampleScaleSize];
+};
+
+ELISTIZEH(TrainingSample)
+
+} // namespace tesseract
+
+#endif // TESSERACT_TRAINING_TRAININGSAMPLE_H_