Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added the option for character accumulated glyph confidences. #1851

Merged
merged 1 commit into from
Aug 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1606,12 +1606,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>";
// If glyph confidence is required it is added here
if (tesseract_->glyph_confidences && confidencemap != nullptr) {
if (tesseract_->glyph_confidences == 1 && confidencemap != nullptr) {
for (size_t i = 0; i < confidencemap->size(); i++) {
hocr_str += "\n <span class='ocrx_cinfo'";
AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
hocr_str += ">";
//*
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
for (std::pair<const char*, float> conf : timestep) {
hocr_str += "<span class='ocr_glyph'";
Expand All @@ -1623,10 +1622,32 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
hocr_str += "</span>";
gcnt++;
}
//*/
hocr_str += "</span>";
tcnt++;
}
} else if (tesseract_->glyph_confidences == 2 && confidencemap != nullptr) {
for (size_t i = 0; i < confidencemap->size(); i++) {
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
if (timestep.size() > 0) {
hocr_str += "\n <span class='ocrx_cinfo'";
AddIdTohOCR(&hocr_str, "alternative_glyphs", page_id, wcnt, tcnt);
hocr_str += " chosen='";
hocr_str += timestep[0].first;
hocr_str += "'>";
for (size_t j = 1; j < timestep.size(); j++) {
hocr_str += "<span class='ocr_glyph'";
AddIdTohOCR(&hocr_str, "glyph", page_id, wcnt, gcnt);
hocr_str.add_str_int(" title='x_confs ", int(timestep[j].second * 100));
hocr_str += "'";
hocr_str += ">";
hocr_str += timestep[j].first;
hocr_str += "</span>";
gcnt++;
}
hocr_str += "</span>";
tcnt++;
}
}
}
hocr_str += "</span>";
tcnt = 1;
Expand Down
2 changes: 1 addition & 1 deletion src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ Tesseract::Tesseract()
STRING_MEMBER(page_separator, "\f",
"Page separator (default is form feed control character)",
this->params()),
BOOL_MEMBER(glyph_confidences, false,
INT_MEMBER(glyph_confidences, 0,
"Allows to include glyph confidences in the hOCR output",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noah, could you please add help information here on the valid values for glyph_confidences?

this->params()),

Expand Down
3 changes: 2 additions & 1 deletion src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1114,7 +1114,8 @@ class Tesseract : public Wordrec {
"Preserve multiple interword spaces");
STRING_VAR_H(page_separator, "\f",
"Page separator (default is form feed control character)");
BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output");
INT_VAR_H(glyph_confidences, 0,
"Allows to include glyph confidences in the hOCR output");

//// ambigsrecog.cpp /////////////////////////////////////////////////////////
FILE *init_recog_training(const STRING &fname);
Expand Down
3 changes: 2 additions & 1 deletion src/lstm/lstmrecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
bool debug, double worst_dict_cert,
const TBOX& line_box,
PointerVector<WERD_RES>* words, bool glyph_confidences) {
PointerVector<WERD_RES>* words,
int glyph_confidences) {
NetworkIO outputs;
float scale_factor;
NetworkIO inputs;
Expand Down
2 changes: 1 addition & 1 deletion src/lstm/lstmrecognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class LSTMRecognizer {
void RecognizeLine(const ImageData& image_data, bool invert, bool debug,
double worst_dict_cert, const TBOX& line_box,
PointerVector<WERD_RES>* words,
bool glyph_confidences = false);
int glyph_confidences = 0);

// Helper computes min and mean best results in the output.
void OutputStats(const NetworkIO& outputs,
Expand Down
77 changes: 71 additions & 6 deletions src/lstm/recodebeam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "networkio.h"
#include "pageres.h"
#include "unicharcompress.h"
#include <deque>
#include <map>
#include <set>
#include <vector>

Expand Down Expand Up @@ -79,7 +81,7 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder,
// Decodes the set of network outputs, storing the lattice internally.
void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
double cert_offset, double worst_dict_cert,
const UNICHARSET* charset, bool glyph_confidence) {
const UNICHARSET* charset, int glyph_confidence) {
beam_size_ = 0;
int width = output.Width();
if (glyph_confidence)
Expand Down Expand Up @@ -177,14 +179,15 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
float scale_factor, bool debug,
const UNICHARSET* unicharset,
PointerVector<WERD_RES>* words,
bool glyph_confidence) {
int glyph_confidence) {
words->truncate(0);
GenericVector<int> unichar_ids;
GenericVector<float> certs;
GenericVector<float> ratings;
GenericVector<int> xcoords;
GenericVector<const RecodeNode*> best_nodes;
GenericVector<const RecodeNode*> second_nodes;
std::deque<std::pair<int,int>> best_glyphs;
ExtractBestPaths(&best_nodes, &second_nodes);
if (debug) {
DebugPath(unicharset, best_nodes);
Expand All @@ -194,15 +197,29 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
DebugUnicharPath(unicharset, second_nodes, unichar_ids, certs, ratings,
xcoords);
}
ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords);
int current_char;
int timestepEnd = 0;
// If glyph confidences are requested at granularity level 2, also store the
// x coordinate of every chosen character so that the alternative glyphs can
// be matched to it.
if (glyph_confidence == 2) {
ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
&xcoords, &best_glyphs);
if (best_glyphs.size() > 0) {
current_char = best_glyphs.front().first;
timestepEnd = best_glyphs.front().second;
best_glyphs.pop_front();
}
} else {
ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
&xcoords);
}
int num_ids = unichar_ids.size();
if (debug) {
DebugUnicharPath(unicharset, best_nodes, unichar_ids, certs, ratings,
xcoords);
}
// Convert labels to unichar-ids.
int word_end = 0;
int timestepEnd = 0;
float prev_space_cert = 0.0f;
for (int word_start = 0; word_start < num_ids; word_start = word_end) {
for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
Expand All @@ -226,11 +243,55 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
WERD_RES* word_res = InitializeWord(
leading_space, line_box, word_start, word_end,
std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor);
if (glyph_confidence) {
if (glyph_confidence == 1) {
for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
word_res->timesteps.push_back(timesteps[i]);
}
timestepEnd = xcoords[word_end];
} else if (glyph_confidence == 2) {
float sum = 0;
std::vector<std::pair<const char*, float>> glyph_pairs;
for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
for (std::pair<const char*, float> glyph : timesteps[i]) {
if (std::strcmp(glyph.first, "") != 0) {
sum += glyph.second;
glyph_pairs.push_back(glyph);
}
}
if (best_glyphs.size() > 0 && i == best_glyphs.front().second-1
|| i == xcoords[word_end]-1) {
std::map<const char*, float> summed_propabilities;
for(auto it = glyph_pairs.begin(); it != glyph_pairs.end(); ++it) {
summed_propabilities[it->first] += it->second;
}
std::vector<std::pair<const char*, float>> accumulated_timestep;
accumulated_timestep.push_back(std::pair<const char*,float>
(unicharset->id_to_unichar_ext
(current_char), 2.0));
int pos;
for (auto it = summed_propabilities.begin();
it != summed_propabilities.end(); ++it) {
if(sum == 0) break;
it->second/=sum;
pos = 0;
while (accumulated_timestep.size() > pos
&& accumulated_timestep[pos].second > it->second) {
pos++;
}
accumulated_timestep.insert(accumulated_timestep.begin() + pos,
std::pair<const char*,float>(it->first,
it->second));
}
if (best_glyphs.size() > 0) {
current_char = best_glyphs.front().first;
best_glyphs.pop_front();
}
glyph_pairs.clear();
word_res->timesteps.push_back(accumulated_timestep);
sum = 0;
}
}
timestepEnd = xcoords[word_end];
}
for (int i = word_start; i < word_end; ++i) {
BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
Expand Down Expand Up @@ -304,7 +365,8 @@ void RecodeBeamSearch::DebugBeamPos(const UNICHARSET& unicharset,
void RecodeBeamSearch::ExtractPathAsUnicharIds(
const GenericVector<const RecodeNode*>& best_nodes,
GenericVector<int>* unichar_ids, GenericVector<float>* certs,
GenericVector<float>* ratings, GenericVector<int>* xcoords) {
GenericVector<float>* ratings, GenericVector<int>* xcoords,
std::deque<std::pair<int,int>>* best_glyphs) {
unichar_ids->truncate(0);
certs->truncate(0);
ratings->truncate(0);
Expand Down Expand Up @@ -333,6 +395,9 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
}
unichar_ids->push_back(unichar_id);
xcoords->push_back(t);
if(best_glyphs != nullptr) {
best_glyphs->push_back(std::pair<int,int>(unichar_id,t));
}
do {
double cert = best_nodes[t++]->certainty;
// Special-case NO-PERM space to forget the certainty of the previous
Expand Down
14 changes: 10 additions & 4 deletions src/lstm/recodebeam.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "networkio.h"
#include "ratngs.h"
#include "unicharcompress.h"
#include <deque>
#include <set>
#include <vector>

Expand Down Expand Up @@ -185,7 +186,7 @@ class RecodeBeamSearch {
// If charset is not null, it enables detailed debugging of the beam search.
void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
double worst_dict_cert, const UNICHARSET* charset,
bool glyph_confidence = false);
int glyph_confidence = 0);
void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
double cert_offset, double worst_dict_cert,
const UNICHARSET* charset);
Expand All @@ -204,12 +205,16 @@ class RecodeBeamSearch {
// Returns the best path as a set of WERD_RES.
void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
bool debug, const UNICHARSET* unicharset,
PointerVector<WERD_RES>* words, bool glyph_confidence);
PointerVector<WERD_RES>* words,
int glyph_confidence = 0);

// Generates debug output of the content of the beams after a Decode.
void DebugBeams(const UNICHARSET& unicharset) const;


// Stores the alternative characters of every timestep together with their
// probability.
std::vector< std::vector<std::pair<const char*, float>>> timesteps;

// Clipping value for certainty inside Tesseract. Reflects the minimum value
// of certainty that will be returned by ExtractBestPathAsUnicharIds.
// Supposedly on a uniform scale that can be compared across languages and
Expand Down Expand Up @@ -276,7 +281,8 @@ class RecodeBeamSearch {
static void ExtractPathAsUnicharIds(
const GenericVector<const RecodeNode*>& best_nodes,
GenericVector<int>* unichar_ids, GenericVector<float>* certs,
GenericVector<float>* ratings, GenericVector<int>* xcoords);
GenericVector<float>* ratings, GenericVector<int>* xcoords,
std::deque<std::pair<int,int>>* best_glyphs = nullptr);

// Sets up a word with the ratings matrix and fake blobs with boxes in the
// right places.
Expand Down