lstmeval: Improve output by ensuring 'Truth:' text is encoded the same as OCR output

This ensures that transformations such as Unicode normalisation are applied to the truth text as well as to the OCR output, so that the two can be compared properly. Previously, even a perfect OCR result could show different lines for Truth and OCR if the OCR output included characters that had been normalised.
GerHobbelt · Apr 27, 2021 · 2acaac4 · 2acaac4
1 parent 7aba1e4
commit 2acaac4
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/src/training/unicharset/lstmtester.cpp b/src/training/unicharset/lstmtester.cpp
@@ -91,6 +91,11 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors
   int error_count = 0;
   while (error_count < total_pages_) {
     const ImageData *trainingdata = test_data_.GetPageBySerial(eval_iteration);
+    std::vector<int> truth_labels;
+    if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
+      continue
+    }
+    std::string truth_text = trainer.DecodeLabels(truth_labels);
     trainer.SetIteration(++eval_iteration);
     NetworkIO fwd_outputs, targets;
     Trainability result = trainer.PrepareForBackward(trainingdata, &fwd_outputs, &targets);
@@ -99,7 +104,7 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors
       word_error += trainer.NewSingleError(tesseract::ET_WORD_RECERR);
       ++error_count;
       if (verbosity > 1 || (verbosity > 0 && result != PERFECT)) {
-        tprintf("Truth:%s\n", trainingdata->transcription().c_str());
+        tprintf("Truth:%s\n", truth_text.c_str());
         std::vector<int> ocr_labels;
         std::vector<int> xcoords;
         trainer.LabelsFromOutputs(fwd_outputs, &ocr_labels, &xcoords);