/**********************************************************************
 * File:        adaptions.cpp  (Formerly adaptions.c)
 * Description: Functions used to adapt to blobs already confidently
 *              identified
 * Author:      Chris Newton
 * Created:     Thu Oct  7 10:17:28 BST 1993
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"

#ifdef __UNIX__
#include <assert.h>
#endif
#include <ctype.h>
#include <string.h>
#include "tessbox.h"
#include "tessvars.h"
#include "memry.h"
#include "mainblk.h"
#include "charcut.h"
#include "imgs.h"
#include "scaleimg.h"
#include "reject.h"
#include "control.h"
#include "adaptions.h"
#include "stopper.h"
#include "charsample.h"
#include "matmatch.h"
#include "secname.h"
#include "tesseractclass.h"

// Word number currently being shown in demo mode; 0 when no demo match is
// in progress (set around the demo match_score() calls below).
inT32 demo_word = 0;

#define WINDOWNAMESIZE 13        /*max size of name */

#define EXTERN

EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");

EXTERN double_VAR (tessedit_cluster_t1, 0.20,
                   "t1 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t2, 0.40,
                   "t2 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t3, 0.12,
                   "Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
                   "Largest fraction of characters in cluster for it to be used for adaption");
EXTERN INT_VAR (tessedit_cluster_min_size, 3,
                "Smallest number of samples in a cluster for it to be used for adaption");
EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
                 "Generate and print debug information for adaption by clustering");
EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
                 "Use best sample from cluster when adapting");
EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
                 "Set reject map to enable cluster input to be measured");

EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
                 "Don't try to adapt to characters on this list");
EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
                   "Characters to be avoided when adapting");
EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
                 "Use prototypes when adapting");
EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
                 "Use prototypes as clusters are built");
EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
                 "Adapt to characters using reject map");
EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
                 "Adapt to all characters using, matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
                 "Only match samples against clusters for the same character");
EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");

EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
                 "Display cut images and matrix match for demo purposes");
EXTERN INT_VAR (tessedit_demo_word1, 62,
                "Word number of first word to display");
EXTERN INT_VAR (tessedit_demo_word2, 64,
                "Word number of second word to display");
EXTERN STRING_VAR (tessedit_demo_file, "academe",
                   "Name of document containing demo words");
EXTERN BOOL_VAR(tessedit_adapt_to_char_fragments, TRUE,
                "Adapt to words that contain "
                " a character composed form fragments");

namespace tesseract {
/**
 * Decide whether a word may be used for adaption.
 *
 * @param word  the word result to examine
 * @param mode  bit mask selecting the checks to apply; bit positions are
 *              given by the local MODES enum. A mode of 0 disables adaption.
 * @return TRUE when at least one of the enabled ADAPTABLE_WERD /
 *         ACCEPTABLE_WERD bits is satisfied and none of the other enabled
 *         checks vetoes the word; FALSE otherwise.
 */
BOOL8 Tesseract::word_adaptable(  //should we adapt?
                                WERD_RES *word,
                                uinT16 mode) {
  if (tessedit_adaption_debug) {
    // best_choice may be NULL here: guard every dereference, not just the
    // string argument (the rating/certainty calls used to crash).
    const bool has_choice = (word->best_choice != NULL);
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            has_choice ? word->best_choice->unichar_string().string() : "",
            has_choice ? word->best_choice->rating() : 0.0f,
            has_choice ? word->best_choice->certainty() : 0.0f);
  }

  BOOL8 status = FALSE;
  BITS16 flags(mode);

  enum MODES
  {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
    return FALSE;
  }

  if (flags.bit (ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (flags.bit (ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  if (!status) {                  // If not set then
    return FALSE;                 // ignore other checks
  }

  if (flags.bit (CHECK_DAWGS) &&
      (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
      (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
      (word->best_choice->permuter () != USER_DAWG_PERM) &&
      (word->best_choice->permuter () != NUMBER_PERM)) {
    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
    return FALSE;
  }

  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
    return FALSE;
  }

  if (flags.bit (CHECK_SPACES) &&
      (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
    return FALSE;
  }

  //  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
  if (flags.bit (CHECK_AMBIG_WERD) &&
      !getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
    return FALSE;
  }

  // Do not adapt to words that are composed from fragments if
  // tessedit_adapt_to_char_fragments is false.
  if (!tessedit_adapt_to_char_fragments) {
    const char *fragment_lengths = word->best_choice->fragment_lengths();
    if (fragment_lengths != NULL && *fragment_lengths != '\0') {
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if (fragment_lengths[i] > 1) {
          if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
          return FALSE;           // found a character composed from fragments
        }
      }
    }
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}


/**
 * Collect samples of 'm' (and, when tessedit_process_rns is set, of "rn"
 * pairs merged into one blob) from an adaptable word with no rejects and
 * feed each one to cluster_sample().
 *
 * Does nothing when either em-rejection test mode is enabled or when the
 * word is taller than a third of the page resolution.
 */
void Tesseract::collect_ems_for_adaption(WERD_RES *word,
                                         CHAR_SAMPLES_LIST *char_clusters,
                                         CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  TBOX pix_box;                  // box of imlines
  // extent
  WERD copy_outword;             // copy to denorm
  PBLOB_IT copy_blob_it;
  OUTLINE_IT copy_outline_it;
  inT32 resolution = page_image.get_res ();

  if (tessedit_reject_ems || tessedit_reject_suspect_ems)
    return;                      // Do nothing

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (tessedit_demo_adaption)
                                 // Make sure not set
    tessedit_display_mm.set_value (FALSE);

  if (word_adaptable (word, tessedit_em_adaption_mode)
      && word->reject_map.reject_count () == 0
      && (strchr (word->best_choice->unichar_string().string (), 'm') != NULL
          || (tessedit_process_rns
              && strstr (word->best_choice->unichar_string().string (),
                         "rn") != NULL))) {
    if (tessedit_process_rns
        && strstr (word->best_choice->unichar_string().string (),
                   "rn") != NULL) {
      // Merge the outlines of each "rn" pair into a single blob in the copy
      // so that the pair can be clipped and clustered as one sample.
      copy_outword = *(word->outword);
      copy_blob_it.set_to_list (copy_outword.blob_list ());
      i = 0;
      while (word->best_choice->unichar_string()[i] != '\0') {
        if (word->best_choice->unichar_string()[i] == 'r'
            && word->best_choice->unichar_string()[i + 1] == 'n') {
          copy_outline_it.set_to_list (copy_blob_it.data ()->out_list ());
          copy_outline_it.add_list_after (copy_blob_it.data_relative (1)->
                                          out_list ());
          copy_blob_it.forward ();
          delete (copy_blob_it.extract ());
          i++;
        }
        copy_blob_it.forward ();
        i++;
      }
    }
    else
      copy_outword = *(word->outword);

    copy_outword.baseline_denormalise (&word->denorm);
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    for (i = 0;
         word->best_choice->unichar_string()[i] != '\0';
         i++, pixrow_it.forward (), blob_it.forward ()) {
      if (word->best_choice->unichar_string()[i] == 'm'
          || (word->best_choice->unichar_string()[i] == 'r'
              && word->best_choice->unichar_string()[i + 1] == 'n')) {
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf ("Sample %c for adaption found in %s, index %d\n",
                   word->best_choice->unichar_string()[i],
                   word->best_choice->unichar_string().string (), i);
        #endif
        if (tessedit_matrix_match) {
          sample = clip_sample (pixrow_it.data (),
                                imlines,
                                pix_box,
                                copy_outword.flag (W_INVERSE),
                                word->best_choice->unichar_string()[i]);

          if (sample == NULL) {  //Clip failed
            #ifndef SECURE_NAMES
            tprintf ("Unable to clip sample from %s, index %d\n",
                     word->best_choice->unichar_string().string (), i);
            #endif
            if (word->best_choice->unichar_string()[i] == 'r')
              i++;               // Skip the 'n' of the pair
            continue;
          }
        }
        else
          sample = new CHAR_SAMPLE (blob_it.data (),
                                    &word->denorm,
                                    word->best_choice->unichar_string()[i]);

        cluster_sample(sample, char_clusters, chars_waiting);

        if (word->best_choice->unichar_string()[i] == 'r')
          i++;                   // Skip next character
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
}


/**
 * Collect a clipped image sample for every eligible character of a word and
 * feed each one to cluster_sample().
 *
 * The word is used when it is adaptable (per tessedit_cluster_adaption_mode)
 * with no rejects, or whenever tessedit_mm_use_rejmap is set. When
 * tessedit_test_cluster_input is set (and the reject map is not being used)
 * no samples are collected: accepted words are left alone and all other
 * words are marked as tess failures.
 */
void Tesseract::collect_characters_for_adaption(
    WERD_RES *word,
    CHAR_SAMPLES_LIST *char_clusters,
    CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  TBOX pix_box;                  // box of imlines
  // extent
  WERD copy_outword;             // copy to denorm
  inT32 resolution = page_image.get_res ();

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (tessedit_demo_adaption)
                                 // Make sure not set
    tessedit_display_mm.set_value (FALSE);

  if ((word_adaptable (word, tessedit_cluster_adaption_mode)
       && word->reject_map.reject_count () == 0)
      || tessedit_mm_use_rejmap) {
    if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
      return;                    // Reject map set to acceptable
    /* Collect information about good matches */
    copy_outword = *(word->outword);
    copy_outword.baseline_denormalise (&word->denorm);
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    for (i = 0;
         word->best_choice->unichar_string()[i] != '\0';
         i++, pixrow_it.forward (), blob_it.forward ()) {
      if (!(tessedit_mm_use_non_adaption_set
            && STRING(tessedit_non_adaption_set).contains(
                word->best_choice->unichar_string()[i]))
          || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf ("Sample %c for adaption found in %s, index %d\n",
                   word->best_choice->unichar_string()[i],
                   word->best_choice->unichar_string().string (), i);
        #endif
        sample = clip_sample (pixrow_it.data (),
                              imlines,
                              pix_box,
                              copy_outword.flag (W_INVERSE),
                              word->best_choice->unichar_string()[i]);

        if (sample == NULL) {    //Clip failed
          #ifndef SECURE_NAMES
          tprintf ("Unable to clip sample from %s, index %d\n",
                   word->best_choice->unichar_string().string (), i);
          #endif
          continue;
        }
        cluster_sample(sample, char_clusters, chars_waiting);
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
  else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
    // Set word to all rejects
    word->reject_map.rej_word_tess_failure ();
}


/**
 * Place one sample into the clustering state.
 *
 * The sample is matched against every existing cluster and the lowest score
 * wins. Depending on that score the sample is:
 *  - added to the best cluster (score < t1, and score > t3 or prototypes
 *    are in use),
 *  - dropped and freed (score < t1 but too close a match to be worth
 *    keeping),
 *  - used to start a new cluster (score > t2), or
 *  - put on the wait list (anything in between).
 * If there are no clusters yet, the sample starts the first one.
 */
void Tesseract::cluster_sample(CHAR_SAMPLE *sample,
                               CHAR_SAMPLES_LIST *char_clusters,
                               CHAR_SAMPLE_LIST *chars_waiting) {
  CHAR_SAMPLES *best_cluster = NULL;
  CHAR_SAMPLES_IT c_it = char_clusters;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  float score;
  float best_score = MAX_INT32;

  if (c_it.empty ())
    c_it.add_to_end (new CHAR_SAMPLES (sample));
  else {
    for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
      score = c_it.data ()->match_score (sample, this);
      if (score < best_score) {
        best_score = score;
        best_cluster = c_it.data ();
      }
    }

    if (tessedit_cluster_debug)
      tprintf ("Sample's best score %f\n", best_score);

    if (best_score < tessedit_cluster_t1) {
      if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
        best_cluster->add_sample (sample, this);
        check_wait_list(chars_waiting, sample, best_cluster);
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf ("Sample added to an existing cluster\n");
        #endif
      }
      else {
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf
            ("Sample dropped, good match to an existing cluster\n");
        #endif
        // Nothing took ownership of the sample on this path, so release it
        // here instead of leaking it.
        delete sample;
      }
    }
    else if (best_score > tessedit_cluster_t2) {
      c_it.add_to_end (new CHAR_SAMPLES (sample));
      #ifndef SECURE_NAMES
      if (tessedit_cluster_debug)
        tprintf ("New cluster created for this sample\n");
      #endif
    }
    else {
      cw_it.add_to_end (sample);
      if (tessedit_cluster_debug)
        tprintf ("Sample added to the wait list\n");
    }
  }
}

/**
 * After a sample has joined best_cluster, re-test the samples on the wait
 * list against that cluster and move the ones that now match into it.
 *
 * Matching wait samples are first moved to a local add_list; each pass of
 * the outer loop takes one sample from add_list, adds it to best_cluster
 * and rescans the wait list, until add_list is empty.
 */
void Tesseract::check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
                                CHAR_SAMPLE *sample,
                                CHAR_SAMPLES *best_cluster) {
  CHAR_SAMPLE *wait_sample;
  CHAR_SAMPLE *test_sample = sample;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  CHAR_SAMPLE_LIST add_list;     //Samples added to best cluster
  CHAR_SAMPLE_IT add_it = &add_list;
  float score;

  add_list.clear ();

  if (!cw_it.empty ()) {
    do {
      if (!add_list.empty ()) {
        add_it.forward ();
        test_sample = add_it.extract ();
        best_cluster->add_sample (test_sample, this);
      }

      for (cw_it.mark_cycle_pt ();
           !cw_it.cycled_list (); cw_it.forward ()) {
        wait_sample = cw_it.data ();
        if (tessedit_mm_use_prototypes)
          score = best_cluster->match_score (wait_sample, this);
        else
          // NOTE(review): matches against the original `sample`, never
          // against `test_sample` -- confirm whether that is intended.
          score = sample->match_sample (wait_sample, FALSE, this);
        if (score < tessedit_cluster_t1) {
          if (score > tessedit_cluster_t3
              || tessedit_mm_use_prototypes) {
            add_it.add_after_stay_put (cw_it.extract ());
            #ifndef SECURE_NAMES
            if (tessedit_cluster_debug)
              tprintf
                ("Wait sample added to an existing cluster\n");
            #endif
          }
          else {
            // The sample stays on the wait list in this case.
            #ifndef SECURE_NAMES
            if (tessedit_cluster_debug)
              tprintf
                ("Wait sample dropped, good match to an existing cluster\n");
            #endif
          }
        }
      }
    }
    while (!add_list.empty ());
  }
}


/**
 * Finish clustering: turn every sample still on the wait list into (or into
 * part of) a cluster, then assign a character to each cluster and build its
 * best sample or prototype according to the tessedit_use_best_sample /
 * tessedit_mm_adapt_using_prototypes settings.
 */
void Tesseract::complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
                                    CHAR_SAMPLE_LIST *chars_waiting) {
  CHAR_SAMPLES *best_cluster;
  CHAR_SAMPLES_IT c_it = char_clusters;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  CHAR_SAMPLE *sample;
  inT32 total_sample_count = 0;

  // Each remaining wait sample seeds a new cluster, which may in turn pull
  // further samples off the wait list.
  while (!cw_it.empty ()) {
    cw_it.move_to_first ();
    sample = cw_it.extract ();
    best_cluster = new CHAR_SAMPLES (sample);
    c_it.add_to_end (best_cluster);
    check_wait_list(chars_waiting, sample, best_cluster);
  }

  for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
    c_it.data ()->assign_to_char ();
    if (tessedit_use_best_sample)
      c_it.data ()->find_best_sample ();
    else if (tessedit_mm_adapt_using_prototypes)
      c_it.data ()->build_prototype ();

    if (tessedit_cluster_debug)
      total_sample_count += c_it.data ()->n_samples ();
  }
  #ifndef SECURE_NAMES
  if (tessedit_cluster_debug)
    tprintf ("Clustering completed, %d samples in all\n",
             total_sample_count);
  #endif

  #ifndef GRAPHICS_DISABLED
  if (tessedit_demo_adaption)
    display_cluster_prototypes(char_clusters);
  #endif
}

/**
 * Re-check the 'm' (and optionally "rn") characters of a word that was NOT
 * used for adaption against the collected clusters, and update the word's
 * reject map with matrix-match accept/reject decisions.
 *
 * In the tessedit_reject_ems / tessedit_reject_suspect_ems test modes the
 * simplistic rejection routines are used instead. Words taller than a third
 * of the page resolution are ignored.
 */
void Tesseract::adapt_to_good_ems(WERD_RES *word,
                                  CHAR_SAMPLES_LIST *char_clusters,
                                  CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 i;
  CHAR_SAMPLE *sample;
  CHAR_SAMPLES_IT c_it = char_clusters;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  float score;
  float best_score;
  char best_char;
  CHAR_SAMPLES *best_cluster;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  TBOX pix_box;                  // box of imlines
  // extent
  WERD copy_outword;             // copy to denorm
  TBOX b_box;
  PBLOB_IT copy_blob_it;
  OUTLINE_IT copy_outline_it;
  PIXROW *pixrow = NULL;

  static inT32 word_number = 0;  // counts calls that pass the height test

#ifndef GRAPHICS_DISABLED
  ScrollView* demo_win = NULL;
#endif

  inT32 resolution = page_image.get_res ();

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  word_number++;

  // NOTE(review): when tessedit_process_rns is FALSE this condition is never
  // true, so words without an 'm' are not skipped here (they are filtered
  // out by the later test instead) -- confirm whether
  // `!tessedit_process_rns || ...` was intended.
  if (strchr (word->best_choice->unichar_string().string (), 'm') == NULL
      && (tessedit_process_rns
          && strstr (word->best_choice->unichar_string().string (),
                     "rn") == NULL))
    return;

  if (tessedit_reject_ems)
    reject_all_ems(word);
  else if (tessedit_reject_suspect_ems)
    reject_suspect_ems(word);
  else {
    if (char_clusters->length () == 0) {
      #ifndef SECURE_NAMES
      if (tessedit_cluster_debug)
        tprintf ("No clusters to use for em adaption\n");
      #endif
      return;
    }

    if (!cw_it.empty ()) {
      complete_clustering(char_clusters, chars_waiting);
      print_em_stats(char_clusters, chars_waiting);
    }

    if ((!word_adaptable (word, tessedit_em_adaption_mode)
         || word->reject_map.reject_count () != 0)
        && (strchr (word->best_choice->unichar_string().string (),
                    'm') != NULL
            || (tessedit_process_rns
                && strstr (word->best_choice->unichar_string().string (),
                           "rn") != NULL))) {
      if (tessedit_process_rns
          && strstr (word->best_choice->unichar_string().string (),
                     "rn") != NULL) {
        // Merge the outlines of each "rn" pair into one blob in the copy.
        copy_outword = *(word->outword);
        copy_blob_it.set_to_list (copy_outword.blob_list ());
        i = 0;
        while (word->best_choice->unichar_string()[i] != '\0') {
          if (word->best_choice->unichar_string()[i] == 'r'
              && word->best_choice->unichar_string()[i + 1] == 'n') {
            copy_outline_it.set_to_list (copy_blob_it.data ()->
                                         out_list ());
            copy_outline_it.add_list_after (copy_blob_it.
                                            data_relative (1)->
                                            out_list ());
            copy_blob_it.forward ();
            delete (copy_blob_it.extract ());
            i++;
          }
          copy_blob_it.forward ();
          i++;
        }
      }
      else
        copy_outword = *(word->outword);

      copy_outword.baseline_denormalise (&word->denorm);
      copy_blob_it.set_to_list (copy_outword.blob_list ());
      char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
      pixrow_it.set_to_list (pixrow_list);
      pixrow_it.move_to_first ();

      // For debugging only
      b_box = copy_outword.bounding_box ();
      pixrow = pixrow_it.data ();

      blob_it.move_to_first ();
      copy_blob_it.move_to_first ();
      for (i = 0;
           word->best_choice->unichar_string()[i] != '\0';
           i++, pixrow_it.forward (), blob_it.forward (),
           copy_blob_it.forward ()) {
        if ((word->best_choice->unichar_string()[i] == 'm'
             || (word->best_choice->unichar_string()[i] == 'r'
                 && word->best_choice->unichar_string()[i + 1] == 'n'))
            && !word->reject_map[i].perm_rejected ()) {
          if (tessedit_cluster_debug)
            tprintf ("Sample %c to check found in %s, index %d\n",
                     word->best_choice->unichar_string()[i],
                     word->best_choice->unichar_string().string (), i);

          if (tessedit_demo_adaption)
            tprintf
              ("Sample %c to check found in %s (%d), index %d\n",
               word->best_choice->unichar_string()[i],
               word->best_choice->unichar_string().string (),
               word_number, i);

          if (tessedit_matrix_match) {
            TBOX copy_box = copy_blob_it.data ()->bounding_box ();

            sample = clip_sample (pixrow_it.data (),
                                  imlines,
                                  pix_box,
                                  copy_outword.flag (W_INVERSE),
                                  word->best_choice->unichar_string()[i]);

            //Clip failed
            if (sample == NULL) {
              tprintf
                ("Unable to clip sample from %s, index %d\n",
                 word->best_choice->unichar_string().string (), i);
              #ifndef SECURE_NAMES
              if (tessedit_cluster_debug)
                tprintf ("Sample rejected (no sample)\n");
              #endif
              word->reject_map[i].setrej_mm_reject ();
              if (word->best_choice->unichar_string()[i] == 'r') {
                word->reject_map[i + 1].setrej_mm_reject ();
                i++;
              }
              continue;
            }
          }
          else
            sample = new CHAR_SAMPLE(blob_it.data(),
                                     &word->denorm,
                                     word->best_choice->unichar_string()[i]);
          // NOTE(review): `sample` is not released anywhere in this
          // function -- confirm ownership before adding a delete.

          best_score = MAX_INT32;
          best_char = '\0';
          best_cluster = NULL;

          // Only clusters that have been assigned a character take part.
          for (c_it.mark_cycle_pt ();
               !c_it.cycled_list (); c_it.forward ()) {
            if (c_it.data ()->character () != '\0') {
              score = c_it.data ()->match_score (sample, this);
              if (score < best_score) {
                best_cluster = c_it.data ();
                best_score = score;
                best_char = c_it.data ()->character ();
              }
            }
          }

          if (best_score > tessedit_cluster_t1) {
            #ifndef SECURE_NAMES
            if (tessedit_cluster_debug)
              tprintf ("Sample rejected (score %f)\n", best_score);
            if (tessedit_demo_adaption)
              tprintf ("Sample rejected (score %f)\n", best_score);
            #endif
            word->reject_map[i].setrej_mm_reject ();
            if (word->best_choice->unichar_string()[i] == 'r')
              word->reject_map[i + 1].setrej_mm_reject ();
          }
          else {
            if (word->best_choice->unichar_string()[i] == best_char) {
              #ifndef SECURE_NAMES
              if (tessedit_cluster_debug)
                tprintf ("Sample accepted (score %f)\n",
                         best_score);
              if (tessedit_demo_adaption)
                tprintf ("Sample accepted (score %f)\n",
                         best_score);
              #endif
              word->reject_map[i].setrej_mm_accept ();
              if (word->best_choice->unichar_string()[i] == 'r')
                word->reject_map[i + 1].setrej_mm_accept ();
            }
            else {
              #ifndef SECURE_NAMES
              if (tessedit_cluster_debug)
                tprintf ("Sample rejected (char %c, score %f)\n",
                         best_char, best_score);
              if (tessedit_demo_adaption)
                tprintf ("Sample rejected (char %c, score %f)\n",
                         best_char, best_score);
              #endif
              word->reject_map[i].setrej_mm_reject ();
              if (word->best_choice->unichar_string()[i] == 'r')
                word->reject_map[i + 1].setrej_mm_reject ();
            }
          }

          if (tessedit_demo_adaption) {
            if (strcmp (imagebasename.string (),
                        tessedit_demo_file.string ()) != 0
                || word_number == tessedit_demo_word1
                || word_number == tessedit_demo_word2) {
              #ifndef GRAPHICS_DISABLED
              demo_win =
                display_clip_image(&copy_outword, page_image,
                                   pixrow_list, pix_box);
              #endif
              demo_word = word_number;
              // best_cluster stays NULL when no assigned cluster exists;
              // do not dereference it in that case.
              if (best_cluster != NULL)
                best_cluster->match_score (sample, this);
              demo_word = 0;
            }
          }
          if (word->best_choice->unichar_string()[i] == 'r')
            i++;                 // Skip next character
        }
      }
      delete[]imlines;           // Free array of imlines
      delete pixrow_list;
    }
  }
}
void Tesseract::adapt_to_good_samples(WERD_RES *word, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); inT16 i; CHAR_SAMPLE *sample; CHAR_SAMPLES_IT c_it = char_clusters; CHAR_SAMPLE_IT cw_it = chars_waiting; float score; float best_score; char best_char; CHAR_SAMPLES *best_cluster; PIXROW_LIST *pixrow_list; PIXROW_IT pixrow_it; IMAGELINE *imlines; // lines of the image TBOX pix_box; // box of imlines // extent WERD copy_outword; // copy to denorm TBOX b_box; PBLOB_IT copy_blob_it; PIXROW *pixrow = NULL; static inT32 word_number = 0; #ifndef GRAPHICS_DISABLED ScrollView* demo_win = NULL; #endif inT32 resolution = page_image.get_res (); word_number++; if (tessedit_test_cluster_input) return; if (word->word->bounding_box ().height () > resolution / 3) return; if (char_clusters->length () == 0) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("No clusters to use for adaption\n"); #endif return; } if (!cw_it.empty ()) { complete_clustering(char_clusters, chars_waiting); print_em_stats(char_clusters, chars_waiting); } if ((!word_adaptable (word, tessedit_cluster_adaption_mode) && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) { if (tessedit_cluster_debug) { tprintf ("\nChecking: \"%s\" MAP ", word->best_choice->unichar_string().string ()); word->reject_map.print (debug_fp); tprintf ("\n"); } copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); copy_blob_it.set_to_list (copy_outword.blob_list ()); char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); pixrow_it.set_to_list (pixrow_list); pixrow_it.move_to_first (); // For debugging only b_box = copy_outword.bounding_box (); pixrow = pixrow_it.data (); blob_it.move_to_first (); copy_blob_it.move_to_first (); for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++, pixrow_it.forward (), blob_it.forward (), copy_blob_it.forward ()) { if 
(word->reject_map[i].recoverable () || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) { TBOX copy_box = copy_blob_it.data ()->bounding_box (); if (tessedit_cluster_debug) tprintf ("Sample %c to check found in %s, index %d\n", word->best_choice->unichar_string()[i], word->best_choice->unichar_string().string (), i); if (tessedit_demo_adaption) tprintf ("Sample %c to check found in %s (%d), index %d\n", word->best_choice->unichar_string()[i], word->best_choice->unichar_string().string (), word_number, i); sample = clip_sample (pixrow_it.data (), imlines, pix_box, copy_outword.flag (W_INVERSE), word->best_choice->unichar_string()[i]); if (sample == NULL) { //Clip failed tprintf ("Unable to clip sample from %s, index %d\n", word->best_choice->unichar_string().string (), i); #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (no sample)\n"); #endif word->reject_map[i].setrej_mm_reject (); continue; } best_score = MAX_INT32; best_char = '\0'; best_cluster = NULL; for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { if (c_it.data ()->character () != '\0') { score = c_it.data ()->match_score (sample, this); if (score < best_score) { best_cluster = c_it.data (); best_score = score; best_char = c_it.data ()->character (); } } } if (best_score > tessedit_cluster_t1) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (score %f)\n", best_score); if (tessedit_demo_adaption) tprintf ("Sample rejected (score %f)\n", best_score); #endif word->reject_map[i].setrej_mm_reject (); } else { if (word->best_choice->unichar_string()[i] == best_char) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample accepted (score %f)\n", best_score); if (tessedit_demo_adaption) tprintf ("Sample accepted (score %f)\n", best_score); #endif if (tessedit_test_adaption) word->reject_map[i].setrej_minimal_rej_accept (); else word->reject_map[i].setrej_mm_accept (); } else { #ifndef SECURE_NAMES if 
(tessedit_cluster_debug) tprintf ("Sample rejected (char %c, score %f)\n", best_char, best_score); if (tessedit_demo_adaption) tprintf ("Sample rejected (char %c, score %f)\n", best_char, best_score); #endif word->reject_map[i].setrej_mm_reject (); } } if (tessedit_demo_adaption) { if (strcmp (imagebasename.string (), tessedit_demo_file.string ()) != 0 || word_number == tessedit_demo_word1 || word_number == tessedit_demo_word2) { #ifndef GRAPHICS_DISABLED demo_win = display_clip_image(©_outword, page_image, pixrow_list, pix_box); #endif demo_word = word_number; best_cluster->match_score (sample, this); demo_word = 0; } } } } delete[]imlines; // Free array of imlines delete pixrow_list; if (tessedit_cluster_debug) { tprintf ("\nFinal: \"%s\" MAP ", word->best_choice->unichar_string().string ()); word->reject_map.print (debug_fp); tprintf ("\n"); } } } } // namespace tesseract void print_em_stats(CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { CHAR_SAMPLES_IT c_it = char_clusters; if (!tessedit_cluster_debug) return; #ifndef SECURE_NAMES tprintf ("There are %d clusters and %d samples waiting\n", char_clusters->length (), chars_waiting->length ()); for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) c_it.data ()->print (debug_fp); #endif tprintf ("\n"); } CHAR_SAMPLE *clip_sample( //lines of the image PIXROW *pixrow, IMAGELINE *imlines, TBOX pix_box, //box of imlines extent BOOL8 white_on_black, char c) { TBOX b_box = pixrow->bounding_box (); float baseline_pos = 0; inT32 resolution = page_image.get_res (); if (!b_box.null_box ()) { ASSERT_HOST (b_box.width () < page_image.get_xsize () && b_box.height () < page_image.get_ysize ()); if (b_box.width () > resolution || b_box.height () > resolution) { tprintf ("clip sample: sample too big (%d x %d)\n", b_box.width (), b_box.height ()); return NULL; } IMAGE *image = new (IMAGE); if (image->create (b_box.width (), b_box.height (), 1) == -1) { tprintf ("clip sample: create image failed (%d 
x %d)\n", b_box.width (), b_box.height ()); delete image; return NULL; } if (!white_on_black) invert_image(image); // Set background to white pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos); if (white_on_black) invert_image(image); //invert white on black for scaling &NN return new CHAR_SAMPLE (image, c); } else return NULL; } #ifndef GRAPHICS_DISABLED void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters) { inT16 proto_number = 0; CHAR_SAMPLES_IT c_it = char_clusters; char title[WINDOWNAMESIZE]; for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { proto_number++; #ifndef SECURE_NAMES tprintf ("Displaying proto number %d\n", proto_number); #endif if (c_it.data ()->prototype () != NULL) { sprintf (title, "Proto - %d", proto_number); display_image (c_it.data ()->prototype ()->make_image (), title, (proto_number - 1) * 400, 0, FALSE); } } } #endif // ********************************************************************* // Simplistic routines to test the effect of rejecting ems and fullstops // ********************************************************************* void reject_all_ems(WERD_RES *word) { inT16 i; for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { if (word->best_choice->unichar_string()[i] == 'm') // reject all ems word->reject_map[i].setrej_mm_reject (); } } void reject_all_fullstops(WERD_RES *word) { inT16 i; for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { if (word->best_choice->unichar_string()[i] == '.') // reject all fullstops word->reject_map[i].setrej_mm_reject (); } } namespace tesseract { void Tesseract::reject_suspect_ems(WERD_RES *word) { inT16 i; if (!word_adaptable (word, tessedit_cluster_adaption_mode)) for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { if (word->best_choice->unichar_string()[i] == 'm' && suspect_em (word, i)) // reject all ems word->reject_map[i].setrej_mm_reject (); } } } // namespace tesseract void 
reject_suspect_fullstops(WERD_RES *word) { inT16 i; for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { if (word->best_choice->unichar_string()[i] == '.' && suspect_fullstop (word, i)) // reject all commas word->reject_map[i].setrej_mm_reject (); } } BOOL8 suspect_em(WERD_RES *word, inT16 index) { PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); inT16 j; for (j = 0; j < index; j++) blob_it.forward (); return (blob_it.data ()->out_list ()->length () != 1); } BOOL8 suspect_fullstop(WERD_RES *word, inT16 i) { float aspect_ratio; PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); inT16 j; TBOX box; inT16 width; inT16 height; for (j = 0; j < i; j++) blob_it.forward (); box = blob_it.data ()->bounding_box (); width = box.width (); height = box.height (); aspect_ratio = ((width > height) ? ((float) width) / height : ((float) height) / width); return (aspect_ratio > tessed_fullstop_aspect_ratio); }