/////////////////////////////////////////////////////////////////////// // File: baseapi.h // Description: Simple API for calling tesseract. // Author: Ray Smith // Created: Fri Oct 06 15:35:01 PDT 2006 // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_CCMAIN_BASEAPI_H__ #define TESSERACT_CCMAIN_BASEAPI_H__ #include "thresholder.h" class PAGE_RES; class PAGE_RES_IT; class BLOCK_LIST; class IMAGE; class STRING; struct Pix; struct Box; struct Pixa; struct Boxa; struct ETEXT_STRUCT; struct OSResults; struct TBOX; #define MAX_NUM_INT_FEATURES 512 struct INT_FEATURE_STRUCT; typedef INT_FEATURE_STRUCT *INT_FEATURE; typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]; #ifdef TESSDLL_EXPORTS #define TESSDLL_API __declspec(dllexport) #elif defined(TESSDLL_IMPORTS) #define TESSDLL_API __declspec(dllimport) #else #define TESSDLL_API #endif namespace tesseract { class Dict; class Tesseract; class Trie; class CubeRecoContext; class TesseractCubeCombiner; class CubeObject; class CubeLineObject; class Dawg; typedef int (Dict::*DictFunc)(void* void_dawg_args, int char_index, const void *word, bool word_end); enum PageSegMode { PSM_AUTO, // Fully automatic page segmentation. PSM_SINGLE_COLUMN, // Assume a single column of text of variable sizes. PSM_SINGLE_BLOCK, // Assume a single uniform block of text. (Default.) PSM_SINGLE_LINE, // Treat the image as a single text line. PSM_SINGLE_WORD, // Treat the image as a single word. PSM_SINGLE_CHAR, // Treat the image as a single character. PSM_COUNT // Number of enum entries. }; // The values in the AccuracyVSpeed enum provide hints for how the engine // should trade speed for accuracy. There is no guarantee of any effect. enum AccuracyVSpeed { AVS_FASTEST = 0, // Fastest speed, but lowest accuracy. AVS_MOST_ACCURATE = 100 // Greatest accuracy, but slowest speed. }; // Base class for all tesseract APIs. // Specific classes can add ability to work on different inputs or produce // different outputs. // This class is mostly an interface layer on top of the Tesseract instance // class to hide the data types so that users of this class don't have to // include any other Tesseract headers. class TESSDLL_API TessBaseAPI { public: TessBaseAPI(); virtual ~TessBaseAPI(); // Set the name of the input file. Needed only for training and // reading a UNLV zone file. void SetInputName(const char* name); // Set the name of the bonus output files. Needed only for debugging. void SetOutputName(const char* name); // Set the value of an internal "variable" (of either old or new types). // Supply the name of the variable and the value as a string, just as // you would in a config file. // Returns false if the name lookup failed. // Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. // Or SetVariable("bln_numericmode", "1"); to set numeric-only mode. // SetVariable may be used before Init, but settings will revert to // defaults on End(). bool SetVariable(const char* variable, const char* value); // Eventually instances will be thread-safe and totally independent, // but for now, they all point to the same underlying engine, // and are NOT RE-ENTRANT OR THREAD-SAFE. For now: // it is safe to Init multiple TessBaseAPIs in the same language, use them // sequentially, and End or delete them all, but once one is Ended, you can't // do anything other than End the others. After End, it is safe to Init // again on the same one. // // Start tesseract. Returns zero on success and -1 on failure. // NOTE that the only members that may be called before Init are those // listed above here in the class definition. // // The datapath must be the name of the data directory (no ending /) or // some other file in which the data directory resides (for instance argv[0].) // The language is (usually) an ISO 639-3 string or NULL will default to eng. // It is entirely safe (and eventually will be efficient too) to call // Init multiple times on the same instance to change language, or just // to reset the classifier. // WARNING: On changing languages, all Variables are reset back to their // default values. If you have a rare need to set a Variable that controls // initialization for a second call to Init you should explicitly // call End() and then use SetVariable before Init. This is only a very // rare use case, since there are very few uses that require any variables // to be set before Init. int Init(const char* datapath, const char* language, char **configs, int configs_size, bool configs_global_only); int Init(const char* datapath, const char* language) { return Init(datapath, language, 0, 0, false); } // Init only the lang model component of Tesseract. The only functions // that work after this init are SetVariable and IsValidWord. // WARNING: temporary! This function will be removed from here and placed // in a separate API at some future time. int InitLangMod(const char* datapath, const char* language); // Init everything except the language model. Used to allow initialization for // the specified language without any available dawg models. int InitWithoutLangModel(const char* datapath, const char* language); // Read a "config" file containing a set of variable, value pairs. // Searches the standard places: tessdata/configs, tessdata/tessconfigs // and also accepts a relative or absolute path name. void ReadConfigFile(const char* filename, bool global_only); // Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. // The mode is stored as an INT_VARIABLE so it can also be modified by // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). void SetPageSegMode(PageSegMode mode); // Return the current page segmentation mode. PageSegMode GetPageSegMode() const; // Set the hint for trading accuracy against speed. // Default is AVS_FASTEST, which is the old behaviour. // Note that this is only a hint. Depending on the language and/or // build configuration, speed and accuracy may not be tradeable. // Also note that despite being an enum, any value in the range // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not // have an effect, depending on the implementation. // The mode is stored as an INT_VARIABLE so it can also be modified by // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string). void SetAccuracyVSpeed(AccuracyVSpeed mode); // Recognize a rectangle from an image and return the result as a string. // May be called many times for a single Init. // Currently has no error checking. // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. // Palette color images will not work properly and must be converted to // 24 bit. // Binary images of 1 bit per pixel may also be given but they must be // byte packed with the MSB of the first byte being the first pixel, and a // 1 represents WHITE. For binary images set bytes_per_pixel=0. // The recognized text is returned as a char* which is coded // as UTF8 and must be freed with the delete [] operator. // // Note that TesseractRect is the simplified convenience interface. // For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, // and one or more of the Get*Text functions below. char* TesseractRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height); // Call between pages or documents etc to free up memory and forget // adaptive data. void ClearAdaptiveClassifier(); // ------------------------Advanced API-------------------------------- // The following methods break TesseractRect into pieces, so you can // get hold of the thresholded image, get the text in different formats, // get bounding boxes, confidences etc. // Provide an image for Tesseract to recognize. Format is as // TesseractRect above. Does not copy the image buffer, or take // ownership. The source image may be destroyed after Recognize is called, // either explicitly or implicitly via one of the Get*Text functions. // SetImage clears all recognition results, and sets the rectangle to the // full image, so it may be followed immediately by a GetUTF8Text, and it // will automatically perform recognition. void SetImage(const unsigned char* imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line); // Provide an image for Tesseract to recognize. As with SetImage above, // Tesseract doesn't take a copy or ownership or pixDestroy the image, so // it must persist until after Recognize. // Pix vs raw, which to use? // Use Pix where possible. A future version of Tesseract may choose to use Pix // as its internal representation and discard IMAGE altogether. // Because of that, an implementation that sources and targets Pix may end up // with less copies than an implementation that does not. void SetImage(const Pix* pix); // Restrict recognition to a sub-rectangle of the image. Call after SetImage. // Each SetRectangle clears the recogntion results so multiple rectangles // can be recognized with the same image. void SetRectangle(int left, int top, int width, int height); // In extreme cases only, usually with a subclass of Thresholder, it // is possible to provide a different Thresholder. The Thresholder may // be preloaded with an image, settings etc, or they may be set after. // Note that Tesseract takes ownership of the Thresholder and will // delete it when it it is replaced or the API is destructed. void SetThresholder(ImageThresholder* thresholder) { if (thresholder_ != 0) delete thresholder_; thresholder_ = thresholder; ClearResults(); } // Get a copy of the internal thresholded image from Tesseract. // Caller takes ownership of the Pix and must pixDestroy it. // May be called any time after SetImage, or after TesseractRect. Pix* GetThresholdedImage(); // Get the result of page layout analysis as a leptonica-style // Boxa, Pixa pair, in reading order. // Can be called before or after Recognize. Boxa* GetRegions(Pixa** pixa); // Get the textlines as a leptonica-style // Boxa, Pixa pair, in reading order. // Can be called before or after Recognize. // If blockids is not NULL, the block-id of each line is also returned as an // array of one element per line. delete [] after use. Boxa* GetTextlines(Pixa** pixa, int** blockids); // Get the words as a leptonica-style // Boxa, Pixa pair, in reading order. // Can be called before or after Recognize. Boxa* GetWords(Pixa** pixa); // Dump the internal binary image to a PGM file. // Deprecated. Use GetThresholdedImage and write the image using pixWrite // instead if possible. void DumpPGM(const char* filename); // Recognize the image from SetAndThresholdImage, generating Tesseract // internal structures. Returns 0 on success. // Optional. The Get*Text functions below will call Recognize if needed. // After Recognize, the output is kept internally until the next SetImage. int Recognize(ETEXT_STRUCT* monitor); // Methods to retrieve information after SetAndThresholdImage(), // Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) // Variant on Recognize used for testing chopper. int RecognizeForChopTest(struct ETEXT_STRUCT* monitor); // The recognized text is returned as a char* which is coded // as UTF8 and must be freed with the delete [] operator. char* GetUTF8Text(); // The recognized text is returned as a char* which is coded in the same // format as a box file used in training. Returned string must be freed with // the delete [] operator. // Constructs coordinates in the original image - not just the rectangle. char* GetBoxText(); // The recognized text is returned as a char* which is coded // as UNLV format Latin-1 with specific reject and suspect codes // and must be freed with the delete [] operator. char* GetUNLVText(); // Returns the (average) confidence value between 0 and 100. int MeanTextConf(); // Returns all word confidences (between 0 and 100) in an array, terminated // by -1. The calling function must delete [] after use. // The number of confidences should correspond to the number of space- // delimited words in GetUTF8Text. int* AllWordConfidences(); // Free up recognition results and any stored image data, without actually // freeing any recognition data that would be time-consuming to reload. // Afterwards, you must call SetImage or TesseractRect before doing // any Recognize or Get* operation. void Clear(); // Close down tesseract and free up all memory. End() is equivalent to // destructing and reconstructing your TessBaseAPI. // Once End() has been used, none of the other API functions may be used // other than Init and anything declared above it in the class definition. void End(); // Check whether a word is valid according to Tesseract's language model // returns 0 if the word is invalid, non-zero if valid. // WARNING: temporary! This function will be removed from here and placed // in a separate API at some future time. int IsValidWord(const char *word); bool GetTextDirection(int* out_offset, float* out_slope); // Set the letter_is_okay function to point somewhere else. void SetDictFunc(DictFunc f); // Estimates the Orientation And Script of the image. // Returns true if the image was processed successfully. bool DetectOS(OSResults*); // This method returns the features associated with the input image. void GetFeatures(INT_FEATURE_ARRAY int_features, int* num_features); // Return the pointer to the i-th dawg loaded into tesseract_ object. const Dawg *GetDawg(int i) const; // Return the number of dawgs loaded into tesseract_ object. int NumDawgs() const; // Return the language used in the last valid initialization. const char* GetLastInitLanguage() const; protected: // Common code for setting the image. Returns true if Init has been called. bool InternalSetImage(); // Run the thresholder to make the thresholded image. If pix is not NULL, // the source is thresholded to pix instead of the internal IMAGE. virtual void Threshold(Pix** pix); // Find lines from the image making the BLOCK_LIST. // Returns 0 on success. int FindLines(); // Delete the pageres and block list ready for a new page. void ClearResults(); // Return the length of the output text string, as UTF8, assuming // one newline per line and one per block, with a terminator, // and assuming a single character reject marker for each rejected character. // Also return the number of recognized blobs in blob_count. int TextLength(int* blob_count); // __________________________ ocropus add-ons ___________________________ // Find lines from the image making the BLOCK_LIST. BLOCK_LIST* FindLinesCreateBlockList(); // Delete a block list. // This is to keep BLOCK_LIST pointer opaque // and let go of including the other headers. static void DeleteBlockList(BLOCK_LIST* block_list); // Adapt to recognize the current image as the given character. // The image must be preloaded and be just an image of a single character. void AdaptToCharacter(const char *unichar_repr, int length, float baseline, float xheight, float descender, float ascender); // Recognize text doing one pass only, using settings for a given pass. /*static*/ PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list); /*static*/ PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list, PAGE_RES* pass1_result); // Extract the OCR results, costs (penalty points for uncertainty), // and the bounding boxes of the characters. static int TesseractExtractResult(char** text, int** lengths, float** costs, int** x0, int** y0, int** x1, int** y1, PAGE_RES* page_res); // Call the Cube OCR engine. Takes the Region, line and word segmentation // information from Tesseract as inputs. Makes changes or populates the // output PAGE_RES object which contains the recogntion results. // The behavior of this function depends on the // current language and the value of the tessedit_accuracyvspeed: // For English (and other Latin based scripts): // If the accuracyvspeed flag is set to any value other than AVS_FASTEST, // Cube uses the word information passed by Tesseract. // Cube will run on a subset of the words segmented and recognized by // Tesseract. The value of the accuracyvspeed and the Tesseract // confidence of a word determines whether Cube runs on it or not and // whether Cube's results override Tesseract's // For Arabic & Hindi: // Cube uses the Region information passed by Tesseract. It then performs // its own line segmentation. This will change once Tesseract's line // segmentation works for Arabic. Cube then segments each line into // phrases. Each phrase is then recognized in phrase mode which allows // spaces in the results. // Note that at this point, the line segmentation algorithm might have // some problems with ill spaced Arabic document. int Cube(); // Run Cube on the lines extracted by Tesseract. int RunCubeOnLines(); // Run Cube on a subset of the words already present in the page_res_ object // The subset, and whether Cube overrides the results is determined by // the SpeedVsAccuracy flag int CubePostProcessWords(); // Create a Cube line object for each line CubeLineObject **CreateLineObjects(Pixa* pixa_lines); // Create a TBox array corresponding to the phrases in the array of // line objects TBOX *CreatePhraseBoxes(Boxa* boxa_lines, CubeLineObject **line_objs, int *phrase_cnt); // Recognize the phrases saving the results to the page_res_ object bool RecognizePhrases(int line_cnt, int phrase_cnt, CubeLineObject **line_objs, TBOX *phrase_boxes); // Recognize a single phrase saving the results to the page_res_ object bool RecognizePhrase(CubeObject *phrase, PAGE_RES_IT *result); // Create the necessary Cube Objects bool CreateCubeObjects(); protected: Tesseract* tesseract_; // The underlying data object. ImageThresholder* thresholder_; // Image thresholding module. bool threshold_done_; // Image has been passed to page_image. BLOCK_LIST* block_list_; // The page layout. PAGE_RES* page_res_; // The page-level data. STRING* input_file_; // Name used by training code. STRING* output_file_; // Name used by debug code. STRING* datapath_; // Current location of tessdata. STRING* language_; // Last initialized language. // Parameters saved from the Thresholder. Needed to rebuild coordinates. int rect_left_; int rect_top_; int rect_width_; int rect_height_; int image_width_; int image_height_; }; } // namespace tesseract. #endif // TESSERACT_CCMAIN_BASEAPI_H__