///////////////////////////////////////////////////////////////////////
// File:        tessdatamanager.cpp
// Description: Functions to handle loading/combining tesseract data files.
// Author:      Daria Antonova
// Created:     Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "tessdatamanager.h"

#include <stdio.h>

#include "serialis.h"
#include "strngs.h"
#include "tprintf.h"
#include "varable.h"

BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");

INT_VAR(global_tessdata_manager_debug_level, 0,
        "Debug level for TessdataManager functions.");

namespace tesseract {

void TessdataManager::Init(const char *data_file_name) {
  int i;
  data_file_ = fopen(data_file_name, "rb");
  if (data_file_ == NULL) {
    tprintf("Error openning data file %s\n", data_file_name);
    exit(1);
  }
  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
  bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
  if (swap) {
    actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
  }
  ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
  fread(offset_table_, sizeof(inT64),
        actual_tessdata_num_entries_, data_file_);
  if (swap) {
    for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
      offset_table_[i] = reverse64(offset_table_[i]);
    }
  }
  if (global_tessdata_manager_debug_level) {
    tprintf("TessdataManager loaded %d types of tesseract data files.\n",
            actual_tessdata_num_entries_);
    for (i = 0; i < actual_tessdata_num_entries_; ++i) {
      tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
    }
  }
}

FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
                                  const char *file_suffix, bool required_file,
                                  bool text_file) {
  STRING file_name = language_data_path_prefix;
  file_name += file_suffix;
  FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
  if (required_file && (file_ptr == NULL)) {
    tprintf("Error openning required file %s\n", file_name.string());
    exit(1);
  }
  return file_ptr;
}

void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
                               bool newline_end) {
  int buffer_size = 1024;
  char *chunk = new char[buffer_size];
  int bytes_read;
  char last_char = 0x0;
  while ((bytes_read = fread(chunk, sizeof(char),
                             buffer_size, input_file))) {
    fwrite(chunk, sizeof(char), bytes_read, output_file);
    last_char = chunk[bytes_read-1];
  }
  if (newline_end) ASSERT_HOST(last_char == '\n');
  delete[] chunk;
}

void TessdataManager::CombineDataFiles(
    const char *language_data_path_prefix,
    const char *output_filename) {
  FILE *file_ptr;
  STRING file_name;
  int i;
  inT64 offset_table[TESSDATA_NUM_ENTRIES];
  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
  FILE *output_file = fopen(output_filename, "wb");
  // Leave some space for recording the offset_table.
  fseek(output_file,
        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);

  // Record language-specific tesseract config file.
  file_ptr = GetFilePtr(language_data_path_prefix,
                        kLangConfigFileSuffix, false, true);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
    CopyFile(file_ptr, output_file, true);
    fclose(file_ptr);
  }

  // Record unicharset.
  file_ptr = GetFilePtr(language_data_path_prefix,
                        kUnicharsetFileSuffix, true, true);
  offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
  CopyFile(file_ptr, output_file, true);
  fclose(file_ptr);

  // Record ambiguities.
  file_ptr = GetFilePtr(language_data_path_prefix,
                        kAmbigsFileSuffix, false, true);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_AMBIGS] = ftell(output_file);
    CopyFile(file_ptr, output_file, true);
    fclose(file_ptr);
  }

  // Record inttemp.
  file_ptr =
    GetFilePtr(language_data_path_prefix,
               kBuiltInTemplatesFileSuffix, false, false);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_INTTEMP] = ftell(output_file);
    CopyFile(file_ptr, output_file, false);
    fclose(file_ptr);

    // Record pffmtable.
    file_ptr = GetFilePtr(language_data_path_prefix,
                          kBuiltInCutoffsFileSuffix, true, true);
    offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
    CopyFile(file_ptr, output_file, true);
    fclose(file_ptr);

    // Record normproto.
    file_ptr = GetFilePtr(language_data_path_prefix,
                          kNormProtoFileSuffix, true, true);
    offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
    CopyFile(file_ptr, output_file, true);
    fclose(file_ptr);
  }

  // Record dawgs.
  file_ptr = GetFilePtr(language_data_path_prefix,
                        kPuncDawgFileSuffix, false, false);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
    CopyFile(file_ptr, output_file, false);
    fclose(file_ptr);
  }

  file_ptr = GetFilePtr(language_data_path_prefix,
                        kSystemDawgFileSuffix, false, false);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
    CopyFile(file_ptr, output_file, false);
    fclose(file_ptr);
  }

  file_ptr = GetFilePtr(language_data_path_prefix,
                        kNumberDawgFileSuffix, false, false);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
    CopyFile(file_ptr, output_file, false);
    fclose(file_ptr);
  }

  file_ptr = GetFilePtr(language_data_path_prefix,
                        kFreqDawgFileSuffix, false, false);
  if (file_ptr != NULL) {
    offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
    CopyFile(file_ptr, output_file, false);
    fclose(file_ptr);
  }

  fseek(output_file, 0, SEEK_SET);
  inT32 num_entries = TESSDATA_NUM_ENTRIES;
  fwrite(&num_entries, sizeof(inT32), 1, output_file);
  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
  fclose(output_file);

  tprintf("TessdataManager combined tesseract data files.\n");
  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
  }
}

}  // namespace tesseract