C++程序  |  314行  |  8.21 KB

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "../include/spellingtable.h"

namespace ime_pinyin {

#ifdef ___BUILD_MODEL___

const char SpellingTable::
    kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};

// "" is the biggest, so that all empty strings will be moved to the end
// _eb mean empty is biggest
int compare_raw_spl_eb(const void* p1, const void* p2) {
  if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
    return 1;

  if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
    return -1;

  return strcmp((static_cast<const RawSpelling*>(p1))->str,
                (static_cast<const RawSpelling*>(p2))->str);
}

size_t get_odd_next(size_t value) {
  size_t v_next = value;
  while (true) {
    size_t v_next_sqrt = (size_t)sqrt(v_next);

    bool is_odd = true;
    for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
      if (v_next % v_dv == 0) {
        is_odd = false;
        break;
      }
    }

    if (is_odd)
      return v_next;

    v_next++;
  }

  // never reach here
  return 0;
}

SpellingTable::SpellingTable() {
  need_score_ = false;
  raw_spellings_ = NULL;
  spelling_buf_ = NULL;
  spelling_num_ = 0;
  total_freq_ = 0;
  frozen_ = true;
}

SpellingTable::~SpellingTable() {
  free_resource();
}

size_t SpellingTable::get_hash_pos(const char* spelling_str) {
  size_t hash_pos = 0;
  for (size_t pos = 0; pos < spelling_size_; pos++) {
    if ('\0' == spelling_str[pos])
      break;
    hash_pos += (size_t)spelling_str[pos];
  }

  hash_pos = hash_pos % spelling_max_num_;
  return hash_pos;
}

size_t SpellingTable::hash_pos_next(size_t hash_pos) {
  hash_pos += 123;
  hash_pos = hash_pos % spelling_max_num_;
  return hash_pos;
}

void SpellingTable::free_resource() {
  if (NULL != raw_spellings_)
    delete [] raw_spellings_;
  raw_spellings_ = NULL;

  if (NULL != spelling_buf_)
    delete [] spelling_buf_;
  spelling_buf_ = NULL;
}

bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
                               bool need_score) {
  if (pure_spl_size == 0 || spl_max_num ==0)
    return false;

  need_score_ = need_score;

  free_resource();

  spelling_size_ = pure_spl_size + 1;
  if (need_score)
    spelling_size_ += 1;
  spelling_max_num_ = get_odd_next(spl_max_num);
  spelling_num_ = 0;

  raw_spellings_ = new RawSpelling[spelling_max_num_];
  spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
  if (NULL == raw_spellings_ || NULL == spelling_buf_) {
    free_resource();
    return false;
  }

  memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
  memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
  frozen_ = false;
  total_freq_ = 0;
  return true;
}

bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
  if (frozen_ || NULL == spelling_str)
    return false;

  for (size_t pos = 0; pos < kNotSupportNum; pos++) {
    if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
      return false;
    }
  }

  total_freq_ += freq;

  size_t hash_pos = get_hash_pos(spelling_str);

  raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';

  if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
              spelling_size_ - 1) == 0) {
    raw_spellings_[hash_pos].freq += freq;
    return true;
  }

  size_t hash_pos_ori = hash_pos;

  while (true) {
    if (strncmp(raw_spellings_[hash_pos].str,
                spelling_str, spelling_size_ - 1) == 0) {
      raw_spellings_[hash_pos].freq += freq;
      return true;
    }

    if ('\0' == raw_spellings_[hash_pos].str[0]) {
      raw_spellings_[hash_pos].freq += freq;
      strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
      raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
      spelling_num_++;
      return true;
    }

    hash_pos = hash_pos_next(hash_pos);
    if (hash_pos_ori == hash_pos)
      return false;
  }

  // never reach here
  return false;
}

bool SpellingTable::contain(const char* spelling_str) {
  if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
    return false;

  size_t hash_pos = get_hash_pos(spelling_str);

  if ('\0' == raw_spellings_[hash_pos].str[0])
    return false;

  if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
      == 0)
    return true;

  size_t hash_pos_ori = hash_pos;

  while (true) {
    hash_pos = hash_pos_next(hash_pos);
    if (hash_pos_ori == hash_pos)
      return false;

    if ('\0' == raw_spellings_[hash_pos].str[0])
      return false;

    if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
        == 0)
      return true;
  }

  // never reach here
  return false;
}

const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
  if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
      NULL == item_size || NULL == spl_num)
    return NULL;

  qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
        compare_raw_spl_eb);

  // After sorting, only the first spelling_num_ items are valid.
  // Copy them to the destination buffer.
  for (size_t pos = 0; pos < spelling_num_; pos++) {
    strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
            spelling_size_);
  }

  if (need_score_) {
    if (kPrintDebug0)
      printf("------------Spelling Possiblities--------------\n");

    double max_score = 0;
    double min_score = 0;

    // After sorting, only the first spelling_num_ items are valid.
    for (size_t pos = 0; pos < spelling_num_; pos++) {
      raw_spellings_[pos].freq /= total_freq_;
      if (need_score_) {
        if (0 == pos) {
          max_score = raw_spellings_[0].freq;
          min_score = max_score;
        } else {
          if (raw_spellings_[pos].freq > max_score)
            max_score = raw_spellings_[pos].freq;
          if (raw_spellings_[pos].freq < min_score)
            min_score = raw_spellings_[pos].freq;
        }
      }
    }

    if (kPrintDebug0)
      printf("-----max psb: %f, min psb: %f\n", max_score, min_score);

    max_score = log(max_score);
    min_score = log(min_score);

    if (kPrintDebug0)
      printf("-----max log value: %f, min log value: %f\n",
             max_score, min_score);

    // The absolute value of min_score is bigger than that of max_score because
    // both of them are negative after log function.
    score_amplifier_ = 1.0 * 255 / min_score;

    double average_score = 0;
    for (size_t pos = 0; pos < spelling_num_; pos++) {
      double score = log(raw_spellings_[pos].freq) * score_amplifier_;
      assert(score >= 0);

      average_score += score;

      // Because of calculation precision issue, score might be a little bigger
      // than 255 after being amplified.
      if (score > 255)
        score = 255;
      char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
      this_spl_buf[spelling_size_ - 1] =
          static_cast<char>((unsigned char)score);

      if (kPrintDebug0) {
        printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
               (unsigned char)this_spl_buf[spelling_size_ -1]);
      }
    }
    average_score /= spelling_num_;
    assert(average_score <= 255);
    average_score_ = static_cast<uint8>(average_score);

    if (kPrintDebug0)
      printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
             average_score_);
  }

  *item_size = spelling_size_;
  *spl_num = spelling_num_;
  frozen_ = true;
  return spelling_buf_;
}

float SpellingTable::get_score_amplifier() {
  return static_cast<float>(score_amplifier_);
}

unsigned char SpellingTable::get_average_score() {
  return average_score_;
}

#endif  // ___BUILD_MODEL___
}  // namespace ime_pinyin