/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <assert.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include "../include/spellingtable.h" namespace ime_pinyin { #ifdef ___BUILD_MODEL___ const char SpellingTable:: kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"}; // "" is the biggest, so that all empty strings will be moved to the end // _eb mean empty is biggest int compare_raw_spl_eb(const void* p1, const void* p2) { if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0]) return 1; if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0]) return -1; return strcmp((static_cast<const RawSpelling*>(p1))->str, (static_cast<const RawSpelling*>(p2))->str); } size_t get_odd_next(size_t value) { size_t v_next = value; while (true) { size_t v_next_sqrt = (size_t)sqrt(v_next); bool is_odd = true; for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) { if (v_next % v_dv == 0) { is_odd = false; break; } } if (is_odd) return v_next; v_next++; } // never reach here return 0; } SpellingTable::SpellingTable() { need_score_ = false; raw_spellings_ = NULL; spelling_buf_ = NULL; spelling_num_ = 0; total_freq_ = 0; frozen_ = true; } SpellingTable::~SpellingTable() { free_resource(); } size_t SpellingTable::get_hash_pos(const char* spelling_str) { size_t hash_pos = 0; for (size_t pos = 0; pos < spelling_size_; pos++) { if ('\0' == spelling_str[pos]) break; hash_pos += (size_t)spelling_str[pos]; } hash_pos = hash_pos % spelling_max_num_; return hash_pos; } size_t SpellingTable::hash_pos_next(size_t hash_pos) { hash_pos += 123; hash_pos = hash_pos % spelling_max_num_; return hash_pos; } void SpellingTable::free_resource() { if (NULL != raw_spellings_) delete [] raw_spellings_; raw_spellings_ = NULL; if (NULL != spelling_buf_) delete [] spelling_buf_; spelling_buf_ = NULL; } bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num, bool need_score) { if (pure_spl_size == 0 || spl_max_num ==0) return false; need_score_ = need_score; free_resource(); spelling_size_ = pure_spl_size + 1; if (need_score) spelling_size_ += 1; spelling_max_num_ = get_odd_next(spl_max_num); spelling_num_ = 0; raw_spellings_ = new RawSpelling[spelling_max_num_]; spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)]; if (NULL == raw_spellings_ || NULL == spelling_buf_) { free_resource(); return false; } memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling)); memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_)); frozen_ = false; total_freq_ = 0; return true; } bool SpellingTable::put_spelling(const char* spelling_str, double freq) { if (frozen_ || NULL == spelling_str) return false; for (size_t pos = 0; pos < kNotSupportNum; pos++) { if (strcmp(spelling_str, kNotSupportList[pos]) == 0) { return false; } } total_freq_ += freq; size_t hash_pos = get_hash_pos(spelling_str); raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) == 0) { raw_spellings_[hash_pos].freq += freq; return true; } size_t hash_pos_ori = hash_pos; while (true) { if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) == 0) { raw_spellings_[hash_pos].freq += freq; return true; } if ('\0' == raw_spellings_[hash_pos].str[0]) { raw_spellings_[hash_pos].freq += freq; strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1); raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; spelling_num_++; return true; } hash_pos = hash_pos_next(hash_pos); if (hash_pos_ori == hash_pos) return false; } // never reach here return false; } bool SpellingTable::contain(const char* spelling_str) { if (NULL == spelling_str || NULL == spelling_buf_ || frozen_) return false; size_t hash_pos = get_hash_pos(spelling_str); if ('\0' == raw_spellings_[hash_pos].str[0]) return false; if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) == 0) return true; size_t hash_pos_ori = hash_pos; while (true) { hash_pos = hash_pos_next(hash_pos); if (hash_pos_ori == hash_pos) return false; if ('\0' == raw_spellings_[hash_pos].str[0]) return false; if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) == 0) return true; } // never reach here return false; } const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) { if (NULL == raw_spellings_ || NULL == spelling_buf_ || NULL == item_size || NULL == spl_num) return NULL; qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling), compare_raw_spl_eb); // After sorting, only the first spelling_num_ items are valid. // Copy them to the destination buffer. for (size_t pos = 0; pos < spelling_num_; pos++) { strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str, spelling_size_); } if (need_score_) { if (kPrintDebug0) printf("------------Spelling Possiblities--------------\n"); double max_score = 0; double min_score = 0; // After sorting, only the first spelling_num_ items are valid. for (size_t pos = 0; pos < spelling_num_; pos++) { raw_spellings_[pos].freq /= total_freq_; if (need_score_) { if (0 == pos) { max_score = raw_spellings_[0].freq; min_score = max_score; } else { if (raw_spellings_[pos].freq > max_score) max_score = raw_spellings_[pos].freq; if (raw_spellings_[pos].freq < min_score) min_score = raw_spellings_[pos].freq; } } } if (kPrintDebug0) printf("-----max psb: %f, min psb: %f\n", max_score, min_score); max_score = log(max_score); min_score = log(min_score); if (kPrintDebug0) printf("-----max log value: %f, min log value: %f\n", max_score, min_score); // The absolute value of min_score is bigger than that of max_score because // both of them are negative after log function. score_amplifier_ = 1.0 * 255 / min_score; double average_score = 0; for (size_t pos = 0; pos < spelling_num_; pos++) { double score = log(raw_spellings_[pos].freq) * score_amplifier_; assert(score >= 0); average_score += score; // Because of calculation precision issue, score might be a little bigger // than 255 after being amplified. if (score > 255) score = 255; char *this_spl_buf = spelling_buf_ + pos * spelling_size_; this_spl_buf[spelling_size_ - 1] = static_cast<char>((unsigned char)score); if (kPrintDebug0) { printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf, (unsigned char)this_spl_buf[spelling_size_ -1]); } } average_score /= spelling_num_; assert(average_score <= 255); average_score_ = static_cast<uint8>(average_score); if (kPrintDebug0) printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_, average_score_); } *item_size = spelling_size_; *spl_num = spelling_num_; frozen_ = true; return spelling_buf_; } float SpellingTable::get_score_amplifier() { return static_cast<float>(score_amplifier_); } unsigned char SpellingTable::get_average_score() { return average_score_; } #endif // ___BUILD_MODEL___ } // namespace ime_pinyin