/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <assert.h> #include <math.h> #include <stdio.h> #include <string.h> #include <time.h> #include "../include/mystdlib.h" #include "../include/ngram.h" namespace ime_pinyin { #define ADD_COUNT 0.3 int comp_double(const void *p1, const void *p2) { if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2)) return -1; if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2)) return 1; return 0; } inline double distance(double freq, double code) { // return fabs(freq - code); return freq * fabs(log(freq) - log(code)); } // Find the index of the code value which is nearest to the given freq int qsearch_nearest(double code_book[], double freq, int start, int end) { if (start == end) return start; if (start + 1 == end) { if (distance(freq, code_book[end]) > distance(freq, code_book[start])) return start; return end; } int mid = (start + end) / 2; if (code_book[mid] > freq) return qsearch_nearest(code_book, freq, start, mid); else return qsearch_nearest(code_book, freq, mid, end); } size_t update_code_idx(double freqs[], size_t num, double code_book[], CODEBOOK_TYPE *code_idx) { size_t changed = 0; for (size_t pos = 0; pos < num; pos++) { CODEBOOK_TYPE idx; idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); if (idx != code_idx[pos]) changed++; code_idx[pos] = idx; } return changed; } double recalculate_kernel(double freqs[], size_t num, double code_book[], CODEBOOK_TYPE *code_idx) { double ret = 0; size_t *item_num = new size_t[kCodeBookSize]; assert(item_num); memset(item_num, 0, sizeof(size_t) * kCodeBookSize); double *cb_new = new double[kCodeBookSize]; assert(cb_new); memset(cb_new, 0, sizeof(double) * kCodeBookSize); for (size_t pos = 0; pos < num; pos++) { ret += distance(freqs[pos], code_book[code_idx[pos]]); cb_new[code_idx[pos]] += freqs[pos]; item_num[code_idx[pos]] += 1; } for (size_t code = 0; code < kCodeBookSize; code++) { assert(item_num[code] > 0); code_book[code] = cb_new[code] / item_num[code]; } delete [] item_num; delete [] cb_new; return ret; } void iterate_codes(double freqs[], size_t num, double code_book[], CODEBOOK_TYPE *code_idx) { size_t iter_num = 0; double delta_last = 0; do { size_t changed = update_code_idx(freqs, num, code_book, code_idx); double delta = recalculate_kernel(freqs, num, code_book, code_idx); if (kPrintDebug0) { printf("---Unigram codebook iteration: %d : %d, %.9f\n", iter_num, changed, delta); } iter_num++; if (iter_num > 1 && (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) break; delta_last = delta; } while (true); } NGram* NGram::instance_ = NULL; NGram::NGram() { initialized_ = false; idx_num_ = 0; lma_freq_idx_ = NULL; sys_score_compensation_ = 0; #ifdef ___BUILD_MODEL___ freq_codes_df_ = NULL; #endif freq_codes_ = NULL; } NGram::~NGram() { if (NULL != lma_freq_idx_) free(lma_freq_idx_); #ifdef ___BUILD_MODEL___ if (NULL != freq_codes_df_) free(freq_codes_df_); #endif if (NULL != freq_codes_) free(freq_codes_); } NGram& NGram::get_instance() { if (NULL == instance_) instance_ = new NGram(); return *instance_; } bool NGram::save_ngram(FILE *fp) { if (!initialized_ || NULL == fp) return false; if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) return false; if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1) return false; if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != kCodeBookSize) return false; if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) return false; return true; } bool NGram::load_ngram(FILE *fp) { if (NULL == fp) return false; initialized_ = false; if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) return false; if (NULL != lma_freq_idx_) free(lma_freq_idx_); if (NULL != freq_codes_) free(freq_codes_); lma_freq_idx_ = static_cast<CODEBOOK_TYPE*> (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); freq_codes_ = static_cast<LmaScoreType*> (malloc(kCodeBookSize * sizeof(LmaScoreType))); if (NULL == lma_freq_idx_ || NULL == freq_codes_) return false; if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != kCodeBookSize) return false; if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) return false; initialized_ = true; total_freq_none_sys_ = 0; return true; } void NGram::set_total_freq_none_sys(size_t freq_none_sys) { total_freq_none_sys_ = freq_none_sys; if (0 == total_freq_none_sys_) { sys_score_compensation_ = 0; } else { double factor = static_cast<double>(kSysDictTotalFreq) / ( kSysDictTotalFreq + total_freq_none_sys_); sys_score_compensation_ = static_cast<float>( log(factor) * kLogValueAmplifier); } } // The caller makes sure this oject is initialized. float NGram::get_uni_psb(LemmaIdType lma_id) { return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) + sys_score_compensation_; } float NGram::convert_psb_to_score(double psb) { float score = static_cast<float>( log(psb) * static_cast<double>(kLogValueAmplifier)); if (score > static_cast<float>(kMaxScore)) { score = static_cast<float>(kMaxScore); } return score; } #ifdef ___BUILD_MODEL___ bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, LemmaIdType next_idx_unused) { if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) return false; double total_freq = 0; double *freqs = new double[next_idx_unused]; if (NULL == freqs) return false; freqs[0] = ADD_COUNT; total_freq += freqs[0]; LemmaIdType idx_now = 0; for (size_t pos = 0; pos < lemma_num; pos++) { if (lemma_arr[pos].idx_by_hz == idx_now) continue; idx_now++; assert(lemma_arr[pos].idx_by_hz == idx_now); freqs[idx_now] = lemma_arr[pos].freq; if (freqs[idx_now] <= 0) freqs[idx_now] = 0.3; total_freq += freqs[idx_now]; } double max_freq = 0; idx_num_ = idx_now + 1; assert(idx_now + 1 == next_idx_unused); for (size_t pos = 0; pos < idx_num_; pos++) { freqs[pos] = freqs[pos] / total_freq; assert(freqs[pos] > 0); if (freqs[pos] > max_freq) max_freq = freqs[pos]; } // calculate the code book if (NULL == freq_codes_df_) freq_codes_df_ = new double[kCodeBookSize]; assert(freq_codes_df_); memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); if (NULL == freq_codes_) freq_codes_ = new LmaScoreType[kCodeBookSize]; assert(freq_codes_); memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); size_t freq_pos = 0; for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { bool found = true; while (found) { found = false; double cand = freqs[freq_pos]; for (size_t i = 0; i < code_pos; i++) if (freq_codes_df_[i] == cand) { found = true; break; } if (found) freq_pos++; } freq_codes_df_[code_pos] = freqs[freq_pos]; freq_pos++; } myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); if (NULL == lma_freq_idx_) lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; assert(lma_freq_idx_); iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); delete [] freqs; if (kPrintDebug0) { printf("\n------Language Model Unigram Codebook------\n"); } for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { double log_score = log(freq_codes_df_[code_pos]); float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); if (kPrintDebug0) { printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", code_pos, freq_codes_df_[code_pos], log_score, final_score); } freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score); } initialized_ = true; return true; } #endif } // namespace ime_pinyin