/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef PINYINIME_INCLUDE_NGRAM_H__ #define PINYINIME_INCLUDE_NGRAM_H__ #include <stdio.h> #include <stdlib.h> #include "./dictdef.h" namespace ime_pinyin { typedef unsigned char CODEBOOK_TYPE; static const size_t kCodeBookSize = 256; class NGram { public: // The maximum score of a lemma item. static const LmaScoreType kMaxScore = 0x3fff; // In order to reduce the storage size, the original log value is amplified by // kScoreAmplifier, and we use LmaScoreType to store. // After this process, an item with a lower score has a higher frequency. static const int kLogValueAmplifier = -800; // System words' total frequency. It is not the real total frequency, instead, // It is only used to adjust system lemmas' scores when the user dictionary's // total frequency changes. // In this version, frequencies of system lemmas are fixed. We are considering // to make them changable in next version. static const size_t kSysDictTotalFreq = 100000000; private: static NGram* instance_; bool initialized_; size_t idx_num_; size_t total_freq_none_sys_; // Score compensation for system dictionary lemmas. // Because after user adds some user lemmas, the total frequency changes, and // we use this value to normalize the score. float sys_score_compensation_; #ifdef ___BUILD_MODEL___ double *freq_codes_df_; #endif LmaScoreType *freq_codes_; CODEBOOK_TYPE *lma_freq_idx_; public: NGram(); ~NGram(); static NGram& get_instance(); bool save_ngram(FILE *fp); bool load_ngram(FILE *fp); // Set the total frequency of all none system dictionaries. void set_total_freq_none_sys(size_t freq_none_sys); float get_uni_psb(LemmaIdType lma_id); // Convert a probability to score. Actually, the score will be limited to // kMaxScore, but at runtime, we also need float expression to get accurate // value of the score. // After the conversion, a lower score indicates a higher probability of the // item. static float convert_psb_to_score(double psb); #ifdef ___BUILD_MODEL___ // For constructing the unigram mode model. bool build_unigram(LemmaEntry *lemma_arr, size_t num, LemmaIdType next_idx_unused); #endif }; } #endif // PINYINIME_INCLUDE_NGRAM_H__