/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <assert.h> #include "../include/splparser.h" namespace ime_pinyin { SpellingParser::SpellingParser() { spl_trie_ = SpellingTrie::get_cpinstance(); } bool SpellingParser::is_valid_to_parse(char ch) { return SpellingTrie::is_valid_spl_char(ch); } uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, uint16 spl_idx[], uint16 start_pos[], uint16 max_size, bool &last_is_pre) { if (NULL == splstr || 0 == max_size || 0 == str_len) return 0; if (!SpellingTrie::is_valid_spl_char(splstr[0])) return 0; last_is_pre = false; const SpellingNode *node_this = spl_trie_->root_; uint16 str_pos = 0; uint16 idx_num = 0; if (NULL != start_pos) start_pos[0] = 0; bool last_is_splitter = false; while (str_pos < str_len) { char char_this = splstr[str_pos]; // all characters outside of [a, z] are considered as splitters if (!SpellingTrie::is_valid_spl_char(char_this)) { // test if the current node is endable uint16 id_this = node_this->spelling_idx; if (spl_trie_->if_valid_id_update(&id_this)) { spl_idx[idx_num] = id_this; idx_num++; str_pos++; if (NULL != start_pos) start_pos[idx_num] = str_pos; if (idx_num >= max_size) return idx_num; node_this = spl_trie_->root_; last_is_splitter = true; continue; } else { if (last_is_splitter) { str_pos++; if (NULL != start_pos) start_pos[idx_num] = str_pos; continue; } else { return idx_num; } } } last_is_splitter = false; SpellingNode *found_son = NULL; if (0 == str_pos) { if (char_this >= 'a') found_son = spl_trie_->level1_sons_[char_this - 'a']; else found_son = spl_trie_->level1_sons_[char_this - 'A']; } else { SpellingNode *first_son = node_this->first_son; // Because for Zh/Ch/Sh nodes, they are the last in the buffer and // frequently used, so we scan from the end. for (int i = 0; i < node_this->num_of_son; i++) { SpellingNode *this_son = first_son + i; if (SpellingTrie::is_same_spl_char( this_son->char_this_node, char_this)) { found_son = this_son; break; } } } // found, just move the current node pointer to the the son if (NULL != found_son) { node_this = found_son; } else { // not found, test if it is endable uint16 id_this = node_this->spelling_idx; if (spl_trie_->if_valid_id_update(&id_this)) { // endable, remember the index spl_idx[idx_num] = id_this; idx_num++; if (NULL != start_pos) start_pos[idx_num] = str_pos; if (idx_num >= max_size) return idx_num; node_this = spl_trie_->root_; continue; } else { return idx_num; } } str_pos++; } uint16 id_this = node_this->spelling_idx; if (spl_trie_->if_valid_id_update(&id_this)) { // endable, remember the index spl_idx[idx_num] = id_this; idx_num++; if (NULL != start_pos) start_pos[idx_num] = str_pos; } last_is_pre = !last_is_splitter; return idx_num; } uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, uint16 spl_idx[], uint16 start_pos[], uint16 max_size, bool &last_is_pre) { uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, max_size, last_is_pre); for (uint16 pos = 0; pos < idx_num; pos++) { if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); if (pos == idx_num - 1) { last_is_pre = false; } } } return idx_num; } uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, uint16 spl_idx[], uint16 start_pos[], uint16 max_size, bool &last_is_pre) { if (NULL == splstr || 0 == max_size || 0 == str_len) return 0; if (!SpellingTrie::is_valid_spl_char(splstr[0])) return 0; last_is_pre = false; const SpellingNode *node_this = spl_trie_->root_; uint16 str_pos = 0; uint16 idx_num = 0; if (NULL != start_pos) start_pos[0] = 0; bool last_is_splitter = false; while (str_pos < str_len) { char16 char_this = splstr[str_pos]; // all characters outside of [a, z] are considered as splitters if (!SpellingTrie::is_valid_spl_char(char_this)) { // test if the current node is endable uint16 id_this = node_this->spelling_idx; if (spl_trie_->if_valid_id_update(&id_this)) { spl_idx[idx_num] = id_this; idx_num++; str_pos++; if (NULL != start_pos) start_pos[idx_num] = str_pos; if (idx_num >= max_size) return idx_num; node_this = spl_trie_->root_; last_is_splitter = true; continue; } else { if (last_is_splitter) { str_pos++; if (NULL != start_pos) start_pos[idx_num] = str_pos; continue; } else { return idx_num; } } } last_is_splitter = false; SpellingNode *found_son = NULL; if (0 == str_pos) { if (char_this >= 'a') found_son = spl_trie_->level1_sons_[char_this - 'a']; else found_son = spl_trie_->level1_sons_[char_this - 'A']; } else { SpellingNode *first_son = node_this->first_son; // Because for Zh/Ch/Sh nodes, they are the last in the buffer and // frequently used, so we scan from the end. for (int i = 0; i < node_this->num_of_son; i++) { SpellingNode *this_son = first_son + i; if (SpellingTrie::is_same_spl_char( this_son->char_this_node, char_this)) { found_son = this_son; break; } } } // found, just move the current node pointer to the the son if (NULL != found_son) { node_this = found_son; } else { // not found, test if it is endable uint16 id_this = node_this->spelling_idx; if (spl_trie_->if_valid_id_update(&id_this)) { // endable, remember the index spl_idx[idx_num] = id_this; idx_num++; if (NULL != start_pos) start_pos[idx_num] = str_pos; if (idx_num >= max_size) return idx_num; node_this = spl_trie_->root_; continue; } else { return idx_num; } } str_pos++; } uint16 id_this = node_this->spelling_idx; if (spl_trie_->if_valid_id_update(&id_this)) { // endable, remember the index spl_idx[idx_num] = id_this; idx_num++; if (NULL != start_pos) start_pos[idx_num] = str_pos; } last_is_pre = !last_is_splitter; return idx_num; } uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, uint16 spl_idx[], uint16 start_pos[], uint16 max_size, bool &last_is_pre) { uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, max_size, last_is_pre); for (uint16 pos = 0; pos < idx_num; pos++) { if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); if (pos == idx_num - 1) { last_is_pre = false; } } } return idx_num; } uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, bool *is_pre) { if (NULL == is_pre) return 0; uint16 spl_idx[2]; uint16 start_pos[3]; if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) return 0; if (start_pos[1] != str_len) return 0; return spl_idx[0]; } uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, bool *is_pre) { if (NULL == is_pre) return 0; uint16 spl_idx[2]; uint16 start_pos[3]; if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) return 0; if (start_pos[1] != str_len) return 0; if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { spl_trie_->half_to_full(spl_idx[0], spl_idx); *is_pre = false; } return spl_idx[0]; } uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, uint16 splidx[], uint16 max_size, uint16 &full_id_num, bool &is_pre) { if (max_size <= 0 || !is_valid_to_parse(splstr[0])) return 0; splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); full_id_num = 0; if (0 != splidx[0]) { if (splidx[0] >= kFullSplIdStart) full_id_num = 1; return 1; } return 0; } } // namespace ime_pinyin