/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/custom-tokenizer.h"

#include <ctype.h>

#include <string>

#include "util/strings/utf8.h"

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

namespace {

// Returns true iff the UTF8 character that starts at |curr| and spans
// |num_bytes| bytes is a token separator.  Only single-byte characters can be
// separators: any single-byte character that isalpha() rejects (digits,
// punctuation, whitespace, ...) separates tokens.  Multi-byte characters are
// always treated as part of a token.
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // The <ctype.h> functions require an argument representable as unsigned
  // char (or EOF).  Plain char may be signed, so a byte >= 0x80 would be
  // passed as a negative int, which is undefined behavior.  Convert first.
  // NOTE(review): this matters only if the UTF8 helpers can report a 1-byte
  // length for such a byte (e.g., a stray continuation byte) - confirm in
  // util/strings/utf8.h.
  return !isalpha(static_cast<unsigned char>(*curr));
}

}  // namespace

// Returns a pointer inside [data, data + size] that marks the end of the
// "safe" prefix of the input: the longest prefix that consists only of
// complete characters, as measured by GetNumBytesForUTF8Char().
//
// The scan stops early in two cases:
//  - GetNumBytesForUTF8Char() returns 0 for the current position (presumably
//    a NUL byte - see util/strings/utf8.h); the returned pointer then points
//    at that position.
//  - the current character would extend past data + size (truncated
//    multi-byte sequence); the returned pointer then points at the start of
//    that incomplete character.
const char *GetSafeEndOfString(const char *data, size_t size) {
  const char *const hard_end = data + size;
  const char *curr = data;
  while (curr < hard_end) {
    int num_bytes = GetNumBytesForUTF8Char(curr);
    if (num_bytes == 0) {
      break;
    }
    const char *new_curr = curr + num_bytes;
    if (new_curr > hard_end) {
      // Incomplete trailing character: cut the safe region right before it.
      return curr;
    }
    curr = new_curr;
  }
  return curr;
}

// Splits |text| into tokens and appends them to |sentence| via add_word().
// A token is a maximal run of non-separator characters (see
// IsTokenSeparator()); each emitted word is wrapped as "^<token>$".  Only the
// safe prefix of |text| (see GetSafeEndOfString()) is examined.
void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
  const char *const start = text.data();
  const char *curr = start;
  const char *end = GetSafeEndOfString(start, text.size());

  // Corner case: empty safe part of the text.
  if (curr >= end) {
    return;
  }

  // Number of bytes for UTF8 character starting at *curr.  Note: the loop
  // below is guaranteed to terminate because in each iteration, we move curr
  // by at least num_bytes, and num_bytes is guaranteed to be > 0.
  //
  // Invariant: whenever num_bytes is read, it describes the character at
  // curr, and curr < end.
  int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
  while (curr < end) {
    // Jump over consecutive token separators.
    while (IsTokenSeparator(num_bytes, curr)) {
      curr += num_bytes;
      if (curr >= end) {
        return;
      }
      num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
    }

    // If control reaches this point, we are at beginning of a non-empty token.
    std::string *word = sentence->add_word();

    // Add special token-start character.
    word->push_back('^');

    // Add UTF8 characters to word, until we hit the end of the safe text or a
    // token separator.
    while (true) {
      word->append(curr, num_bytes);
      curr += num_bytes;
      if (curr >= end) {
        break;
      }
      num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
      if (IsTokenSeparator(num_bytes, curr)) {
        // Leave the separator in place: the separator-skipping loop at the
        // top of the outer loop consumes it and checks the end bound before
        // looking at the next character, so we never inspect bytes at or
        // past |end|.
        break;
      }
    }

    // Add special token-end character.
    word->push_back('$');

    // Note: we intentionally do not token.set_start()/end(), as those fields
    // are not used by the langid model.
  }
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier