/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/tokenizer.h"
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace libtextclassifier3 {
namespace {
using testing::ElementsAreArray;
// Thin subclass of Tokenizer whose only purpose is to promote the protected
// FindTokenizationRange() member to public so tests can call it directly.
class TestingTokenizer : public Tokenizer {
 public:
  // Forwards all construction arguments unchanged to Tokenizer.
  TestingTokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens)
      : Tokenizer(type, unilib, codepoint_ranges,
                  internal_tokenizer_codepoint_ranges, split_on_script_change,
                  icu_preserve_whitespace_tokens) {}

  // Re-exported for testing; maps a codepoint to its configured range (or
  // nullptr when no range covers it).
  using Tokenizer::FindTokenizationRange;
};
// Test harness that builds a TestingTokenizer from object-API (``...T``)
// config structs. It serializes each config into its own flatbuffer and keeps
// the resulting DetachedBuffers alive for the lifetime of the tokenizer,
// because the tokenizer only holds raw pointers into those buffers.
class TestingTokenizerProxy {
 public:
  TestingTokenizerProxy(
      TokenizationType type,
      const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
      const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens)
      : INIT_UNILIB_FOR_TESTING(unilib_) {
    const int num_configs = codepoint_range_configs.size();
    std::vector<const TokenizationCodepointRange*> configs_fb;
    configs_fb.reserve(num_configs);
    const int num_internal_configs = internal_codepoint_range_configs.size();
    std::vector<const CodepointRange*> internal_configs_fb;
    internal_configs_fb.reserve(num_internal_configs);
    // Reserve up front so buffers_ never reallocates while we are appending;
    // the pointers pushed into configs_fb/internal_configs_fb point into the
    // heap blocks owned by the DetachedBuffers stored here.
    buffers_.reserve(num_configs + num_internal_configs);
    // Serialize each tokenization codepoint-range config into its own
    // flatbuffer and record a typed pointer to its root.
    for (int i = 0; i < num_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(CreateTokenizationCodepointRange(
          builder, &codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
          buffers_.back().data()));
    }
    // Same serialization dance for the internal-tokenizer codepoint ranges.
    for (int i = 0; i < num_internal_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(
          CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      internal_configs_fb.push_back(
          flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
    }
    tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
        type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
        icu_preserve_whitespace_tokens));
  }

  // Returns the role configured for codepoint `c`, or DEFAULT_ROLE when no
  // configured range contains it.
  TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
    const TokenizationCodepointRangeT* range =
        tokenizer_->FindTokenizationRange(c);
    if (range != nullptr) {
      return range->role;
    } else {
      return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
    }
  }

  // Tokenizes `utf8_text` with the tokenizer built in the constructor.
  std::vector<Token> Tokenize(const std::string& utf8_text) const {
    return tokenizer_->Tokenize(utf8_text);
  }

 private:
  UniLib unilib_;
  // Owns the serialized configs; must outlive tokenizer_ (declared before it,
  // so it is destroyed after).
  std::vector<flatbuffers::DetachedBuffer> buffers_;
  std::unique_ptr<TestingTokenizer> tokenizer_;
};
// Verifies that codepoint lookup returns the role of the half-open range
// [start, end) that contains the codepoint, and DEFAULT_ROLE otherwise.
TEST(TokenizerTest, FindTokenizationRange) {
  std::vector<TokenizationCodepointRangeT> configs;
  // Appends one [start, end) range config with the given role.
  auto add_range = [&configs](int start, int end,
                              TokenizationCodepointRange_::Role role) {
    configs.emplace_back();
    TokenizationCodepointRangeT& range = configs.back();
    range.start = start;
    range.end = end;
    range.role = role;
  };
  add_range(0, 10, TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(32, 33, TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  add_range(1234, 12345, TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {}, /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  // Hits inside the first range; its `end` (10) is exclusive.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  // The single-codepoint second range: only 32 hits.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  // Boundaries of the third range: start inclusive, end exclusive.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  // A codepoint covered by no range falls back to DEFAULT_ROLE.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
}
// Basic whitespace splitting: a single config marking ' ' (codepoint 32) as a
// whitespace separator should split "Hello world!" into two tokens with
// codepoint-based begin/end offsets.
TEST(TokenizerTest, TokenizeOnSpace) {
  // Mark only the ASCII space character as a whitespace separator.
  TokenizationCodepointRangeT space_config;
  space_config.start = 32;
  space_config.end = 33;
  space_config.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  std::vector<TokenizationCodepointRangeT> configs;
  configs.push_back(space_config);
  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  const std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");
  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}
// With split_on_script_change=true, the tokenizer must emit a token boundary
// not only on whitespace but also wherever adjacent characters belong to
// different scripts (the configs below assign script_id 1 to the Latin-ish
// ranges; Hangul falls outside them).
// NOTE(review): the closing brace of this test previously carried a stray
// "// namespace" comment — it closes the TEST body, not the namespace.
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;
  // Latin.
  // Control characters below ' ': regular token characters, script 1.
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  // The space character itself: whitespace separator, script 1.
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  config->script_id = 1;
  // Everything up through U+077F: regular token characters, script 1.
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false);
  // Boundaries appear on spaces AND on Hangul<->Latin/digit script changes,
  // e.g. "전화(123)" splits between "전화" and "(123)".
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}
// Mixed-script tokenization with the internal tokenizer: Latin-like scripts
// split on whitespace, while CJK and Thai codepoints are configured as
// TOKEN_SEPARATOR so that every such character becomes its own token.
TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRangeT> configs;
  // Appends one [start, end) codepoint range config with the given role.
  auto add_range = [&configs](int start, int end,
                              TokenizationCodepointRange_::Role role) {
    configs.emplace_back();
    TokenizationCodepointRangeT& range = configs.back();
    range.start = start;
    range.end = end;
    range.role = role;
  };
  constexpr auto kDefault = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  constexpr auto kWhitespace =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  constexpr auto kTokenSep =
      TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  // Block boundaries taken from:
  // http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  //
  // Latin through Arabic Supplement (0000..077F): regular characters, with
  // the space character (0020) acting as the whitespace separator.
  add_range(0, 32, kDefault);
  add_range(32, 33, kWhitespace);
  add_range(33, 0x77F + 1, kDefault);
  // CJK blocks: each character is a token of its own.
  // CJK Radicals Supplement.
  add_range(0x2E80, 0x2EFF + 1, kTokenSep);
  // CJK Symbols and Punctuation through Vai (covers Hiragana, Katakana,
  // Bopomofo, Hangul Jamo, CJK Unified Ideographs, Yi, Lisu, ...).
  add_range(0x3000, 0xA63F + 1, kTokenSep);
  // CJK Compatibility Ideographs.
  add_range(0xF900, 0xFAFF + 1, kTokenSep);
  // CJK Compatibility Forms.
  add_range(0xFE30, 0xFE4F + 1, kTokenSep);
  // CJK Unified Ideographs Extensions B..F and the Compatibility Ideographs
  // Supplement (supplementary planes).
  add_range(0x20000, 0x2A6DF + 1, kTokenSep);
  add_range(0x2A700, 0x2B73F + 1, kTokenSep);
  add_range(0x2B740, 0x2B81F + 1, kTokenSep);
  add_range(0x2B820, 0x2CEAF + 1, kTokenSep);
  add_range(0x2CEB0, 0x2EBEF + 1, kTokenSep);
  add_range(0x2F800, 0x2FA1F + 1, kTokenSep);
  // Thai (0E00..0E7F): also one token per character.
  add_range(0x0E00, 0x0E7F + 1, kTokenSep);
  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  // A pure-CJK sentence: every one of the 30 characters is its own token.
  std::vector<Token> tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30);
  // Mixed CJK / Latin / Thai / kana input: Latin words stay whole, all other
  // characters split individually.
  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}
#ifdef TC3_TEST_ICU
// ICU-backed tokenization of Thai text (no whitespace in the input); ICU's
// word-break iterator supplies the token boundaries.
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  // clang-format off
  const std::vector<Token> expected_tokens = {Token("พระบาท", 0, 6),
                                              Token("สมเด็จ", 6, 12),
                                              Token("พระ", 12, 15),
                                              Token("ปร", 15, 17),
                                              Token("มิ", 17, 19)};
  // clang-format on
  ASSERT_EQ(tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ"), expected_tokens);
}
// Same ICU tokenization, but with icu_preserve_whitespace_tokens=true the
// spaces between words are emitted as their own single-character tokens.
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true);
  // clang-format off
  const std::vector<Token> expected_tokens = {Token("พระบาท", 0, 6),
                                              Token(" ", 6, 7),
                                              Token("สมเด็จ", 7, 13),
                                              Token(" ", 13, 14),
                                              Token("พระ", 14, 17),
                                              Token(" ", 17, 18),
                                              Token("ปร", 18, 20),
                                              Token(" ", 20, 21),
                                              Token("มิ", 21, 23)};
  // clang-format on
  ASSERT_EQ(tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ"), expected_tokens);
}
// Mixed mode: codepoints inside the internal ranges (here [0, 592), i.e.
// Basic Latin through Spacing Modifier Letters) are handled by the internal
// whitespace tokenizer; everything else (CJK) is delegated to ICU.
TEST(TokenizerTest, MixedTokenize) {
  // Internal tokenizer config: split on the ASCII space character only.
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  configs.back().start = 32;
  configs.back().end = 33;
  configs.back().role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  // Ranges routed to the internal tokenizer, expressed as four adjacent
  // blocks covering [0, 592).
  std::vector<CodepointRangeT> internal_configs;
  auto add_internal_range = [&internal_configs](int start, int end) {
    internal_configs.emplace_back();
    internal_configs.back().start = start;
    internal_configs.back().end = end;
  };
  add_internal_range(0, 128);
  add_internal_range(128, 256);
  add_internal_range(256, 384);
  add_internal_range(384, 592);
  TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
                                  internal_configs,
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  // clang-format off
  const std::vector<Token> expected_tokens =
      {Token("こんにちは", 0, 5),
       Token("Japanese-ląnguagę", 5, 22),
       Token("text", 23, 27),
       Token("世界", 28, 30),
       Token("http://www.google.com/", 31, 53)};
  // clang-format on
  const std::vector<Token> tokens = tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens, expected_tokens);
}
// The internal tokenizer with a single catch-all config: without
// split_on_script_change the whole string is one token; with it enabled,
// boundaries appear where the script changes (Hangul <-> digits).
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  TokenizationCodepointRangeT& range = configs.back();
  range.start = 0;
  range.end = 256;
  range.role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  {
    // Script-change splitting disabled: one token spanning the input.
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/false,
                                    /*icu_preserve_whitespace_tokens=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
  }
  {
    // Script-change splitting enabled: the same input breaks at the
    // Hangul/digit boundaries.
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/true,
                                    /*icu_preserve_whitespace_tokens=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
                                  Token("웹사이트", 7, 11)}));
  }
}
#endif
} // namespace
} // namespace libtextclassifier3