/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/lang-id.h"

#include <stdio.h>

#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <vector>

#include "common/algorithm.h"
#include "common/embedding-network-params-from-proto.h"
#include "common/embedding-network.pb.h"
#include "common/embedding-network.h"
#include "common/feature-extractor.h"
#include "common/file-utils.h"
#include "common/list-of-strings.pb.h"
#include "common/memory_image/in-memory-model-data.h"
#include "common/mmap.h"
#include "common/softmax.h"
#include "common/task-context.h"
#include "lang_id/custom-tokenizer.h"
#include "lang_id/lang-id-brain-interface.h"
#include "lang_id/language-identifier-features.h"
#include "lang_id/light-sentence-features.h"
#include "lang_id/light-sentence.h"
#include "lang_id/relevant-script-feature.h"
#include "util/base/logging.h"
#include "util/base/macros.h"

using ::libtextclassifier::nlp_core::file_utils::ParseProtoFromMemory;

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

namespace {

// Default value for the probability threshold; see comments for
// LangId::SetProbabilityThreshold().
static const float kDefaultProbabilityThreshold = 0.50;

// Default value for min text size below which our model can't provide a
// meaningful prediction.
static const int kDefaultMinTextSizeInBytes = 20;

// Initial value for the default language for LangId::FindLanguage().  The
// default language can be changed (for an individual LangId object) using
// LangId::SetDefaultLanguage().
static const char kInitialDefaultLanguage[] = "";

// Returns total number of bytes of the words from sentence, without the ^
// (start-of-word) and $ (end-of-word) markers.  Note: "real text" means that
// this ignores whitespace and punctuation characters from the original text.
int GetRealTextSize(const LightSentence &sentence) {
  int total = 0;
  for (int i = 0; i < sentence.num_words(); ++i) {
    TC_DCHECK(!sentence.word(i).empty());
    TC_DCHECK_EQ('^', sentence.word(i).front());
    TC_DCHECK_EQ('$', sentence.word(i).back());
    total += sentence.word(i).size() - 2;
  }
  return total;
}

}  // namespace

// Class that performs all work behind LangId.
class LangIdImpl {
 public:
  explicit LangIdImpl(const std::string &filename) {
    // Using mmap as a fast way to read the model bytes.
    ScopedMmap scoped_mmap(filename);
    MmapHandle mmap_handle = scoped_mmap.handle();
    if (!mmap_handle.ok()) {
      TC_LOG(ERROR) << "Unable to read model bytes.";
      return;
    }
    Initialize(mmap_handle.to_stringpiece());
  }

  explicit LangIdImpl(int fd) {
    // Using mmap as a fast way to read the model bytes.
    ScopedMmap scoped_mmap(fd);
    MmapHandle mmap_handle = scoped_mmap.handle();
    if (!mmap_handle.ok()) {
      TC_LOG(ERROR) << "Unable to read model bytes.";
      return;
    }
    Initialize(mmap_handle.to_stringpiece());
  }

  LangIdImpl(const char *ptr, size_t length) {
    Initialize(StringPiece(ptr, length));
  }

  void Initialize(StringPiece model_bytes) {
    // Will set valid_ to true only on successful initialization.
    valid_ = false;

    // Make sure all relevant features are registered:
    ContinuousBagOfNgramsFunction::RegisterClass();
    RelevantScriptFeature::RegisterClass();

    // NOTE(salcianu): code below relies on the fact that the current features
    // do not rely on data from a TaskInput.  Otherwise, one would have to use
    // the more complex model registration mechanism, which requires more code.
    InMemoryModelData model_data(model_bytes);
    TaskContext context;
    if (!model_data.GetTaskSpec(context.mutable_spec())) {
      TC_LOG(ERROR) << "Unable to get model TaskSpec";
      return;
    }

    if (!ParseNetworkParams(model_data, &context)) {
      return;
    }
    if (!ParseListOfKnownLanguages(model_data, &context)) {
      return;
    }

    network_.reset(new EmbeddingNetwork(network_params_.get()));
    if (!network_->is_valid()) {
      return;
    }

    probability_threshold_ =
        context.Get("reliability_thresh", kDefaultProbabilityThreshold);
    min_text_size_in_bytes_ =
        context.Get("min_text_size_in_bytes", kDefaultMinTextSizeInBytes);
    version_ = context.Get("version", 0);

    if (!lang_id_brain_interface_.Init(&context)) {
      return;
    }
    valid_ = true;
  }

  void SetProbabilityThreshold(float threshold) {
    probability_threshold_ = threshold;
  }

  void SetDefaultLanguage(const std::string &lang) { default_language_ = lang; }

  std::string FindLanguage(const std::string &text) const {
    std::vector<float> scores = ScoreLanguages(text);
    if (scores.empty()) {
      return default_language_;
    }

    // Softmax label with max score.
    int label = GetArgMax(scores);
    float probability = scores[label];
    if (probability < probability_threshold_) {
      return default_language_;
    }
    return GetLanguageForSoftmaxLabel(label);
  }

  std::vector<std::pair<std::string, float>> FindLanguages(
      const std::string &text) const {
    std::vector<float> scores = ScoreLanguages(text);

    std::vector<std::pair<std::string, float>> result;
    for (int i = 0; i < scores.size(); i++) {
      result.push_back({GetLanguageForSoftmaxLabel(i), scores[i]});
    }

    // To avoid crashing clients that always expect at least one predicted
    // language, we promised (see doc for this method) that the result always
    // contains at least one element.
    if (result.empty()) {
      // We use a tiny probability, such that any client that uses a
      // meaningful probability threshold ignores this prediction.  We don't
      // use 0.0f, to avoid crashing clients that normalize the probabilities
      // we return here.
      result.push_back({default_language_, 0.001f});
    }
    return result;
  }

  std::vector<float> ScoreLanguages(const std::string &text) const {
    if (!is_valid()) {
      return {};
    }

    // Create a Sentence storing the input text.
    LightSentence sentence;
    TokenizeTextForLangId(text, &sentence);

    if (GetRealTextSize(sentence) < min_text_size_in_bytes_) {
      return {};
    }

    // TODO(salcianu): reuse vector<FeatureVector>.
    std::vector<FeatureVector> features(
        lang_id_brain_interface_.NumEmbeddings());
    lang_id_brain_interface_.GetFeatures(&sentence, &features);

    // Predict language.
    EmbeddingNetwork::Vector scores;
    network_->ComputeFinalScores(features, &scores);
    return ComputeSoftmax(scores);
  }

  bool is_valid() const { return valid_; }

  int version() const { return version_; }

 private:
  // Returns name of the (in-memory) file for the indicated TaskInput from
  // context.
  static std::string GetInMemoryFileNameForTaskInput(
      const std::string &input_name, TaskContext *context) {
    TaskInput *task_input = context->GetInput(input_name);
    if (task_input->part_size() != 1) {
      TC_LOG(ERROR) << "TaskInput " << input_name << " has "
                    << task_input->part_size() << " parts";
      return "";
    }
    return task_input->part(0).file_pattern();
  }

  bool ParseNetworkParams(const InMemoryModelData &model_data,
                          TaskContext *context) {
    const std::string input_name = "language-identifier-network";
    const std::string input_file_name =
        GetInMemoryFileNameForTaskInput(input_name, context);
    if (input_file_name.empty()) {
      TC_LOG(ERROR) << "No input file name for TaskInput " << input_name;
      return false;
    }
    StringPiece bytes = model_data.GetBytesForInputFile(input_file_name);
    if (bytes.data() == nullptr) {
      TC_LOG(ERROR) << "Unable to get bytes for TaskInput " << input_name;
      return false;
    }
    std::unique_ptr<EmbeddingNetworkProto> proto(new EmbeddingNetworkProto());
    if (!ParseProtoFromMemory(bytes, proto.get())) {
      TC_LOG(ERROR) << "Unable to parse EmbeddingNetworkProto";
      return false;
    }
    network_params_.reset(
        new EmbeddingNetworkParamsFromProto(std::move(proto)));
    if (!network_params_->is_valid()) {
      TC_LOG(ERROR) << "EmbeddingNetworkParamsFromProto not valid";
      return false;
    }
    return true;
  }

  // Parses dictionary with known languages (i.e., field languages_) from a
  // TaskInput of context.  Note: that TaskInput should be a ListOfStrings
  // proto with a single element, the serialized form of a ListOfStrings.
  bool ParseListOfKnownLanguages(const InMemoryModelData &model_data,
                                 TaskContext *context) {
    const std::string input_name = "language-name-id-map";
    const std::string input_file_name =
        GetInMemoryFileNameForTaskInput(input_name, context);
    if (input_file_name.empty()) {
      TC_LOG(ERROR) << "No input file name for TaskInput " << input_name;
      return false;
    }
    StringPiece bytes = model_data.GetBytesForInputFile(input_file_name);
    if (bytes.data() == nullptr) {
      TC_LOG(ERROR) << "Unable to get bytes for TaskInput " << input_name;
      return false;
    }
    ListOfStrings records;
    if (!ParseProtoFromMemory(bytes, &records)) {
      TC_LOG(ERROR) << "Unable to parse ListOfStrings from TaskInput "
                    << input_name;
      return false;
    }
    if (records.element_size() != 1) {
      TC_LOG(ERROR) << "Wrong number of records in TaskInput " << input_name
                    << " : " << records.element_size();
      return false;
    }
    if (!ParseProtoFromMemory(std::string(records.element(0)), &languages_)) {
      TC_LOG(ERROR) << "Unable to parse dictionary with known languages";
      return false;
    }
    return true;
  }

  // Returns language code for a softmax label.  See comments for languages_
  // field.  If label is out of range, returns default_language_.
  std::string GetLanguageForSoftmaxLabel(int label) const {
    if ((label >= 0) && (label < languages_.element_size())) {
      return languages_.element(label);
    } else {
      TC_LOG(ERROR) << "Softmax label " << label << " outside range [0, "
                    << languages_.element_size() << ")";
      return default_language_;
    }
  }

  LangIdBrainInterface lang_id_brain_interface_;

  // Parameters for the neural network network_ (see below).
  std::unique_ptr<EmbeddingNetworkParamsFromProto> network_params_;

  // Neural network to use for scoring.
  std::unique_ptr<EmbeddingNetwork> network_;

  // True if this object is ready to perform language predictions.
  bool valid_;

  // Only predictions with a probability (confidence) above this threshold are
  // reported.  Otherwise, we report default_language_.
  float probability_threshold_ = kDefaultProbabilityThreshold;

  // Min size of the input text for our predictions to be meaningful.  Below
  // this threshold, the underlying model may report a wrong language and a
  // high confidence score.
  int min_text_size_in_bytes_ = kDefaultMinTextSizeInBytes;

  // Version of the model.
  int version_ = -1;

  // Known languages: softmax label i (an integer) means languages_.element(i)
  // (something like "en", "fr", "ru", etc).
  ListOfStrings languages_;

  // Language code to return in case of errors.
  std::string default_language_ = kInitialDefaultLanguage;

  TC_DISALLOW_COPY_AND_ASSIGN(LangIdImpl);
};

LangId::LangId(const std::string &filename)
    : pimpl_(new LangIdImpl(filename)) {
  if (!pimpl_->is_valid()) {
    TC_LOG(ERROR) << "Unable to construct a valid LangId based "
                  << "on the data from " << filename
                  << "; nothing should crash, but "
                  << "accuracy will be bad.";
  }
}

LangId::LangId(int fd) : pimpl_(new LangIdImpl(fd)) {
  if (!pimpl_->is_valid()) {
    TC_LOG(ERROR) << "Unable to construct a valid LangId based "
                  << "on the data from descriptor " << fd
                  << "; nothing should crash, "
                  << "but accuracy will be bad.";
  }
}

LangId::LangId(const char *ptr, size_t length)
    : pimpl_(new LangIdImpl(ptr, length)) {
  if (!pimpl_->is_valid()) {
    TC_LOG(ERROR) << "Unable to construct a valid LangId based "
                  << "on the memory region; nothing should crash, "
                  << "but accuracy will be bad.";
  }
}

LangId::~LangId() = default;

void LangId::SetProbabilityThreshold(float threshold) {
  pimpl_->SetProbabilityThreshold(threshold);
}

void LangId::SetDefaultLanguage(const std::string &lang) {
  pimpl_->SetDefaultLanguage(lang);
}

std::string LangId::FindLanguage(const std::string &text) const {
  return pimpl_->FindLanguage(text);
}

std::vector<std::pair<std::string, float>> LangId::FindLanguages(
    const std::string &text) const {
  return pimpl_->FindLanguages(text);
}

bool LangId::is_valid() const { return pimpl_->is_valid(); }

int LangId::version() const { return pimpl_->version(); }

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier
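
// Example usage (illustrative sketch only, kept as a comment so it does not
// affect this translation unit; the model path and sample texts below are
// hypothetical):
//
//   using libtextclassifier::nlp_core::lang_id::LangId;
//
//   LangId lang_id("/path/to/langid.model");
//   if (lang_id.is_valid()) {
//     // Best single guess; returns the default language ("" unless changed
//     // via SetDefaultLanguage()) when the top probability is below the
//     // probability threshold or the text is too short for a meaningful
//     // prediction.
//     const std::string best =
//         lang_id.FindLanguage("This is clearly an English sentence.");
//
//     // All known languages with their softmax probabilities; the result
//     // always contains at least one element.
//     for (const auto &lang_and_prob :
//          lang_id.FindLanguages("Ceci est une phrase en francais.")) {
//       // lang_and_prob.first is a code like "en" or "fr";
//       // lang_and_prob.second is a probability in [0, 1].
//     }
//   }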