/* * Copyright (C) 2017 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Feature processing for FFModel (feed-forward SmartSelection model). #ifndef LIBTEXTCLASSIFIER_SMARTSELECT_FEATURE_PROCESSOR_H_ #define LIBTEXTCLASSIFIER_SMARTSELECT_FEATURE_PROCESSOR_H_ #include <memory> #include <string> #include <vector> #include "smartselect/cached-features.h" #include "smartselect/text-classification-model.pb.h" #include "smartselect/token-feature-extractor.h" #include "smartselect/tokenizer.h" #include "smartselect/types.h" #include "util/base/logging.h" #include "util/utf8/unicodetext.h" namespace libtextclassifier { constexpr int kInvalidLabel = -1; // Maps a vector of sparse features and a vector of dense features to a vector // of features that combines both. // The output is written to the memory location pointed to by the last float* // argument. // Returns true on success false on failure. using FeatureVectorFn = std::function<bool(const std::vector<int>&, const std::vector<float>&, float*)>; namespace internal { // Parses the serialized protocol buffer. FeatureProcessorOptions ParseSerializedOptions( const std::string& serialized_options); TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions( const FeatureProcessorOptions& options); // Removes tokens that are not part of a line of the context which contains // given span. void StripTokensFromOtherLines(const std::string& context, CodepointSpan span, std::vector<Token>* tokens); // Splits tokens that contain the selection boundary inside them. // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com" void SplitTokensOnSelectionBoundaries(CodepointSpan selection, std::vector<Token>* tokens); // Returns the index of token that corresponds to the codepoint span. int CenterTokenFromClick(CodepointSpan span, const std::vector<Token>& tokens); // Returns the index of token that corresponds to the middle of the codepoint // span. int CenterTokenFromMiddleOfSelection( CodepointSpan span, const std::vector<Token>& selectable_tokens); // Strips the tokens from the tokens vector that are not used for feature // extraction because they are out of scope, or pads them so that there is // enough tokens in the required context_size for all inferences with a click // in relative_click_span. void StripOrPadTokens(TokenSpan relative_click_span, int context_size, std::vector<Token>* tokens, int* click_pos); } // namespace internal // Converts a codepoint span to a token span in the given list of tokens. TokenSpan CodepointSpanToTokenSpan(const std::vector<Token>& selectable_tokens, CodepointSpan codepoint_span); // Converts a token span to a codepoint span in the given list of tokens. CodepointSpan TokenSpanToCodepointSpan( const std::vector<Token>& selectable_tokens, TokenSpan token_span); // Takes care of preparing features for the span prediction model. class FeatureProcessor { public: explicit FeatureProcessor(const FeatureProcessorOptions& options) : feature_extractor_( internal::BuildTokenFeatureExtractorOptions(options)), options_(options), tokenizer_({options.tokenization_codepoint_config().begin(), options.tokenization_codepoint_config().end()}) { MakeLabelMaps(); PrepareCodepointRanges({options.supported_codepoint_ranges().begin(), options.supported_codepoint_ranges().end()}, &supported_codepoint_ranges_); PrepareCodepointRanges( {options.internal_tokenizer_codepoint_ranges().begin(), options.internal_tokenizer_codepoint_ranges().end()}, &internal_tokenizer_codepoint_ranges_); } explicit FeatureProcessor(const std::string& serialized_options) : FeatureProcessor(internal::ParseSerializedOptions(serialized_options)) { } // Tokenizes the input string using the selected tokenization method. std::vector<Token> Tokenize(const std::string& utf8_text) const; // Converts a label into a token span. bool LabelToTokenSpan(int label, TokenSpan* token_span) const; // Gets the total number of selection labels. int GetSelectionLabelCount() const { return label_to_selection_.size(); } // Gets the string value for given collection label. std::string LabelToCollection(int label) const; // Gets the total number of collections of the model. int NumCollections() const { return collection_to_label_.size(); } // Gets the name of the default collection. std::string GetDefaultCollection() const; const FeatureProcessorOptions& GetOptions() const { return options_; } // Tokenizes the context and input span, and finds the click position. void TokenizeAndFindClick(const std::string& context, CodepointSpan input_span, std::vector<Token>* tokens, int* click_pos) const; // Extracts features as a CachedFeatures object that can be used for repeated // inference over token spans in the given context. bool ExtractFeatures(const std::string& context, CodepointSpan input_span, TokenSpan relative_click_span, const FeatureVectorFn& feature_vector_fn, int feature_vector_size, std::vector<Token>* tokens, int* click_pos, std::unique_ptr<CachedFeatures>* cached_features) const; // Fills selection_label_spans with CodepointSpans that correspond to the // selection labels. The CodepointSpans are based on the codepoint ranges of // given tokens. bool SelectionLabelSpans( VectorSpan<Token> tokens, std::vector<CodepointSpan>* selection_label_spans) const; int DenseFeaturesCount() const { return feature_extractor_.DenseFeaturesCount(); } protected: // Represents a codepoint range [start, end). struct CodepointRange { int32 start; int32 end; CodepointRange(int32 arg_start, int32 arg_end) : start(arg_start), end(arg_end) {} }; // Returns the class id corresponding to the given string collection // identifier. There is a catch-all class id that the function returns for // unknown collections. int CollectionToLabel(const std::string& collection) const; // Prepares mapping from collection names to labels. void MakeLabelMaps(); // Gets the number of spannable tokens for the model. // // Spannable tokens are those tokens of context, which the model predicts // selection spans over (i.e., there is 1:1 correspondence between the output // classes of the model and each of the spannable tokens). int GetNumContextTokens() const { return options_.context_size() * 2 + 1; } // Converts a label into a span of codepoint indices corresponding to it // given output_tokens. bool LabelToSpan(int label, const VectorSpan<Token>& output_tokens, CodepointSpan* span) const; // Converts a span to the corresponding label given output_tokens. bool SpanToLabel(const std::pair<CodepointIndex, CodepointIndex>& span, const std::vector<Token>& output_tokens, int* label) const; // Converts a token span to the corresponding label. int TokenSpanToLabel(const std::pair<TokenIndex, TokenIndex>& span) const; void PrepareCodepointRanges( const std::vector<FeatureProcessorOptions::CodepointRange>& codepoint_ranges, std::vector<CodepointRange>* prepared_codepoint_ranges); // Returns the ratio of supported codepoints to total number of codepoints in // the input context around given click position. float SupportedCodepointsRatio(int click_pos, const std::vector<Token>& tokens) const; // Returns true if given codepoint is covered by the given sorted vector of // codepoint ranges. bool IsCodepointInRanges( int codepoint, const std::vector<CodepointRange>& codepoint_ranges) const; // Finds the center token index in tokens vector, using the method defined // in options_. int FindCenterToken(CodepointSpan span, const std::vector<Token>& tokens) const; // Tokenizes the input text using ICU tokenizer. bool ICUTokenize(const std::string& context, std::vector<Token>* result) const; // Takes the result of ICU tokenization and retokenizes stretches of tokens // made of a specific subset of characters using the internal tokenizer. void InternalRetokenize(const std::string& context, std::vector<Token>* tokens) const; // Tokenizes a substring of the unicode string, appending the resulting tokens // to the output vector. The resulting tokens have bounds relative to the full // string. Does nothing if the start of the span is negative. void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span, std::vector<Token>* result) const; const TokenFeatureExtractor feature_extractor_; // Codepoint ranges that define what codepoints are supported by the model. // NOTE: Must be sorted. std::vector<CodepointRange> supported_codepoint_ranges_; // Codepoint ranges that define which tokens (consisting of which codepoints) // should be re-tokenized with the internal tokenizer in the mixed // tokenization mode. // NOTE: Must be sorted. std::vector<CodepointRange> internal_tokenizer_codepoint_ranges_; private: const FeatureProcessorOptions options_; // Mapping between token selection spans and labels ids. std::map<TokenSpan, int> selection_to_label_; std::vector<TokenSpan> label_to_selection_; // Mapping between collections and labels. std::map<std::string, int> collection_to_label_; Tokenizer tokenizer_; }; } // namespace libtextclassifier #endif // LIBTEXTCLASSIFIER_SMARTSELECT_FEATURE_PROCESSOR_H_