
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "token-feature-extractor.h"

#include <cctype>
#include <string>

#include "util/base/logging.h"
#include "util/hash/farmhash.h"
#include "util/strings/stringpiece.h"
#include "util/utf8/unicodetext.h"

namespace libtextclassifier2 {

namespace {

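// Remaps an ASCII token according to the extractor options: digits collapse
// to '0' when remap_digits is set, and characters are lowercased when
// lowercase_tokens is set. For example, assuming both options are enabled,
// "Call911" is remapped to "call000".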
std::string RemapTokenAscii(const std::string& token,
                            const TokenFeatureExtractorOptions& options) {
  if (!options.remap_digits && !options.lowercase_tokens) {
    return token;
  }

  std::string copy = token;
  for (size_t i = 0; i < token.size(); ++i) {
    // Cast to unsigned char before calling the <cctype> functions; passing a
    // negative char value (a non-ASCII byte) is undefined behavior.
    if (options.remap_digits &&
        isdigit(static_cast<unsigned char>(copy[i]))) {
      copy[i] = '0';
    }
    if (options.lowercase_tokens) {
      copy[i] = tolower(static_cast<unsigned char>(copy[i]));
    }
  }
  return copy;
}

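// Unicode-aware counterpart of RemapTokenAscii. Writes the remapped
// codepoints of |token| into |remapped|. Note that when neither remapping
// option is set the output is left untouched, so callers are expected to
// pre-populate |remapped| with the original token (as
// ExtractCharactergramFeaturesUnicode does below).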
void RemapTokenUnicode(const std::string& token,
                       const TokenFeatureExtractorOptions& options,
                       const UniLib& unilib, UnicodeText* remapped) {
  if (!options.remap_digits && !options.lowercase_tokens) {
    // Leave remapped untouched.
    return;
  }

  UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false);
  remapped->clear();
  for (auto it = word.begin(); it != word.end(); ++it) {
    if (options.remap_digits && unilib.IsDigit(*it)) {
      remapped->AppendCodepoint('0');
    } else if (options.lowercase_tokens) {
      remapped->AppendCodepoint(unilib.ToLower(*it));
    } else {
      remapped->AppendCodepoint(*it);
    }
  }
}

}  // namespace

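// Precompiles the configured regexp feature patterns. A pattern that fails to
// compile may yield a null entry; ExtractDenseFeatures emits -1.0 for such
// entries instead of crashing.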
TokenFeatureExtractor::TokenFeatureExtractor(
    const TokenFeatureExtractorOptions& options, const UniLib& unilib)
    : options_(options), unilib_(unilib) {
  for (const std::string& pattern : options.regexp_features) {
    regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>(
        unilib_.CreateRegexPattern(UTF8ToUnicodeText(
            pattern.c_str(), pattern.size(), /*do_copy=*/false))));
  }
}

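// Extracts the sparse charactergram features and the dense features for a
// single token. A minimal usage sketch (the Token constructor arguments are
// an assumption based on the companion Token type, not part of this file):
//
//   std::vector<int> sparse;
//   std::vector<float> dense;
//   extractor.Extract(Token("hello", /*start=*/0, /*end=*/5),
//                     /*is_in_span=*/true, &sparse, &dense);
//
// Passing a null |dense_features| fails; |sparse_features| may be null when
// only the dense features are needed.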
bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
                                    std::vector<int>* sparse_features,
                                    std::vector<float>* dense_features) const {
  if (!dense_features) {
    return false;
  }
  if (sparse_features) {
    *sparse_features = ExtractCharactergramFeatures(token);
  }
  *dense_features = ExtractDenseFeatures(token, is_in_span);
  return true;
}

std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
    const Token& token) const {
  if (options_.unicode_aware_features) {
    return ExtractCharactergramFeaturesUnicode(token);
  } else {
    return ExtractCharactergramFeaturesAscii(token);
  }
}

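// Produces the dense feature vector. Depending on the options, the layout is:
//   [case feature][selection-mask feature][one feature per regexp pattern]
// The case feature is 1.0 when the first character is uppercase and -1.0
// otherwise; the selection mask is 1.0 inside the span and -1.0 (unicode) or
// 0.0 (ascii) outside; each regexp feature is 1.0 on a match and -1.0 on a
// non-match or an invalid pattern.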
std::vector<float> TokenFeatureExtractor::ExtractDenseFeatures(
    const Token& token, bool is_in_span) const {
  std::vector<float> dense_features;

  if (options_.extract_case_feature) {
    if (options_.unicode_aware_features) {
      UnicodeText token_unicode =
          UTF8ToUnicodeText(token.value, /*do_copy=*/false);
      // Check for emptiness first: dereferencing begin() of an empty
      // UnicodeText is invalid.
      const bool is_upper =
          !token.value.empty() && unilib_.IsUpper(*token_unicode.begin());
      if (is_upper) {
        dense_features.push_back(1.0);
      } else {
        dense_features.push_back(-1.0);
      }
    } else {
      if (!token.value.empty() &&
          isupper(static_cast<unsigned char>(*token.value.begin()))) {
        dense_features.push_back(1.0);
      } else {
        dense_features.push_back(-1.0);
      }
    }
  }

  if (options_.extract_selection_mask_feature) {
    if (is_in_span) {
      dense_features.push_back(1.0);
    } else {
      if (options_.unicode_aware_features) {
        dense_features.push_back(-1.0);
      } else {
        dense_features.push_back(0.0);
      }
    }
  }

  // Add regexp features.
  if (!regex_patterns_.empty()) {
    UnicodeText token_unicode =
        UTF8ToUnicodeText(token.value, /*do_copy=*/false);
    for (size_t i = 0; i < regex_patterns_.size(); ++i) {
      if (!regex_patterns_[i]) {
        dense_features.push_back(-1.0);
        continue;
      }
      auto matcher = regex_patterns_[i]->Matcher(token_unicode);
      int status;
      if (matcher->Matches(&status)) {
        dense_features.push_back(1.0);
      } else {
        dense_features.push_back(-1.0);
      }
    }
  }

  return dense_features;
}

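// Hashes a chargram string into an embedding bucket in [0, num_buckets).
// When an allowed-chargram list is configured, two buckets are reserved:
// bucket 0 for out-of-vocabulary chargrams and bucket 1 for "<PAD>"; all
// remaining chargrams hash into [2, num_buckets).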
int TokenFeatureExtractor::HashToken(StringPiece token) const {
  if (options_.allowed_chargrams.empty()) {
    return tc2farmhash::Fingerprint64(token) % options_.num_buckets;
  } else {
    // Padding and out-of-vocabulary chargrams have dedicated buckets reserved
    // because they are special and important, and we don't want them to share
    // an embedding with other charactergrams.
    // TODO(zilka): Experimentally verify.
    const int kNumExtraBuckets = 2;
    const std::string token_string = token.ToString();
    if (token_string == "<PAD>") {
      return 1;
    } else if (options_.allowed_chargrams.find(token_string) ==
               options_.allowed_chargrams.end()) {
      return 0;  // Out-of-vocabulary.
    } else {
      return (tc2farmhash::Fingerprint64(token) %
              (options_.num_buckets - kNumExtraBuckets)) +
             kNumExtraBuckets;
    }
  }
}

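// ASCII charactergram extraction. The token is framed with '^' and '$'
// markers, and hashed n-grams are emitted for every configured order.
// Worked example, assuming chargram_orders = {1, 2} and a token "cat" within
// max_word_length: the feature word is "^cat$"; order 1 emits the unigrams
// "c", "a", "t" (the frame markers are skipped), and order 2 emits the
// bigrams "^c", "ca", "at", "t$".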
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
    const Token& token) const {
  std::vector<int> result;
  if (token.is_padding || token.value.empty()) {
    result.push_back(HashToken("<PAD>"));
  } else {
    const std::string word = RemapTokenAscii(token.value, options_);

    // Trim words that are longer than max_word_length characters.
    const int max_word_length = options_.max_word_length;
    std::string feature_word;
    if (static_cast<int>(word.size()) > max_word_length) {
      feature_word =
          "^" + word.substr(0, max_word_length / 2) + "\1" +
          word.substr(word.size() - max_word_length / 2, max_word_length / 2) +
          "$";
    } else {
      // Add a prefix and suffix to the word.
      feature_word = "^" + word + "$";
    }

    // Upper-bound the number of charactergrams extracted to avoid resizing.
    result.reserve(options_.chargram_orders.size() * feature_word.size());

    if (options_.chargram_orders.empty()) {
      result.push_back(HashToken(feature_word));
    } else {
      // Generate the character-grams.
      for (int chargram_order : options_.chargram_orders) {
        if (chargram_order == 1) {
          for (int i = 1; i < static_cast<int>(feature_word.size()) - 1; ++i) {
            result.push_back(
                HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1)));
          }
        } else {
          for (int i = 0;
               i < static_cast<int>(feature_word.size()) - chargram_order + 1;
               ++i) {
            result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i,
                                                   /*len=*/chargram_order)));
          }
        }
      }
    }
  }
  return result;
}

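// Unicode-aware charactergram extraction. Overlong tokens are trimmed by
// keeping max_word_length / 2 codepoints from each end and replacing the
// middle with a '\1' marker. Worked example, assuming max_word_length = 10
// and an 11-codepoint token: the feature word becomes "^" + the first five
// codepoints + "\1" + the last five codepoints + "$".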
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(
    const Token& token) const {
  std::vector<int> result;
  if (token.is_padding || token.value.empty()) {
    result.push_back(HashToken("<PAD>"));
  } else {
    UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false);
    RemapTokenUnicode(token.value, options_, unilib_, &word);

    // Trim the word if needed by finding a left-cut point and right-cut point.
    auto left_cut = word.begin();
    auto right_cut = word.end();
    for (int i = 0; i < options_.max_word_length / 2; i++) {
      if (left_cut < right_cut) {
        ++left_cut;
      }
      if (left_cut < right_cut) {
        --right_cut;
      }
    }

    std::string feature_word;
    if (left_cut == right_cut) {
      feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";
    } else {
      // clang-format off
      feature_word = "^" +
                     word.UTF8Substring(word.begin(), left_cut) +
                     "\1" +
                     word.UTF8Substring(right_cut, word.end()) +
                     "$";
      // clang-format on
    }

    const UnicodeText feature_word_unicode =
        UTF8ToUnicodeText(feature_word, /*do_copy=*/false);

    // Upper-bound the number of charactergrams extracted to avoid resizing.
    result.reserve(options_.chargram_orders.size() * feature_word.size());

    if (options_.chargram_orders.empty()) {
      result.push_back(HashToken(feature_word));
    } else {
      // Generate the character-grams.
      for (int chargram_order : options_.chargram_orders) {
        UnicodeText::const_iterator it_start = feature_word_unicode.begin();
        UnicodeText::const_iterator it_end = feature_word_unicode.end();
        if (chargram_order == 1) {
          ++it_start;
          --it_end;
        }

        UnicodeText::const_iterator it_chargram_start = it_start;
        UnicodeText::const_iterator it_chargram_end = it_start;
        bool chargram_is_complete = true;
        for (int i = 0; i < chargram_order; ++i) {
          if (it_chargram_end == it_end) {
            chargram_is_complete = false;
            break;
          }
          ++it_chargram_end;
        }
        if (!chargram_is_complete) {
          continue;
        }

        for (; it_chargram_end <= it_end;
             ++it_chargram_start, ++it_chargram_end) {
          const int length_bytes =
              it_chargram_end.utf8_data() - it_chargram_start.utf8_data();
          result.push_back(HashToken(
              StringPiece(it_chargram_start.utf8_data(), length_bytes)));
        }
      }
    }
  }
  return result;
}

}  // namespace libtextclassifier2