/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lang_id/relevant-script-feature.h"
#include <string>
#include "common/feature-extractor.h"
#include "common/feature-types.h"
#include "common/task-context.h"
#include "common/workspace.h"
#include "lang_id/script-detector.h"
#include "util/base/logging.h"
#include "util/strings/utf8.h"
namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {
// Setup hook for this feature.  Nothing needs to be prepared here, so the
// |context| argument is not consulted and success is always reported.
bool RelevantScriptFeature::Setup(TaskContext *context) {
  return true;
}
// Registers the feature type for this extractor: a numeric feature type named
// after this feature and sized by kNumRelevantScripts.  |context| is not
// consulted.  Always returns true.
bool RelevantScriptFeature::Init(TaskContext *context) {
  // NOTE(review): set_feature_type() presumably takes ownership of the
  // heap-allocated type object -- confirm against its declaration.
  NumericFeatureType *const script_feature_type =
      new NumericFeatureType(name(), kNumRelevantScripts);
  set_feature_type(script_feature_type);
  return true;
}
void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
const LightSentence &sentence,
FeatureVector *result) const {
// We expect kNumRelevantScripts to be small, so we stack-allocate the array
// of counts. Still, if that changes, we want to find out.
static_assert(
kNumRelevantScripts < 25,
"switch counts to vector<int>: too big for stack-allocated int[]");
// counts[s] is the number of characters with script s.
// Note: {} "value-initializes" the array to zero.
int counts[kNumRelevantScripts]{};
int total_count = 0;
for (int i = 0; i < sentence.num_words(); ++i) {
const std::string &word = sentence.word(i);
const char *const word_end = word.data() + word.size();
const char *curr = word.data();
// Skip over token start '^'.
TC_DCHECK_EQ(*curr, '^');
curr += GetNumBytesForNonZeroUTF8Char(curr);
while (true) {
const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
Script script = GetScript(curr, num_bytes);
// We do this update and the if (...) break below *before* incrementing
// counts[script] in order to skip the token end '$'.
curr += num_bytes;
if (curr >= word_end) {
TC_DCHECK_EQ(*(curr - num_bytes), '$');
break;
}
TC_DCHECK_GE(script, 0);
TC_DCHECK_LT(script, kNumRelevantScripts);
counts[script]++;
total_count++;
}
}
for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
int count = counts[script_id];
if (count > 0) {
const float weight = static_cast<float>(count) / total_count;
FloatFeatureValue value(script_id, weight);
result->add(feature_type(), value.discrete_value);
}
}
}
} // namespace lang_id
} // namespace nlp_core
} // namespace libtextclassifier