// Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1996-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ /* * File coleitr.cpp * * Created by: Helena Shih * * Modification History: * * Date Name Description * * 6/23/97 helena Adding comments to make code more readable. * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java * 12/10/99 aliu Ported Thai collation support from Java. * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) * 02/19/01 swquek Removed CollationElementIterator() since it is * private constructor and no calls are made to it * 2012-2014 markus Rewritten in C++ again. */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/coleitr.h" #include "unicode/tblcoll.h" #include "unicode/ustring.h" #include "cmemory.h" #include "collation.h" #include "collationdata.h" #include "collationiterator.h" #include "collationsets.h" #include "collationtailoring.h" #include "uassert.h" #include "uhash.h" #include "utf16collationiterator.h" #include "uvectr32.h" /* Constants --------------------------------------------------------------- */ U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) /* CollationElementIterator public constructor/destructor ------------------ */ CollationElementIterator::CollationElementIterator( const CollationElementIterator& other) : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { *this = other; } CollationElementIterator::~CollationElementIterator() { delete iter_; delete offsets_; } /* CollationElementIterator public methods --------------------------------- */ namespace { uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); } uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); } UBool ceNeedsTwoParts(int64_t ce) { return (ce & INT64_C(0xffff00ff003f)) != 0; } } // namespace int32_t CollationElementIterator::getOffset() const { if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { // CollationIterator::previousCE() decrements the CEs length // while it pops CEs from its internal buffer. int32_t i = iter_->getCEsLength(); if (otherHalf_ != 0) { // Return the trailing CE offset while we are in the middle of a 64-bit CE. ++i; } U_ASSERT(i < offsets_->size()); return offsets_->elementAti(i); } return iter_->getOffset(); } /** * Get the ordering priority of the next character in the string. * @return the next character's ordering. Returns NULLORDER if an error has * occured or if the end of string has been reached */ int32_t CollationElementIterator::next(UErrorCode& status) { if (U_FAILURE(status)) { return NULLORDER; } if (dir_ > 1) { // Continue forward iteration. Test this first. if (otherHalf_ != 0) { uint32_t oh = otherHalf_; otherHalf_ = 0; return oh; } } else if (dir_ == 1) { // next() after setOffset() dir_ = 2; } else if (dir_ == 0) { // The iter_ is already reset to the start of the text. dir_ = 2; } else /* dir_ < 0 */ { // illegal change of direction status = U_INVALID_STATE_ERROR; return NULLORDER; } // No need to keep all CEs in the buffer when we iterate. iter_->clearCEsIfNoneRemaining(); int64_t ce = iter_->nextCE(status); if (ce == Collation::NO_CE) { return NULLORDER; } // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. uint32_t p = (uint32_t)(ce >> 32); uint32_t lower32 = (uint32_t)ce; uint32_t firstHalf = getFirstHalf(p, lower32); uint32_t secondHalf = getSecondHalf(p, lower32); if (secondHalf != 0) { otherHalf_ = secondHalf | 0xc0; // continuation CE } return firstHalf; } UBool CollationElementIterator::operator!=( const CollationElementIterator& other) const { return !(*this == other); } UBool CollationElementIterator::operator==( const CollationElementIterator& that) const { if (this == &that) { return TRUE; } return (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && otherHalf_ == that.otherHalf_ && normalizeDir() == that.normalizeDir() && string_ == that.string_ && *iter_ == *that.iter_; } /** * Get the ordering priority of the previous collation element in the string. * @param status the error code status. * @return the previous element's ordering. Returns NULLORDER if an error has * occured or if the start of string has been reached. */ int32_t CollationElementIterator::previous(UErrorCode& status) { if (U_FAILURE(status)) { return NULLORDER; } if (dir_ < 0) { // Continue backwards iteration. Test this first. if (otherHalf_ != 0) { uint32_t oh = otherHalf_; otherHalf_ = 0; return oh; } } else if (dir_ == 0) { iter_->resetToOffset(string_.length()); dir_ = -1; } else if (dir_ == 1) { // previous() after setOffset() dir_ = -1; } else /* dir_ > 1 */ { // illegal change of direction status = U_INVALID_STATE_ERROR; return NULLORDER; } if (offsets_ == NULL) { offsets_ = new UVector32(status); if (offsets_ == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return NULLORDER; } } // If we already have expansion CEs, then we also have offsets. // Otherwise remember the trailing offset in case we need to // write offsets for an artificial expansion. int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; int64_t ce = iter_->previousCE(*offsets_, status); if (ce == Collation::NO_CE) { return NULLORDER; } // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. uint32_t p = (uint32_t)(ce >> 32); uint32_t lower32 = (uint32_t)ce; uint32_t firstHalf = getFirstHalf(p, lower32); uint32_t secondHalf = getSecondHalf(p, lower32); if (secondHalf != 0) { if (offsets_->isEmpty()) { // When we convert a single 64-bit CE into two 32-bit CEs, // we need to make this artificial expansion behave like a normal expansion. // See CollationIterator::previousCE(). offsets_->addElement(iter_->getOffset(), status); offsets_->addElement(limitOffset, status); } otherHalf_ = firstHalf; return secondHalf | 0xc0; // continuation CE } return firstHalf; } /** * Resets the cursor to the beginning of the string. */ void CollationElementIterator::reset() { iter_ ->resetToOffset(0); otherHalf_ = 0; dir_ = 0; } void CollationElementIterator::setOffset(int32_t newOffset, UErrorCode& status) { if (U_FAILURE(status)) { return; } if (0 < newOffset && newOffset < string_.length()) { int32_t offset = newOffset; do { UChar c = string_.charAt(offset); if (!rbc_->isUnsafe(c) || (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { break; } // Back up to before this unsafe character. --offset; } while (offset > 0); if (offset < newOffset) { // We might have backed up more than necessary. // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, // but for text "chu" setOffset(2) should remain at 2 // although we initially back up to offset 0. // Find the last safe offset no greater than newOffset by iterating forward. int32_t lastSafeOffset = offset; do { iter_->resetToOffset(lastSafeOffset); do { iter_->nextCE(status); if (U_FAILURE(status)) { return; } } while ((offset = iter_->getOffset()) == lastSafeOffset); if (offset <= newOffset) { lastSafeOffset = offset; } } while (offset < newOffset); newOffset = lastSafeOffset; } } iter_->resetToOffset(newOffset); otherHalf_ = 0; dir_ = 1; } /** * Sets the source to the new source string. */ void CollationElementIterator::setText(const UnicodeString& source, UErrorCode& status) { if (U_FAILURE(status)) { return; } string_ = source; const UChar *s = string_.getBuffer(); CollationIterator *newIter; UBool numeric = rbc_->settings->isNumeric(); if (rbc_->settings->dontCheckFCD()) { newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); } else { newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); } if (newIter == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } delete iter_; iter_ = newIter; otherHalf_ = 0; dir_ = 0; } // Sets the source to the new character iterator. void CollationElementIterator::setText(CharacterIterator& source, UErrorCode& status) { if (U_FAILURE(status)) return; source.getText(string_); setText(string_, status); } int32_t CollationElementIterator::strengthOrder(int32_t order) const { UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); // Mask off the unwanted differences. if (s == UCOL_PRIMARY) { order &= 0xffff0000; } else if (s == UCOL_SECONDARY) { order &= 0xffffff00; } return order; } /* CollationElementIterator private constructors/destructors --------------- */ /** * This is the "real" constructor for this class; it constructs an iterator * over the source text using the specified collator */ CollationElementIterator::CollationElementIterator( const UnicodeString &source, const RuleBasedCollator *coll, UErrorCode &status) : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { setText(source, status); } /** * This is the "real" constructor for this class; it constructs an iterator over * the source text using the specified collator */ CollationElementIterator::CollationElementIterator( const CharacterIterator &source, const RuleBasedCollator *coll, UErrorCode &status) : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { // We only call source.getText() which should be const anyway. setText(const_cast<CharacterIterator &>(source), status); } /* CollationElementIterator private methods -------------------------------- */ const CollationElementIterator& CollationElementIterator::operator=( const CollationElementIterator& other) { if (this == &other) { return *this; } CollationIterator *newIter; const FCDUTF16CollationIterator *otherFCDIter = dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); if(otherFCDIter != NULL) { newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); } else { const UTF16CollationIterator *otherIter = dynamic_cast<const UTF16CollationIterator *>(other.iter_); if(otherIter != NULL) { newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); } else { newIter = NULL; } } if(newIter != NULL) { delete iter_; iter_ = newIter; rbc_ = other.rbc_; otherHalf_ = other.otherHalf_; dir_ = other.dir_; string_ = other.string_; } if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { UErrorCode errorCode = U_ZERO_ERROR; if(offsets_ == NULL) { offsets_ = new UVector32(other.offsets_->size(), errorCode); } if(offsets_ != NULL) { offsets_->assign(*other.offsets_, errorCode); } } return *this; } namespace { class MaxExpSink : public ContractionsAndExpansions::CESink { public: MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} virtual ~MaxExpSink(); virtual void handleCE(int64_t /*ce*/) {} virtual void handleExpansion(const int64_t ces[], int32_t length) { if (length <= 1) { // We do not need to add single CEs into the map. return; } int32_t count = 0; // number of CE "halves" for (int32_t i = 0; i < length; ++i) { count += ceNeedsTwoParts(ces[i]) ? 2 : 1; } // last "half" of the last CE int64_t ce = ces[length - 1]; uint32_t p = (uint32_t)(ce >> 32); uint32_t lower32 = (uint32_t)ce; uint32_t lastHalf = getSecondHalf(p, lower32); if (lastHalf == 0) { lastHalf = getFirstHalf(p, lower32); U_ASSERT(lastHalf != 0); } else { lastHalf |= 0xc0; // old-style continuation CE } if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); } } private: UHashtable *maxExpansions; UErrorCode &errorCode; }; MaxExpSink::~MaxExpSink() {} } // namespace UHashtable * CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return NULL; } UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, uhash_compareLong, &errorCode); if (U_FAILURE(errorCode)) { return NULL; } MaxExpSink sink(maxExpansions, errorCode); ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); if (U_FAILURE(errorCode)) { uhash_close(maxExpansions); return NULL; } return maxExpansions; } int32_t CollationElementIterator::getMaxExpansion(int32_t order) const { return getMaxExpansion(rbc_->tailoring->maxExpansions, order); } int32_t CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { if (order == 0) { return 1; } int32_t max; if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { return max; } if ((order & 0xc0) == 0xc0) { // old-style continuation CE return 2; } else { return 1; } } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_COLLATION */