/* ******************************************************************************* * Copyright (C) 2010-2012, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucharstriebuilder.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2010nov14 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #include "unicode/ucharstrie.h" #include "unicode/ucharstriebuilder.h" #include "unicode/unistr.h" #include "unicode/ustring.h" #include "cmemory.h" #include "uarrsort.h" #include "uassert.h" #include "uhash.h" #include "ustr_imp.h" U_NAMESPACE_BEGIN /* * Note: This builder implementation stores (string, value) pairs with full copies * of the 16-bit-unit sequences, until the UCharsTrie is built. * It might(!) take less memory if we collected the data in a temporary, dynamic trie. */ class UCharsTrieElement : public UMemory { public: // Use compiler's default constructor, initializes nothing. void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); UnicodeString getString(const UnicodeString &strings) const { int32_t length=strings[stringOffset]; return strings.tempSubString(stringOffset+1, length); } int32_t getStringLength(const UnicodeString &strings) const { return strings[stringOffset]; } UChar charAt(int32_t index, const UnicodeString &strings) const { return strings[stringOffset+1+index]; } int32_t getValue() const { return value; } int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; private: // The first strings unit contains the string length. // (Compared with a stringLength field here, this saves 2 bytes per string.) int32_t stringOffset; int32_t value; }; void UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } int32_t length=s.length(); if(length>0xffff) { // Too long: We store the length in 1 unit. errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return; } stringOffset=strings.length(); strings.append((UChar)length); value=val; strings.append(s); } int32_t UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { return getString(strings).compare(other.getString(strings)); } UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) : elements(NULL), elementsCapacity(0), elementsLength(0), uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} UCharsTrieBuilder::~UCharsTrieBuilder() { delete[] elements; uprv_free(uchars); } UCharsTrieBuilder & UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return *this; } if(ucharsLength>0) { // Cannot add elements after building. errorCode=U_NO_WRITE_PERMISSION; return *this; } if(elementsLength==elementsCapacity) { int32_t newCapacity; if(elementsCapacity==0) { newCapacity=1024; } else { newCapacity=4*elementsCapacity; } UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; if(newElements==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; return *this; } if(elementsLength>0) { uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement)); } delete[] elements; elements=newElements; elementsCapacity=newCapacity; } elements[elementsLength++].setTo(s, value, strings, errorCode); if(U_SUCCESS(errorCode) && strings.isBogus()) { errorCode=U_MEMORY_ALLOCATION_ERROR; } return *this; } U_CDECL_BEGIN static int32_t U_CALLCONV compareElementStrings(const void *context, const void *left, const void *right) { const UnicodeString *strings=static_cast<const UnicodeString *>(context); const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left); const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right); return leftElement->compareStringTo(*rightElement, *strings); } U_CDECL_END UCharsTrie * UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { buildUChars(buildOption, errorCode); UCharsTrie *newTrie=NULL; if(U_SUCCESS(errorCode)) { newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); if(newTrie==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; } else { uchars=NULL; // The new trie now owns the array. ucharsCapacity=0; } } return newTrie; } UnicodeString & UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode) { buildUChars(buildOption, errorCode); if(U_SUCCESS(errorCode)) { result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); } return result; } void UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } if(uchars!=NULL && ucharsLength>0) { // Already built. return; } if(ucharsLength==0) { if(elementsLength==0) { errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return; } if(strings.isBogus()) { errorCode=U_MEMORY_ALLOCATION_ERROR; return; } uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), compareElementStrings, &strings, FALSE, // need not be a stable sort &errorCode); if(U_FAILURE(errorCode)) { return; } // Duplicate strings are not allowed. UnicodeString prev=elements[0].getString(strings); for(int32_t i=1; i<elementsLength; ++i) { UnicodeString current=elements[i].getString(strings); if(prev==current) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; return; } prev.fastCopyFrom(current); } } // Create and UChar-serialize the trie for the elements. ucharsLength=0; int32_t capacity=strings.length(); if(capacity<1024) { capacity=1024; } if(ucharsCapacity<capacity) { uprv_free(uchars); uchars=static_cast<UChar *>(uprv_malloc(capacity*2)); if(uchars==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; ucharsCapacity=0; return; } ucharsCapacity=capacity; } StringTrieBuilder::build(buildOption, elementsLength, errorCode); if(uchars==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; } } int32_t UCharsTrieBuilder::getElementStringLength(int32_t i) const { return elements[i].getStringLength(strings); } UChar UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { return elements[i].charAt(unitIndex, strings); } int32_t UCharsTrieBuilder::getElementValue(int32_t i) const { return elements[i].getValue(); } int32_t UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { const UCharsTrieElement &firstElement=elements[first]; const UCharsTrieElement &lastElement=elements[last]; int32_t minStringLength=firstElement.getStringLength(strings); while(++unitIndex<minStringLength && firstElement.charAt(unitIndex, strings)== lastElement.charAt(unitIndex, strings)) {} return unitIndex; } int32_t UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { int32_t length=0; // Number of different units at unitIndex. int32_t i=start; do { UChar unit=elements[i++].charAt(unitIndex, strings); while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { ++i; } ++length; } while(i<limit); return length; } int32_t UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { do { UChar unit=elements[i++].charAt(unitIndex, strings); while(unit==elements[i].charAt(unitIndex, strings)) { ++i; } } while(--count>0); return i; } int32_t UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { while(unit==elements[i].charAt(unitIndex, strings)) { ++i; } return i; } UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) : LinearMatchNode(len, nextNode), s(units) { hash=hash*37+ustr_hashUCharsN(units, len); } UBool UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { if(this==&other) { return TRUE; } if(!LinearMatchNode::operator==(other)) { return FALSE; } const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; return 0==u_memcmp(s, o.s, length); } void UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; next->write(builder); b.write(s, length); offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); } StringTrieBuilder::Node * UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, Node *nextNode) const { return new UCTLinearMatchNode( elements[i].getString(strings).getBuffer()+unitIndex, length, nextNode); } UBool UCharsTrieBuilder::ensureCapacity(int32_t length) { if(uchars==NULL) { return FALSE; // previous memory allocation had failed } if(length>ucharsCapacity) { int32_t newCapacity=ucharsCapacity; do { newCapacity*=2; } while(newCapacity<=length); UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2)); if(newUChars==NULL) { // unable to allocate memory uprv_free(uchars); uchars=NULL; ucharsCapacity=0; return FALSE; } u_memcpy(newUChars+(newCapacity-ucharsLength), uchars+(ucharsCapacity-ucharsLength), ucharsLength); uprv_free(uchars); uchars=newUChars; ucharsCapacity=newCapacity; } return TRUE; } int32_t UCharsTrieBuilder::write(int32_t unit) { int32_t newLength=ucharsLength+1; if(ensureCapacity(newLength)) { ucharsLength=newLength; uchars[ucharsCapacity-ucharsLength]=(UChar)unit; } return ucharsLength; } int32_t UCharsTrieBuilder::write(const UChar *s, int32_t length) { int32_t newLength=ucharsLength+length; if(ensureCapacity(newLength)) { ucharsLength=newLength; u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); } return ucharsLength; } int32_t UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { return write(elements[i].getString(strings).getBuffer()+unitIndex, length); } int32_t UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { return write(i|(isFinal<<15)); } UChar intUnits[3]; int32_t length; if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); intUnits[1]=(UChar)((uint32_t)i>>16); intUnits[2]=(UChar)i; length=3; // } else if(i<=UCharsTrie::kMaxOneUnitValue) { // intUnits[0]=(UChar)(i); // length=1; } else { intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); intUnits[1]=(UChar)i; length=2; } intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); return write(intUnits, length); } int32_t UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { if(!hasValue) { return write(node); } UChar intUnits[3]; int32_t length; if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); intUnits[1]=(UChar)((uint32_t)value>>16); intUnits[2]=(UChar)value; length=3; } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { intUnits[0]=(UChar)((value+1)<<6); length=1; } else { intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); intUnits[1]=(UChar)value; length=2; } intUnits[0]|=(UChar)node; return write(intUnits, length); } int32_t UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { int32_t i=ucharsLength-jumpTarget; U_ASSERT(i>=0); if(i<=UCharsTrie::kMaxOneUnitDelta) { return write(i); } UChar intUnits[3]; int32_t length; if(i<=UCharsTrie::kMaxTwoUnitDelta) { intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); length=1; } else { intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); intUnits[1]=(UChar)(i>>16); length=2; } intUnits[length++]=(UChar)i; return write(intUnits, length); } U_NAMESPACE_END