/* ****************************************************************************** * * Copyright (C) 2001-2011, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: utrie2.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2008aug16 (starting from a copy of utrie.c) * created by: Markus W. Scherer * * This is a common implementation of a Unicode trie. * It is a kind of compressed, serializable table of 16- or 32-bit values associated with * Unicode code points (0..0x10ffff). * This is the second common version of a Unicode trie (hence the name UTrie2). * See utrie2.h for a comparison. * * This file contains only the runtime and enumeration code, for read-only access. * See utrie2_builder.c for the builder code. */ #ifdef UTRIE2_DEBUG # include <stdio.h> #endif #include "unicode/utypes.h" #include "unicode/utf.h" #include "unicode/utf8.h" #include "unicode/utf16.h" #include "cmemory.h" #include "utrie2.h" #include "utrie2_impl.h" #include "uassert.h" /* Public UTrie2 API implementation ----------------------------------------- */ static uint32_t get32(const UNewTrie2 *trie, UChar32 c, UBool fromLSCP) { int32_t i2, block; if(c>=trie->highStart && (!U_IS_LEAD(c) || fromLSCP)) { return trie->data[trie->dataLength-UTRIE2_DATA_GRANULARITY]; } if(U_IS_LEAD(c) && fromLSCP) { i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ (c>>UTRIE2_SHIFT_2); } else { i2=trie->index1[c>>UTRIE2_SHIFT_1]+ ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); } block=trie->index2[i2]; return trie->data[block+(c&UTRIE2_DATA_MASK)]; } U_CAPI uint32_t U_EXPORT2 utrie2_get32(const UTrie2 *trie, UChar32 c) { if(trie->data16!=NULL) { return UTRIE2_GET16(trie, c); } else if(trie->data32!=NULL) { return UTRIE2_GET32(trie, c); } else if((uint32_t)c>0x10ffff) { return trie->errorValue; } else { return get32(trie->newTrie, c, TRUE); } } U_CAPI uint32_t U_EXPORT2 utrie2_get32FromLeadSurrogateCodeUnit(const UTrie2 *trie, UChar32 c) { if(!U_IS_LEAD(c)) { return trie->errorValue; } if(trie->data16!=NULL) { return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c); } else if(trie->data32!=NULL) { return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); } else { return get32(trie->newTrie, c, FALSE); } } static inline int32_t u8Index(const UTrie2 *trie, UChar32 c, int32_t i) { int32_t idx= _UTRIE2_INDEX_FROM_CP( trie, trie->data32==NULL ? trie->indexLength : 0, c); return (idx<<3)|i; } U_CAPI int32_t U_EXPORT2 utrie2_internalU8NextIndex(const UTrie2 *trie, UChar32 c, const uint8_t *src, const uint8_t *limit) { int32_t i, length; i=0; /* support 64-bit pointers by avoiding cast of arbitrary difference */ if((limit-src)<=7) { length=(int32_t)(limit-src); } else { length=7; } c=utf8_nextCharSafeBody(src, &i, length, c, -1); return u8Index(trie, c, i); } U_CAPI int32_t U_EXPORT2 utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c, const uint8_t *start, const uint8_t *src) { int32_t i, length; /* support 64-bit pointers by avoiding cast of arbitrary difference */ if((src-start)<=7) { i=length=(int32_t)(src-start); } else { i=length=7; start=src-7; } c=utf8_prevCharSafeBody(start, 0, &i, c, -1); i=length-i; /* number of bytes read backward from src */ return u8Index(trie, c, i); } U_CAPI UTrie2 * U_EXPORT2 utrie2_openFromSerialized(UTrie2ValueBits valueBits, const void *data, int32_t length, int32_t *pActualLength, UErrorCode *pErrorCode) { const UTrie2Header *header; const uint16_t *p16; int32_t actualLength; UTrie2 tempTrie; UTrie2 *trie; if(U_FAILURE(*pErrorCode)) { return 0; } if( length<=0 || (U_POINTER_MASK_LSB(data, 3)!=0) || valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits ) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } /* enough data for a trie header? */ if(length<(int32_t)sizeof(UTrie2Header)) { *pErrorCode=U_INVALID_FORMAT_ERROR; return 0; } /* check the signature */ header=(const UTrie2Header *)data; if(header->signature!=UTRIE2_SIG) { *pErrorCode=U_INVALID_FORMAT_ERROR; return 0; } /* get the options */ if(valueBits!=(UTrie2ValueBits)(header->options&UTRIE2_OPTIONS_VALUE_BITS_MASK)) { *pErrorCode=U_INVALID_FORMAT_ERROR; return 0; } /* get the length values and offsets */ uprv_memset(&tempTrie, 0, sizeof(tempTrie)); tempTrie.indexLength=header->indexLength; tempTrie.dataLength=header->shiftedDataLength<<UTRIE2_INDEX_SHIFT; tempTrie.index2NullOffset=header->index2NullOffset; tempTrie.dataNullOffset=header->dataNullOffset; tempTrie.highStart=header->shiftedHighStart<<UTRIE2_SHIFT_1; tempTrie.highValueIndex=tempTrie.dataLength-UTRIE2_DATA_GRANULARITY; if(valueBits==UTRIE2_16_VALUE_BITS) { tempTrie.highValueIndex+=tempTrie.indexLength; } /* calculate the actual length */ actualLength=(int32_t)sizeof(UTrie2Header)+tempTrie.indexLength*2; if(valueBits==UTRIE2_16_VALUE_BITS) { actualLength+=tempTrie.dataLength*2; } else { actualLength+=tempTrie.dataLength*4; } if(length<actualLength) { *pErrorCode=U_INVALID_FORMAT_ERROR; /* not enough bytes */ return 0; } /* allocate the trie */ trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); if(trie==NULL) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return 0; } uprv_memcpy(trie, &tempTrie, sizeof(tempTrie)); trie->memory=(uint32_t *)data; trie->length=actualLength; trie->isMemoryOwned=FALSE; /* set the pointers to its index and data arrays */ p16=(const uint16_t *)(header+1); trie->index=p16; p16+=trie->indexLength; /* get the data */ switch(valueBits) { case UTRIE2_16_VALUE_BITS: trie->data16=p16; trie->data32=NULL; trie->initialValue=trie->index[trie->dataNullOffset]; trie->errorValue=trie->data16[UTRIE2_BAD_UTF8_DATA_OFFSET]; break; case UTRIE2_32_VALUE_BITS: trie->data16=NULL; trie->data32=(const uint32_t *)p16; trie->initialValue=trie->data32[trie->dataNullOffset]; trie->errorValue=trie->data32[UTRIE2_BAD_UTF8_DATA_OFFSET]; break; default: *pErrorCode=U_INVALID_FORMAT_ERROR; return 0; } if(pActualLength!=NULL) { *pActualLength=actualLength; } return trie; } U_CAPI UTrie2 * U_EXPORT2 utrie2_openDummy(UTrie2ValueBits valueBits, uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { UTrie2 *trie; UTrie2Header *header; uint32_t *p; uint16_t *dest16; int32_t indexLength, dataLength, length, i; int32_t dataMove; /* >0 if the data is moved to the end of the index array */ if(U_FAILURE(*pErrorCode)) { return 0; } if(valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } /* calculate the total length of the dummy trie data */ indexLength=UTRIE2_INDEX_1_OFFSET; dataLength=UTRIE2_DATA_START_OFFSET+UTRIE2_DATA_GRANULARITY; length=(int32_t)sizeof(UTrie2Header)+indexLength*2; if(valueBits==UTRIE2_16_VALUE_BITS) { length+=dataLength*2; } else { length+=dataLength*4; } /* allocate the trie */ trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); if(trie==NULL) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return 0; } uprv_memset(trie, 0, sizeof(UTrie2)); trie->memory=uprv_malloc(length); if(trie->memory==NULL) { uprv_free(trie); *pErrorCode=U_MEMORY_ALLOCATION_ERROR; return 0; } trie->length=length; trie->isMemoryOwned=TRUE; /* set the UTrie2 fields */ if(valueBits==UTRIE2_16_VALUE_BITS) { dataMove=indexLength; } else { dataMove=0; } trie->indexLength=indexLength; trie->dataLength=dataLength; trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET; trie->dataNullOffset=(uint16_t)dataMove; trie->initialValue=initialValue; trie->errorValue=errorValue; trie->highStart=0; trie->highValueIndex=dataMove+UTRIE2_DATA_START_OFFSET; /* set the header fields */ header=(UTrie2Header *)trie->memory; header->signature=UTRIE2_SIG; /* "Tri2" */ header->options=(uint16_t)valueBits; header->indexLength=(uint16_t)indexLength; header->shiftedDataLength=(uint16_t)(dataLength>>UTRIE2_INDEX_SHIFT); header->index2NullOffset=(uint16_t)UTRIE2_INDEX_2_OFFSET; header->dataNullOffset=(uint16_t)dataMove; header->shiftedHighStart=0; /* fill the index and data arrays */ dest16=(uint16_t *)(header+1); trie->index=dest16; /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT */ for(i=0; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { *dest16++=(uint16_t)(dataMove>>UTRIE2_INDEX_SHIFT); /* null data block */ } /* write UTF-8 2-byte index-2 values, not right-shifted */ for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); } for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ *dest16++=(uint16_t)dataMove; } /* write the 16/32-bit data array */ switch(valueBits) { case UTRIE2_16_VALUE_BITS: /* write 16-bit data values */ trie->data16=dest16; trie->data32=NULL; for(i=0; i<0x80; ++i) { *dest16++=(uint16_t)initialValue; } for(; i<0xc0; ++i) { *dest16++=(uint16_t)errorValue; } /* highValue and reserved values */ for(i=0; i<UTRIE2_DATA_GRANULARITY; ++i) { *dest16++=(uint16_t)initialValue; } break; case UTRIE2_32_VALUE_BITS: /* write 32-bit data values */ p=(uint32_t *)dest16; trie->data16=NULL; trie->data32=p; for(i=0; i<0x80; ++i) { *p++=initialValue; } for(; i<0xc0; ++i) { *p++=errorValue; } /* highValue and reserved values */ for(i=0; i<UTRIE2_DATA_GRANULARITY; ++i) { *p++=initialValue; } break; default: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } return trie; } U_CAPI void U_EXPORT2 utrie2_close(UTrie2 *trie) { if(trie!=NULL) { if(trie->isMemoryOwned) { uprv_free(trie->memory); } if(trie->newTrie!=NULL) { uprv_free(trie->newTrie->data); uprv_free(trie->newTrie); } uprv_free(trie); } } U_CAPI int32_t U_EXPORT2 utrie2_getVersion(const void *data, int32_t length, UBool anyEndianOk) { uint32_t signature; if(length<16 || data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0)) { return 0; } signature=*(const uint32_t *)data; if(signature==UTRIE2_SIG) { return 2; } if(anyEndianOk && signature==UTRIE2_OE_SIG) { return 2; } if(signature==UTRIE_SIG) { return 1; } if(anyEndianOk && signature==UTRIE_OE_SIG) { return 1; } return 0; } U_CAPI int32_t U_EXPORT2 utrie2_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode) { const UTrie2Header *inTrie; UTrie2Header trie; int32_t dataLength, size; UTrie2ValueBits valueBits; if(U_FAILURE(*pErrorCode)) { return 0; } if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } /* setup and swapping */ if(length>=0 && length<(int32_t)sizeof(UTrie2Header)) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } inTrie=(const UTrie2Header *)inData; trie.signature=ds->readUInt32(inTrie->signature); trie.options=ds->readUInt16(inTrie->options); trie.indexLength=ds->readUInt16(inTrie->indexLength); trie.shiftedDataLength=ds->readUInt16(inTrie->shiftedDataLength); valueBits=(UTrie2ValueBits)(trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK); dataLength=(int32_t)trie.shiftedDataLength<<UTRIE2_INDEX_SHIFT; if( trie.signature!=UTRIE2_SIG || valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits || trie.indexLength<UTRIE2_INDEX_1_OFFSET || dataLength<UTRIE2_DATA_START_OFFSET ) { *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */ return 0; } size=sizeof(UTrie2Header)+trie.indexLength*2; switch(valueBits) { case UTRIE2_16_VALUE_BITS: size+=dataLength*2; break; case UTRIE2_32_VALUE_BITS: size+=dataLength*4; break; default: *pErrorCode=U_INVALID_FORMAT_ERROR; return 0; } if(length>=0) { UTrie2Header *outTrie; if(length<size) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } outTrie=(UTrie2Header *)outData; /* swap the header */ ds->swapArray32(ds, &inTrie->signature, 4, &outTrie->signature, pErrorCode); ds->swapArray16(ds, &inTrie->options, 12, &outTrie->options, pErrorCode); /* swap the index and the data */ switch(valueBits) { case UTRIE2_16_VALUE_BITS: ds->swapArray16(ds, inTrie+1, (trie.indexLength+dataLength)*2, outTrie+1, pErrorCode); break; case UTRIE2_32_VALUE_BITS: ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode); ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, dataLength*4, (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode); break; default: *pErrorCode=U_INVALID_FORMAT_ERROR; return 0; } } return size; } // utrie2_swapAnyVersion() should be defined here but lives in utrie2_builder.c // to avoid a dependency from utrie2.cpp on utrie.c. /* enumeration -------------------------------------------------------------- */ #define MIN_VALUE(a, b) ((a)<(b) ? (a) : (b)) /* default UTrie2EnumValue() returns the input value itself */ static uint32_t U_CALLCONV enumSameValue(const void * /*context*/, uint32_t value) { return value; } /** * Enumerate all ranges of code points with the same relevant values. * The values are transformed from the raw trie entries by the enumValue function. * * Currently requires start<limit and both start and limit must be multiples * of UTRIE2_DATA_BLOCK_LENGTH. * * Optimizations: * - Skip a whole block if we know that it is filled with a single value, * and it is the same as we visited just before. * - Handle the null block specially because we know a priori that it is filled * with a single value. */ static void enumEitherTrie(const UTrie2 *trie, UChar32 start, UChar32 limit, UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context) { const uint32_t *data32; const uint16_t *idx; uint32_t value, prevValue, initialValue; UChar32 c, prev, highStart; int32_t j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; if(enumRange==NULL) { return; } if(enumValue==NULL) { enumValue=enumSameValue; } if(trie->newTrie==NULL) { /* frozen trie */ idx=trie->index; U_ASSERT(idx!=NULL); /* the following code assumes trie->newTrie is not NULL when idx is NULL */ data32=trie->data32; index2NullOffset=trie->index2NullOffset; nullBlock=trie->dataNullOffset; } else { /* unfrozen, mutable trie */ idx=NULL; data32=trie->newTrie->data; U_ASSERT(data32!=NULL); /* the following code assumes idx is not NULL when data32 is NULL */ index2NullOffset=trie->newTrie->index2NullOffset; nullBlock=trie->newTrie->dataNullOffset; } highStart=trie->highStart; /* get the enumeration value that corresponds to an initial-value trie data entry */ initialValue=enumValue(context, trie->initialValue); /* set variables for previous range */ prevI2Block=-1; prevBlock=-1; prev=start; prevValue=0; /* enumerate index-2 blocks */ for(c=start; c<limit && c<highStart;) { /* Code point limit for iterating inside this i2Block. */ UChar32 tempLimit=c+UTRIE2_CP_PER_INDEX_1_ENTRY; if(limit<tempLimit) { tempLimit=limit; } if(c<=0xffff) { if(!U_IS_SURROGATE(c)) { i2Block=c>>UTRIE2_SHIFT_2; } else if(U_IS_SURROGATE_LEAD(c)) { /* * Enumerate values for lead surrogate code points, not code units: * This special block has half the normal length. */ i2Block=UTRIE2_LSCP_INDEX_2_OFFSET; tempLimit=MIN_VALUE(0xdc00, limit); } else { /* * Switch back to the normal part of the index-2 table. * Enumerate the second half of the surrogates block. */ i2Block=0xd800>>UTRIE2_SHIFT_2; tempLimit=MIN_VALUE(0xe000, limit); } } else { /* supplementary code points */ if(idx!=NULL) { i2Block=idx[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+ (c>>UTRIE2_SHIFT_1)]; } else { i2Block=trie->newTrie->index1[c>>UTRIE2_SHIFT_1]; } if(i2Block==prevI2Block && (c-prev)>=UTRIE2_CP_PER_INDEX_1_ENTRY) { /* * The index-2 block is the same as the previous one, and filled with prevValue. * Only possible for supplementary code points because the linear-BMP index-2 * table creates unique i2Block values. */ c+=UTRIE2_CP_PER_INDEX_1_ENTRY; continue; } } prevI2Block=i2Block; if(i2Block==index2NullOffset) { /* this is the null index-2 block */ if(prevValue!=initialValue) { if(prev<c && !enumRange(context, prev, c-1, prevValue)) { return; } prevBlock=nullBlock; prev=c; prevValue=initialValue; } c+=UTRIE2_CP_PER_INDEX_1_ENTRY; } else { /* enumerate data blocks for one index-2 block */ int32_t i2, i2Limit; i2=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; if((c>>UTRIE2_SHIFT_1)==(tempLimit>>UTRIE2_SHIFT_1)) { i2Limit=(tempLimit>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; } else { i2Limit=UTRIE2_INDEX_2_BLOCK_LENGTH; } for(; i2<i2Limit; ++i2) { if(idx!=NULL) { block=(int32_t)idx[i2Block+i2]<<UTRIE2_INDEX_SHIFT; } else { block=trie->newTrie->index2[i2Block+i2]; } if(block==prevBlock && (c-prev)>=UTRIE2_DATA_BLOCK_LENGTH) { /* the block is the same as the previous one, and filled with prevValue */ c+=UTRIE2_DATA_BLOCK_LENGTH; continue; } prevBlock=block; if(block==nullBlock) { /* this is the null data block */ if(prevValue!=initialValue) { if(prev<c && !enumRange(context, prev, c-1, prevValue)) { return; } prev=c; prevValue=initialValue; } c+=UTRIE2_DATA_BLOCK_LENGTH; } else { for(j=0; j<UTRIE2_DATA_BLOCK_LENGTH; ++j) { value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]); if(value!=prevValue) { if(prev<c && !enumRange(context, prev, c-1, prevValue)) { return; } prev=c; prevValue=value; } ++c; } } } } } if(c>limit) { c=limit; /* could be higher if in the index2NullOffset */ } else if(c<limit) { /* c==highStart<limit */ uint32_t highValue; if(idx!=NULL) { highValue= data32!=NULL ? data32[trie->highValueIndex] : idx[trie->highValueIndex]; } else { highValue=trie->newTrie->data[trie->newTrie->dataLength-UTRIE2_DATA_GRANULARITY]; } value=enumValue(context, highValue); if(value!=prevValue) { if(prev<c && !enumRange(context, prev, c-1, prevValue)) { return; } prev=c; prevValue=value; } c=limit; } /* deliver last range */ enumRange(context, prev, c-1, prevValue); } U_CAPI void U_EXPORT2 utrie2_enum(const UTrie2 *trie, UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context) { enumEitherTrie(trie, 0, 0x110000, enumValue, enumRange, context); } U_CAPI void U_EXPORT2 utrie2_enumForLeadSurrogate(const UTrie2 *trie, UChar32 lead, UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context) { if(!U16_IS_LEAD(lead)) { return; } lead=(lead-0xd7c0)<<10; /* start code point */ enumEitherTrie(trie, lead, lead+0x400, enumValue, enumRange, context); } /* C++ convenience wrappers ------------------------------------------------- */ U_NAMESPACE_BEGIN uint16_t BackwardUTrie2StringIterator::previous16() { codePointLimit=codePointStart; if(start>=codePointStart) { codePoint=U_SENTINEL; return 0; } uint16_t result; UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, result); return result; } uint16_t ForwardUTrie2StringIterator::next16() { codePointStart=codePointLimit; if(codePointLimit==limit) { codePoint=U_SENTINEL; return 0; } uint16_t result; UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, result); return result; } UTrie2 *UTrie2Singleton::getInstance(InstantiatorFn *instantiator, const void *context, UErrorCode &errorCode) { void *duplicate; UTrie2 *instance=(UTrie2 *)singleton.getInstance(instantiator, context, duplicate, errorCode); utrie2_close((UTrie2 *)duplicate); return instance; } U_NAMESPACE_END