/*
*******************************************************************************
*
*   Copyright (C) 2003-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  unorm_it.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jan21
*   created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION

#include "unicode/uiter.h"
#include "unicode/unorm.h"
#include "unorm_it.h"
#include "cmemory.h"

/* UNormIterator ------------------------------------------------------------ */

enum {
    INITIAL_CAPACITY=100
};

struct UNormIterator {
    UCharIterator api;
    UCharIterator *iter;

    /*
     * chars and states either use the static buffers
     * or are allocated in the same memory block
     *
     * They are parallel arrays with states[] holding the getState() values
     * from normalization boundaries, and UITER_NO_STATE in between.
     */
    UChar *chars;
    uint32_t *states;

    /*
     * api.start: first valid character & state in the arrays
     * api.index: current position
     * api.limit: one past the last valid character in chars[], but states[limit] is valid
     * capacity: length of allocated arrays
     */
    int32_t capacity;

    /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
    uint32_t state;

    /* there are UChars available before start or after limit? */
    UBool hasPrevious, hasNext, isStackAllocated;

    UNormalizationMode mode;

    UChar charsBuffer[INITIAL_CAPACITY];
    uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
};

/*
 * Reset start/index/limit to an empty window and derive hasPrevious/hasNext
 * from the underlying iterator's current position:
 * - at the beginning of the text: window at array index 0
 * - at the end of the text: window at array index capacity
 * - otherwise: window in the middle of the arrays
 */
static void
initIndexes(UNormIterator *uni, UCharIterator *iter) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;

    if(!iter->hasPrevious(iter)) {
        /* set indexes to the beginning of the arrays */
        api->start=api->index=api->limit=0;
        uni->hasPrevious=FALSE;
        uni->hasNext=iter->hasNext(iter);
    } else if(!iter->hasNext(iter)) {
        /* set indexes to the end of the arrays */
        api->start=api->index=api->limit=uni->capacity;
        uni->hasNext=FALSE;
        uni->hasPrevious=iter->hasPrevious(iter);
    } else {
        /* set indexes into the middle of the arrays */
        api->start=api->index=api->limit=uni->capacity/2;
        uni->hasPrevious=uni->hasNext=TRUE;
    }
}

/*
 * Replace chars[]/states[] with a newly allocated block of the given capacity
 * (states[capacity+1] followed by chars[capacity] in one allocation) and copy
 * the valid window [start..limit] over.
 *
 * addAtStart: TRUE  - the additional room is placed before the old contents,
 *                     start/index/limit are shifted by the capacity increase
 *             FALSE - the old contents keep their array indexes, the
 *                     additional room is after them
 *
 * Returns FALSE without modifying uni if capacity is smaller than the current
 * capacity or if the allocation fails.
 */
static UBool
reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;

    uint32_t *states, *oldStates;
    UChar *chars;
    int32_t start, limit, delta;

    /* never shrink; this also rejects a capacity that wrapped around to a negative value */
    if(capacity<uni->capacity) {
        return FALSE;
    }

    /* compute the block size in size_t, not in int32_t */
    states=(uint32_t *)uprv_malloc(
        ((size_t)capacity+1)*sizeof(uint32_t)+
        (size_t)capacity*sizeof(UChar));
    if(states==NULL) {
        return FALSE;
    }

    chars=(UChar *)(states+(capacity+1));

    start=api->start;
    limit=api->limit;

    /*
     * The shift must be computed from the old capacity,
     * that is, before uni->capacity is updated below.
     */
    delta= addAtStart ? capacity-uni->capacity : 0;

    /* states[limit] is valid and is copied as well; chars[] holds 16-bit units */
    uprv_memcpy(states+delta+start, uni->states+start, (size_t)(limit-start+1)*sizeof(uint32_t));
    uprv_memcpy(chars+delta+start, uni->chars+start, (size_t)(limit-start)*sizeof(UChar));

    api->start=start+delta;
    api->index+=delta;
    api->limit=limit+delta;

    /* release a previously allocated block; the built-in buffers are part of uni */
    oldStates=uni->states;
    if(oldStates!=uni->statesBuffer) {
        /* chars and states are allocated in the same memory block */
        uprv_free(oldStates);
    }

    uni->chars=chars;
    uni->states=states;
    uni->capacity=capacity;

    return TRUE;
}

/*
 * Move the array contents to index 0 to make room at the end.
 * Contents before array index delta are dropped, up to the next position with
 * a known state. The cut position is kept inside [start..limit] so that only
 * valid array entries are moved.
 *
 * Sets api->start (to 0) and api->limit; does not set api->index.
 * Returns TRUE if valid contents at the start of the window were dropped.
 */
static UBool
moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    int32_t srcIndex, destIndex, start, limit;
    UBool dropped;

    start=api->start;
    limit=api->limit;
    srcIndex=delta;
    if(srcIndex<=start) {
        /* nothing needs to be dropped, move the whole window */
        srcIndex=start;
    } else if(srcIndex>=limit) {
        /* drop everything, keep only states[limit] */
        srcIndex=limit;
    } else {
        /* look for a position in the arrays with a known state */
        while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
            ++srcIndex;
        }
    }
    dropped=(UBool)(srcIndex>start);

    /* now actually move the array contents */
    api->start=destIndex=0;
    while(srcIndex<limit) {
        chars[destIndex]=chars[srcIndex];
        states[destIndex++]=states[srcIndex++];
    }

    /* copy states[limit] as well! */
    states[destIndex]=states[srcIndex];

    api->limit=destIndex;
    return dropped;
}

/*
 * Move the array contents to the end of the arrays to make room at the start.
 * Contents from array index capacity-delta on are dropped, back to the
 * previous position with a known state. The cut position is kept inside
 * [start..limit] so that only valid array entries are moved.
 *
 * api must be the api member of a UNormIterator (its capacity is read).
 * Sets api->start and api->limit (to capacity); does not set api->index.
 * Returns TRUE if valid contents at the end of the window were dropped.
 */
static UBool
moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    int32_t srcIndex, destIndex, start, limit;
    UBool dropped;

    start=api->start;
    limit=api->limit;
    destIndex=((UNormIterator *)api)->capacity;
    srcIndex=destIndex-delta;
    if(srcIndex>=limit) {
        /* nothing needs to be dropped, move the whole window */
        srcIndex=limit;
    } else if(srcIndex<=start) {
        /* drop everything, keep only states[start] */
        srcIndex=start;
    } else {
        /* look for a position in the arrays with a known state */
        while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
            --srcIndex;
        }
    }
    dropped=(UBool)(srcIndex<limit);

    /* now actually move the array contents */
    api->limit=destIndex;

    /* copy states[limit] as well! */
    states[destIndex]=states[srcIndex];

    while(srcIndex>start) {
        chars[--destIndex]=chars[--srcIndex];
        states[destIndex]=states[srcIndex];
    }

    api->start=destIndex;
    return dropped;
}

/*
 * Normalize forward from the limit, assume hasNext is true.
 * Appends the output of one unorm_next() call at chars[limit] and records the
 * underlying iterator's state at the new limit.
 * Returns FALSE (and sets hasNext=FALSE) on error or if nothing was produced.
 */
static UBool
readNext(UNormIterator *uni, UCharIterator *iter) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;

    /* make capacity/4 room at the end of the arrays */
    int32_t limit, capacity, room;
    UErrorCode errorCode;

    limit=api->limit;
    capacity=uni->capacity;
    room=capacity/4;
    if(room>(capacity-limit)) {
        /* move array contents to make room */
        if(moveContentsTowardStart(api, uni->chars, uni->states, room)) {
            /* buffered text before the new start was dropped */
            uni->hasPrevious=TRUE;
        }
        api->index=limit=api->limit;
    }

    /* normalize starting from the limit position */
    errorCode=U_ZERO_ERROR;
    if(uni->state!=uni->states[limit]) {
        uiter_setState(iter, uni->states[limit], &errorCode);
        if(U_FAILURE(errorCode)) {
            uni->state=UITER_NO_STATE;
            uni->hasNext=FALSE;
            return FALSE;
        }
    }

    room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
        if(room<=capacity) {
            /* empty and re-use the arrays */
            uni->states[0]=uni->states[limit];
            api->start=api->index=api->limit=limit=0;
            uni->hasPrevious=TRUE;
        } else {
            capacity+=room+100;
            if(!reallocArrays(uni, capacity, FALSE)) {
                uni->state=UITER_NO_STATE;
                uni->hasNext=FALSE;
                return FALSE;
            }
            limit=api->limit;
        }

        /* go back to the limit position and normalize again into the larger room */
        errorCode=U_ZERO_ERROR;
        uiter_setState(iter, uni->states[limit], &errorCode);
        room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    }
    if(U_FAILURE(errorCode) || room==0) {
        uni->state=UITER_NO_STATE;
        uni->hasNext=FALSE;
        return FALSE;
    }

    /* room>0 */
    ++limit; /* leave the known states[limit] alone */
    for(--room; room>0; --room) {
        /* set unknown states for all but the normalization boundaries */
        uni->states[limit++]=UITER_NO_STATE;
    }
    uni->states[limit]=uni->state=uiter_getState(iter);
    uni->hasNext=iter->hasNext(iter);
    api->limit=limit;
    return TRUE;
}

/*
 * Normalize backward from the start, assume hasPrevious is true.
 * Prepends the output of one unorm_previous() call before chars[start] and
 * records the underlying iterator's state at the new start.
 * Returns FALSE (and sets hasPrevious=FALSE) on error or if nothing was produced.
 */
static UBool
readPrevious(UNormIterator *uni, UCharIterator *iter) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;

    /* make capacity/4 room at the start of the arrays */
    int32_t start, capacity, room;
    UErrorCode errorCode;

    start=api->start;
    capacity=uni->capacity;
    room=capacity/4;
    if(room>start) {
        /* move array contents to make room */
        if(moveContentsTowardEnd(api, uni->chars, uni->states, room)) {
            /* buffered text after the new limit was dropped */
            uni->hasNext=TRUE;
        }
        api->index=start=api->start;
    }

    /* normalize ending at the start position */
    errorCode=U_ZERO_ERROR;
    if(uni->state!=uni->states[start]) {
        uiter_setState(iter, uni->states[start], &errorCode);
        if(U_FAILURE(errorCode)) {
            uni->state=UITER_NO_STATE;
            uni->hasPrevious=FALSE;
            return FALSE;
        }
    }

    room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
        if(room<=capacity) {
            /* empty and re-use the arrays */
            uni->states[capacity]=uni->states[start];
            api->start=api->index=api->limit=start=capacity;
            uni->hasNext=TRUE;
        } else {
            capacity+=room+100;
            if(!reallocArrays(uni, capacity, TRUE)) {
                uni->state=UITER_NO_STATE;
                uni->hasPrevious=FALSE;
                return FALSE;
            }
            start=api->start;
        }

        /* go back to the start position and normalize again into the larger room */
        errorCode=U_ZERO_ERROR;
        uiter_setState(iter, uni->states[start], &errorCode);
        room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    }
    if(U_FAILURE(errorCode) || room==0) {
        uni->state=UITER_NO_STATE;
        uni->hasPrevious=FALSE;
        return FALSE;
    }

    /* room>0 */
    do {
        /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
        uni->chars[--start]=uni->chars[--room];
        /* set unknown states for all but the normalization boundaries */
        uni->states[start]=UITER_NO_STATE;
    } while(room>0);
    uni->states[start]=uni->state=uiter_getState(iter);
    uni->hasPrevious=iter->hasPrevious(iter);
    api->start=start;
    return TRUE;
}

/* Iterator runtime API functions ------------------------------------------- */

/*
 * getIndex() implementation: only the start index (0) is reported,
 * all other valid origins yield UITER_UNKNOWN_INDEX.
 */
static int32_t U_CALLCONV
unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
    switch(origin) {
    case UITER_ZERO:
    case UITER_START:
        return 0;
    case UITER_CURRENT:
    case UITER_LIMIT:
    case UITER_LENGTH:
        return UITER_UNKNOWN_INDEX;
    default:
        /* not a valid origin */
        /* Should never get here! */
        return -1;
    }
}

/*
 * move() implementation: position at the origin, then step by delta
 * normalized UChars, reading more text as necessary.
 * Returns 0 if the resulting position is the start of the text,
 * UITER_UNKNOWN_INDEX otherwise, or -1 for an invalid origin.
 */
static int32_t U_CALLCONV
unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
    UNormIterator *uni=(UNormIterator *)api;
    UCharIterator *iter=uni->iter;
    int32_t pos;

    switch(origin) {
    case UITER_ZERO:
    case UITER_START:
        /* restart from the beginning */
        if(uni->hasPrevious) {
            iter->move(iter, 0, UITER_START);
            api->start=api->index=api->limit=0;
            uni->states[api->limit]=uni->state=uiter_getState(iter);
            uni->hasPrevious=FALSE;
            uni->hasNext=iter->hasNext(iter);
        } else {
            /* we already have the beginning of the normalized text */
            api->index=api->start;
        }
        break;
    case UITER_CURRENT:
        break;
    case UITER_LIMIT:
    case UITER_LENGTH:
        /* restart from the end */
        if(uni->hasNext) {
            iter->move(iter, 0, UITER_LIMIT);
            api->start=api->index=api->limit=uni->capacity;
            uni->states[api->limit]=uni->state=uiter_getState(iter);
            uni->hasPrevious=iter->hasPrevious(iter);
            uni->hasNext=FALSE;
        } else {
            /* we already have the end of the normalized text */
            api->index=api->limit;
        }
        break;
    default:
        return -1;  /* Error */
    }

    /* move relative to the current position by delta normalized UChars */
    if(delta==0) {
        /* nothing to do */
    } else if(delta>0) {
        /* go forward until the requested position is in the buffer */
        for(;;) {
            pos=api->index+delta;   /* requested position */
            delta=pos-api->limit;   /* remainder beyond buffered text */
            if(delta<=0) {
                api->index=pos;     /* position reached */
                break;
            }

            /* go to end of buffer and normalize further */
            api->index=api->limit;
            if(!uni->hasNext || !readNext(uni, iter)) {
                break;              /* reached end of text */
            }
        }
    } else /* delta<0 */ {
        /* go backward until the requested position is in the buffer */
        for(;;) {
            pos=api->index+delta;   /* requested position */
            delta=pos-api->start;   /* remainder beyond buffered text */
            if(delta>=0) {
                api->index=pos;     /* position reached */
                break;
            }

            /* go to start of buffer and normalize further */
            api->index=api->start;
            if(!uni->hasPrevious || !readPrevious(uni, iter)) {
                break;              /* reached start of text */
            }
        }
    }

    if(api->index==api->start && !uni->hasPrevious) {
        return 0;
    } else {
        return UITER_UNKNOWN_INDEX;
    }
}

/* hasNext() implementation: buffered text after index, or more text to normalize */
static UBool U_CALLCONV
unormIteratorHasNext(UCharIterator *api) {
    return api->index<api->limit || ((UNormIterator *)api)->hasNext;
}

/* hasPrevious() implementation: buffered text before index, or more text to normalize */
static UBool U_CALLCONV
unormIteratorHasPrevious(UCharIterator *api) {
    return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
}

/* current() implementation: the UChar at index without moving, or U_SENTINEL at the end */
static UChar32 U_CALLCONV
unormIteratorCurrent(UCharIterator *api) {
    UNormIterator *uni=(UNormIterator *)api;

    if( api->index<api->limit ||
        (uni->hasNext && readNext(uni, uni->iter))
    ) {
        return uni->chars[api->index];
    } else {
        return U_SENTINEL;
    }
}

/* next() implementation: the UChar at index with post-increment, or U_SENTINEL at the end */
static UChar32 U_CALLCONV
unormIteratorNext(UCharIterator *api) {
    UNormIterator *uni=(UNormIterator *)api;

    if( api->index<api->limit ||
        (uni->hasNext && readNext(uni, uni->iter))
    ) {
        return uni->chars[api->index++];
    } else {
        return U_SENTINEL;
    }
}

/* previous() implementation: pre-decrement index and return that UChar, or U_SENTINEL at the start */
static UChar32 U_CALLCONV
unormIteratorPrevious(UCharIterator *api) {
    UNormIterator *uni=(UNormIterator *)api;

    if( api->index>api->start ||
        (uni->hasPrevious && readPrevious(uni, uni->iter))
    ) {
        return uni->chars[--api->index];
    } else {
        return U_SENTINEL;
    }
}

/*
 * getState() implementation: the state stored for the current index;
 * this is UITER_NO_STATE between normalization boundaries.
 */
static uint32_t U_CALLCONV
unormIteratorGetState(const UCharIterator *api) {
    /* not uni->state because that may not be at api->index */
    return ((UNormIterator *)api)->states[api->index];
}

/*
 * setState() implementation: restore a state returned by getState().
 * Sets U_ILLEGAL_ARGUMENT_ERROR for api==NULL and
 * U_INDEX_OUTOFBOUNDS_ERROR for UITER_NO_STATE.
 */
static void U_CALLCONV
unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        /* do nothing */
    } else if(api==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else if(state==UITER_NO_STATE) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    } else {
        UNormIterator *uni=(UNormIterator *)api;
        UCharIterator *iter=((UNormIterator *)api)->iter;
        if(state!=uni->state) {
            uni->state=state;
            uiter_setState(iter, state, pErrorCode);
        }

        /*
         * Try shortcuts: If the requested state is in the array contents
         * then just set the index there.
         *
         * We assume that the state is unique per position!
         */
        if(state==uni->states[api->index]) {
            return;
        } else if(state==uni->states[api->limit]) {
            api->index=api->limit;
            return;
        } else {
            /* search for the index with this state */
            int32_t i;

            for(i=api->start; i<api->limit; ++i) {
                if(state==uni->states[i]) {
                    api->index=i;
                    return;
                }
            }
        }

        /* there is no array index for this state, reset for fresh contents */
        initIndexes((UNormIterator *)api, iter);
        uni->states[api->limit]=state;
    }
}

/* function table that unorm_setIter() copies into UNormIterator.api */
static const UCharIterator unormIterator={
    NULL, 0, 0, 0, 0, 0,
    unormIteratorGetIndex,
    unormIteratorMove,
    unormIteratorHasNext,
    unormIteratorHasPrevious,
    unormIteratorCurrent,
    unormIteratorNext,
    unormIteratorPrevious,
    NULL,
    unormIteratorGetState,
    unormIteratorSetState
};

/* Setup functions ---------------------------------------------------------- */

/*
 * Create a UNormIterator, in stackMem if that is non-NULL and, after
 * alignment, at least sizeof(UNormIterator) bytes large; otherwise on the heap.
 * Returns NULL if *pErrorCode indicates a failure on entry or if the heap
 * allocation fails (U_MEMORY_ALLOCATION_ERROR).
 */
U_CAPI UNormIterator * U_EXPORT2
unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
    UNormIterator *uni;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }

    /* allocate */
    uni=NULL;
    /*
     * compare as signed values: without the cast a negative stackMemSize
     * would be converted to a large unsigned value and pass this check
     */
    if(stackMem!=NULL && stackMemSize>=(int32_t)sizeof(UNormIterator)) {
        if(U_ALIGNMENT_OFFSET(stackMem)==0) {
            /* already aligned */
            uni=(UNormIterator *)stackMem;
        } else {
            int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
            if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
                /* needs alignment */
                uni=(UNormIterator *)((char *)stackMem+align);
            }
        }
        /* else does not fit */
    }

    if(uni!=NULL) {
        uni->isStackAllocated=TRUE;
    } else {
        uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
        if(uni==NULL) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        uni->isStackAllocated=FALSE;
    }

    /*
     * initialize
     * do not memset because that would unnecessarily initialize the arrays
     */
    uni->iter=NULL;
    uni->chars=uni->charsBuffer;
    uni->states=uni->statesBuffer;
    uni->capacity=INITIAL_CAPACITY;
    uni->state=UITER_NO_STATE;
    uni->hasPrevious=uni->hasNext=FALSE;
    uni->mode=UNORM_NONE;

    /* set a no-op iterator into the api */
    uiter_setString(&uni->api, NULL, 0);
    return uni;
}

/*
 * Release the arrays if they were allocated separately, and the
 * UNormIterator itself unless it lives in caller-provided memory.
 */
U_CAPI void U_EXPORT2
unorm_closeIter(UNormIterator *uni) {
    if(uni!=NULL) {
        if(uni->states!=uni->statesBuffer) {
            /* chars and states are allocated in the same memory block */
            uprv_free(uni->states);
        }
        if(!uni->isStackAllocated) {
            uprv_free(uni);
        }
    }
}

/*
 * Attach a source iterator and a normalization mode to uni and return the
 * normalizing UCharIterator (a pointer to uni's api member).
 * iter must provide getState and setState, and mode must be in
 * [UNORM_NONE, UNORM_MODE_COUNT); otherwise a no-op iterator is set into the
 * api, U_ILLEGAL_ARGUMENT_ERROR is set and NULL is returned.
 */
U_CAPI UCharIterator * U_EXPORT2
unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(uni==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
        mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
    ) {
        /* set a no-op iterator into the api */
        uiter_setString(&uni->api, NULL, 0);
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* set the iterator and initialize */
    uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));

    uni->iter=iter;
    uni->mode=mode;

    initIndexes(uni, iter);
    uni->states[uni->api.limit]=uni->state=uiter_getState(iter);

    return &uni->api;
}

#endif /* uconfig.h switches */