/*
******************************************************************************
* Copyright (c) 1996-2011, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* File unorm.cpp
*
* Created by: Vladimir Weinstein 12052000
*
* Modification history :
*
* Date        Name        Description
* 02/01/01    synwee      Added normalization quickcheck enum and method.
* 02/12/01    synwee      Commented out quickcheck util api has been approved
*                         Added private method for doing FCD checks
* 02/23/01    synwee      Modified quickcheck and checkFCE to run through
*                         string for codepoints < 0x300 for the normalization
*                         mode NFC.
* 05/25/01+   Markus Scherer total rewrite, implement all normalization here
*                         instead of just wrappers around normlzr.cpp,
*                         load unorm.dat, support Unicode 3.1 with
*                         supplementary code points, etc.
* 2009-nov..2010-jan  Markus Scherer  total rewrite, new Normalizer2 API & code
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/udata.h"
#include "unicode/ustring.h"
#include "unicode/uiter.h"
#include "unicode/unorm.h"
#include "unicode/unorm2.h"
#include "normalizer2impl.h"
#include "unormimp.h"
#include "uprops.h"
#include "ustr_imp.h"

#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

U_NAMESPACE_USE

/* quick check functions ---------------------------------------------------- */

/**
 * Quick check for the given normalization mode.
 * Looks up the Normalizer2 instance for `mode` and delegates to
 * unorm2_quickCheck().
 *
 * @param src        input text, passed through to unorm2_quickCheck()
 * @param srcLength  length of src, passed through to unorm2_quickCheck()
 * @param mode       normalization mode used to select the Normalizer2 instance
 * @param pErrorCode in/out ICU error code; must not be NULL (it is dereferenced)
 * @return whatever unorm2_quickCheck() returns
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *src, int32_t srcLength,
                 UNormalizationMode mode,
                 UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    return unorm2_quickCheck(reinterpret_cast<const UNormalizer2 *>(n2),
                             src, srcLength, pErrorCode);
}

/**
 * Quick check for the given normalization mode, honoring `options`.
 * When UNORM_UNICODE_3_2 is set in `options`, the normalizer for `mode` is
 * wrapped in a FilteredNormalizer2 built with the set returned by
 * uniset_getUnicode32Instance(); otherwise this behaves like unorm_quickCheck().
 *
 * @param src        input text, passed through to unorm2_quickCheck()
 * @param srcLength  length of src, passed through to unorm2_quickCheck()
 * @param mode       normalization mode used to select the Normalizer2 instance
 * @param options    bit set; only UNORM_UNICODE_3_2 is examined here
 * @param pErrorCode in/out ICU error code; must not be NULL (it is dereferenced)
 * @return whatever unorm2_quickCheck() returns
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
                            UNormalizationMode mode, int32_t options,
                            UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        /*
         * Only build the filter when both lookups succeeded, so that no
         * reference is ever bound through a possibly-NULL pointer.
         */
        if(U_SUCCESS(*pErrorCode)) {
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_quickCheck(
                reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
                src, srcLength, pErrorCode);
        }
        /*
         * An error is pending: fall through and let unorm2_quickCheck()
         * produce its failure result. n2 is only forwarded, not dereferenced.
         * NOTE(review): presumably unorm2_quickCheck() returns early on a
         * failing *pErrorCode; the previous code relied on that as well.
         */
    }
    return unorm2_quickCheck(reinterpret_cast<const UNormalizer2 *>(n2),
                             src, srcLength, pErrorCode);
}

/**
 * Tests whether the text is normalized for the given mode.
 * Looks up the Normalizer2 instance for `mode` and delegates to
 * unorm2_isNormalized().
 *
 * @param src        input text, passed through to unorm2_isNormalized()
 * @param srcLength  length of src, passed through to unorm2_isNormalized()
 * @param mode       normalization mode used to select the Normalizer2 instance
 * @param pErrorCode in/out ICU error code; must not be NULL (it is dereferenced)
 * @return whatever unorm2_isNormalized() returns
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar *src, int32_t srcLength,
                   UNormalizationMode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    return unorm2_isNormalized(reinterpret_cast<const UNormalizer2 *>(n2),
                               src, srcLength, pErrorCode);
}

/**
 * Tests whether the text is normalized for the given mode, honoring `options`.
 * When UNORM_UNICODE_3_2 is set in `options`, the normalizer for `mode` is
 * wrapped in a FilteredNormalizer2 built with the set returned by
 * uniset_getUnicode32Instance(); otherwise this behaves like
 * unorm_isNormalized().
 *
 * @param src        input text, passed through to unorm2_isNormalized()
 * @param srcLength  length of src, passed through to unorm2_isNormalized()
 * @param mode       normalization mode used to select the Normalizer2 instance
 * @param options    bit set; only UNORM_UNICODE_3_2 is examined here
 * @param pErrorCode in/out ICU error code; must not be NULL (it is dereferenced)
 * @return whatever unorm2_isNormalized() returns
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
                              UNormalizationMode mode, int32_t options,
                              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        /* Build the filter only when both lookups succeeded (no NULL dereference). */
        if(U_SUCCESS(*pErrorCode)) {
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_isNormalized(
                reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
                src, srcLength, pErrorCode);
        }
        /*
         * Error pending: fall through; n2 is only forwarded.
         * NOTE(review): presumably unorm2_isNormalized() returns early on a
         * failing *pErrorCode; the previous code relied on that as well.
         */
    }
    return unorm2_isNormalized(reinterpret_cast<const UNormalizer2 *>(n2),
                               src, srcLength, pErrorCode);
}

/* normalize() API ---------------------------------------------------------- */

/**
 * Public API for normalizing.
 * Looks up the Normalizer2 instance for `mode`, optionally wraps it in a
 * FilteredNormalizer2 (when UNORM_UNICODE_3_2 is set in `options`), and
 * delegates to unorm2_normalize().
 *
 * @param src          input text, passed through to unorm2_normalize()
 * @param srcLength    length of src, passed through to unorm2_normalize()
 * @param mode         normalization mode used to select the Normalizer2 instance
 * @param options      bit set; only UNORM_UNICODE_3_2 is examined here
 * @param dest         output buffer, passed through to unorm2_normalize()
 * @param destCapacity capacity of dest, passed through to unorm2_normalize()
 * @param pErrorCode   in/out ICU error code; must not be NULL (it is dereferenced)
 * @return whatever unorm2_normalize() returns
 */
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *src, int32_t srcLength,
                UNormalizationMode mode, int32_t options,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        /* Build the filter only when both lookups succeeded (no NULL dereference). */
        if(U_SUCCESS(*pErrorCode)) {
            FilteredNormalizer2 fn2(*n2, *uni32);
            return unorm2_normalize(
                reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
                src, srcLength, dest, destCapacity, pErrorCode);
        }
        /*
         * Error pending: fall through; n2 is only forwarded.
         * NOTE(review): presumably unorm2_normalize() returns early on a
         * failing *pErrorCode; the previous code relied on that as well.
         */
    }
    return unorm2_normalize(reinterpret_cast<const UNormalizer2 *>(n2),
                            src, srcLength, dest, destCapacity, pErrorCode);
}

/* iteration functions ------------------------------------------------------ */

/**
 * Implementation of unorm_iterate() for an already-selected normalizer.
 *
 * Collects code points from the iterator up to the next position (forward)
 * or back to the previous position (backward) where
 * n2->hasBoundaryBefore() is true, then either normalizes that run into
 * dest or copies it unchanged.
 *
 * @param src                iterator to read from; must not be NULL
 * @param forward            TRUE to read with uiter_next32(), FALSE to read
 *                           with uiter_previous32()
 * @param dest               output buffer; may be NULL only if destCapacity==0
 * @param destCapacity       capacity of dest; must be >=0
 * @param n2                 normalizer to use; dereferenced only after the
 *                           error check, so it must be valid when
 *                           *pErrorCode indicates success
 * @param doNormalize        if TRUE (and the run is non-empty) the run is
 *                           normalized, otherwise it is copied as is
 * @param pNeededToNormalize optional output; reset to FALSE, then set to
 *                           whether the normalized text differs from the
 *                           collected run (only on the normalizing path and
 *                           only on success)
 * @param pErrorCode         in/out ICU error code; must not be NULL
 * @return 0 on a pending error or illegal arguments; otherwise the length
 *         reported by u_terminateUChars(), the normalized string, or
 *         UnicodeString::extract(), depending on the path taken
 */
static int32_t
iterateWithNormalizer(UCharIterator *src, UBool forward,
                      UChar *dest, int32_t destCapacity,
                      const Normalizer2 *n2,
                      UBool doNormalize, UBool *pNeededToNormalize,
                      UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        src==NULL
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }
    /* nothing left in the requested direction: return an empty string */
    if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    UnicodeString buffer;
    UChar32 c;
    if(forward) {
        /* get one character and ignore its properties */
        buffer.append(uiter_next32(src));
        /* get all following characters until we see a boundary */
        while((c=uiter_next32(src))>=0) {
            if(n2->hasBoundaryBefore(c)) {
                /* back out the latest movement to stop at the boundary */
                src->move(src, -U16_LENGTH(c), UITER_CURRENT);
                break;
            } else {
                buffer.append(c);
            }
        }
    } else {
        while((c=uiter_previous32(src))>=0) {
            /* always write this character to the front of the buffer */
            buffer.insert(0, c);
            /* stop if this just-copied character is a boundary */
            if(n2->hasBoundaryBefore(c)) {
                break;
            }
        }
    }

    /* destString is constructed over the caller's buffer (dest, destCapacity) */
    UnicodeString destString(dest, 0, destCapacity);
    if(buffer.length()>0 && doNormalize) {
        n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
        if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
            *pNeededToNormalize= destString!=buffer;
        }
        return destString.length();
    } else {
        /* just copy the source characters */
        return buffer.extract(dest, destCapacity, *pErrorCode);
    }
}

/**
 * Shared implementation of unorm_next() and unorm_previous().
 *
 * Selects the Normalizer2 instance for `mode`, wraps it in a
 * FilteredNormalizer2 only when UNORM_UNICODE_3_2 is set in `options`,
 * and hands over to iterateWithNormalizer().
 *
 * The filter is constructed solely inside the options branch: without
 * UNORM_UNICODE_3_2 there is no Unicode 3.2 set, and constructing the
 * filter anyway would require dereferencing a NULL pointer.
 *
 * Parameters other than mode/options are forwarded unchanged; see
 * iterateWithNormalizer().
 *
 * @return 0 if an error is pending after the lookups, otherwise the result
 *         of iterateWithNormalizer()
 */
static int32_t
unorm_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              UNormalizationMode mode, int32_t options,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }
        FilteredNormalizer2 fn2(*n2, *uni32);
        return iterateWithNormalizer(src, forward, dest, destCapacity,
                                     &fn2, doNormalize, pNeededToNormalize, pErrorCode);
    }
    /* iterateWithNormalizer() performs the U_FAILURE check before using n2 */
    return iterateWithNormalizer(src, forward, dest, destCapacity,
                                 n2, doNormalize, pNeededToNormalize, pErrorCode);
}

/**
 * Iterates backward: calls unorm_iterate() with forward=FALSE.
 * All parameters are forwarded unchanged.
 *
 * @return the result of unorm_iterate()
 */
U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator *src,
               UChar *dest, int32_t destCapacity,
               UNormalizationMode mode, int32_t options,
               UBool doNormalize, UBool *pNeededToNormalize,
               UErrorCode *pErrorCode) {
    return unorm_iterate(src, FALSE,
                         dest, destCapacity,
                         mode, options,
                         doNormalize, pNeededToNormalize,
                         pErrorCode);
}

/**
 * Iterates forward: calls unorm_iterate() with forward=TRUE.
 * All parameters are forwarded unchanged.
 *
 * @return the result of unorm_iterate()
 */
U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator *src,
           UChar *dest, int32_t destCapacity,
           UNormalizationMode mode, int32_t options,
           UBool doNormalize, UBool *pNeededToNormalize,
           UErrorCode *pErrorCode) {
    return unorm_iterate(src, TRUE,
                         dest, destCapacity,
                         mode, options,
                         doNormalize, pNeededToNormalize,
                         pErrorCode);
}

/* Concatenation of normalized strings -------------------------------------- */

/**
 * Implementation of unorm_concatenate() for an already-selected normalizer.
 *
 * Validates the arguments, rejects a `right` buffer that overlaps `dest`,
 * puts `left` into a string constructed over `dest` (`left==dest` is
 * allowed), and appends `right` via n2->append().
 *
 * @param left         first string; must not be NULL
 * @param leftLength   length of left; must be >=-1
 * @param right        second string; must not be NULL and must not overlap dest
 * @param rightLength  length of right; must be >=-1. A negative value sets
 *                     the first argument of the UnicodeString constructor
 *                     used for right (presumably its NUL-terminated flag)
 * @param dest         output buffer; may be NULL only if destCapacity==0
 * @param destCapacity capacity of dest; must be >=0
 * @param n2           normalizer to use; dereferenced only after the error
 *                     check, so it must be valid when *pErrorCode indicates
 *                     success
 * @param pErrorCode   in/out ICU error code; must not be NULL
 * @return 0 on a pending error or illegal arguments (*pErrorCode is set to
 *         U_ILLEGAL_ARGUMENT_ERROR for the latter); otherwise the value
 *         returned by UnicodeString::extract()
 */
static int32_t
concatenateWithNormalizer(const UChar *left, int32_t leftLength,
                          const UChar *right, int32_t rightLength,
                          UChar *dest, int32_t destCapacity,
                          const Normalizer2 *n2,
                          UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
        left==NULL || leftLength<-1 ||
        right==NULL || rightLength<-1
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* check for overlapping right and destination */
    if( dest!=NULL &&
        ((right>=dest && right<(dest+destCapacity)) ||
         (rightLength>0 && dest>=right && dest<(right+rightLength)))
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* allow left==dest */
    UnicodeString destString;
    if(left==dest) {
        destString.setTo(dest, leftLength, destCapacity);
    } else {
        destString.setTo(dest, 0, destCapacity);
        destString.append(left, leftLength);
    }
    return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
           extract(dest, destCapacity, *pErrorCode);
}

/**
 * Concatenates two strings through the normalizer for `mode`.
 *
 * Selects the Normalizer2 instance for `mode`, wraps it in a
 * FilteredNormalizer2 only when UNORM_UNICODE_3_2 is set in `options`,
 * and hands over to concatenateWithNormalizer().
 *
 * The filter is constructed solely inside the options branch: without
 * UNORM_UNICODE_3_2 there is no Unicode 3.2 set, and constructing the
 * filter anyway would require dereferencing a NULL pointer.
 *
 * @param left         first string; must not be NULL
 * @param leftLength   length of left; must be >=-1
 * @param right        second string; must not be NULL and must not overlap dest
 * @param rightLength  length of right; must be >=-1
 * @param dest         output buffer; may be NULL only if destCapacity==0
 * @param destCapacity capacity of dest; must be >=0
 * @param mode         normalization mode used to select the Normalizer2 instance
 * @param options      bit set; only UNORM_UNICODE_3_2 is examined here
 * @param pErrorCode   in/out ICU error code; must not be NULL (it is dereferenced)
 * @return 0 if an error is pending after the lookups, otherwise the result
 *         of concatenateWithNormalizer()
 */
U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
                  const UChar *right, int32_t rightLength,
                  UChar *dest, int32_t destCapacity,
                  UNormalizationMode mode, int32_t options,
                  UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }
        FilteredNormalizer2 fn2(*n2, *uni32);
        return concatenateWithNormalizer(left, leftLength, right, rightLength,
                                         dest, destCapacity, &fn2, pErrorCode);
    }
    /* concatenateWithNormalizer() performs the U_FAILURE check before using n2 */
    return concatenateWithNormalizer(left, leftLength, right, rightLength,
                                     dest, destCapacity, n2, pErrorCode);
}

#endif /* #if !UCONFIG_NO_NORMALIZATION */