/*
*******************************************************************************
*   Copyright (C) 2004-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  regex.cpp
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/regex.h"
#include "unicode/uregex.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "umutex.h"
#include "uassert.h"
#include "cmemory.h"

#include "regextxt.h"

#include <stdio.h>

U_NAMESPACE_BEGIN

// Space left in a buffer of length 'len' once 'idx' units are used; never negative.
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)

//
//  RegularExpression   The concrete object behind the opaque URegularExpression
//                      handle used by the C API functions in this file.
//
struct RegularExpression: public UMemory {
public:
    RegularExpression();
    ~RegularExpression();

    int32_t           fMagic;         // REXP_MAGIC while the object is alive, 0 after destruction.
    RegexPattern     *fPat;           // Compiled pattern; shared with clones (see uregex_clone).
    int32_t          *fPatRefCount;   // Count of objects sharing fPat / fPatString.
    UChar            *fPatString;     // Copy of the pattern source text.
    int32_t           fPatStringLen;  // Pattern length as recorded by the open function.
    RegexMatcher     *fMatcher;       // Matcher belonging to this object alone.
    const UChar      *fText;          // Text from setText()
    int32_t           fTextLength;    // Length provided by user with setText(), which
                                      //  may be -1.
    UBool             fOwnsText;      // TRUE when the destructor must free fText.
};

static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII

// A new object holds no pattern, no matcher and no input text.
RegularExpression::RegularExpression()
    : fMagic(REXP_MAGIC),
      fPat(NULL),
      fPatRefCount(NULL),
      fPatString(NULL),
      fPatStringLen(0),
      fMatcher(NULL),
      fText(NULL),
      fTextLength(0),
      fOwnsText(FALSE) {
}

// Release the matcher, drop one reference on the shared pattern data
// (freeing it when this was the last reference), and free the text
// buffer if this object owns it.
RegularExpression::~RegularExpression() {
    delete fMatcher;
    fMatcher = NULL;

    if (fPatRefCount != NULL) {
        if (umtx_atomic_dec(fPatRefCount) == 0) {
            delete fPat;
            uprv_free(fPatString);
            uprv_free(fPatRefCount);
        }
    }

    if (fText != NULL && fOwnsText) {
        uprv_free((void *)fText);
    }

    // Invalidate the magic so a stale handle fails validateRE().
    fMagic = 0;
}

U_NAMESPACE_END

U_NAMESPACE_USE

//----------------------------------------------------------------------------------------
//
//   validateRE    Do boilerplate style checks on API function parameters.
//                 Return TRUE if they look OK.
//---------------------------------------------------------------------------------------- static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { if (U_FAILURE(*status)) { return FALSE; } if (re == NULL || re->fMagic != REXP_MAGIC) { *status = U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway if (requiresText && re->fText == NULL && !re->fOwnsText) { *status = U_REGEX_INVALID_STATE; return FALSE; } return TRUE; } //---------------------------------------------------------------------------------------- // // uregex_open // //---------------------------------------------------------------------------------------- U_CAPI URegularExpression * U_EXPORT2 uregex_open( const UChar *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status) { if (U_FAILURE(*status)) { return NULL; } if (pattern == NULL || patternLength < -1 || patternLength == 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } int32_t actualPatLen = patternLength; if (actualPatLen == -1) { actualPatLen = u_strlen(pattern); } RegularExpression *re = new RegularExpression; int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); if (re == NULL || refC == NULL || patBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; uprv_free(refC); uprv_free(patBuf); return NULL; } re->fPatRefCount = refC; *re->fPatRefCount = 1; // // Make a copy of the pattern string, so we can return it later if asked. // For compiling the pattern, we will use a UText wrapper around // this local copy, to avoid making even more copies. 
// re->fPatString = patBuf; re->fPatStringLen = patternLength; u_memcpy(patBuf, pattern, actualPatLen); patBuf[actualPatLen] = 0; UText patText = UTEXT_INITIALIZER; utext_openUChars(&patText, patBuf, patternLength, status); // // Compile the pattern // if (pe != NULL) { re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); } else { re->fPat = RegexPattern::compile(&patText, flags, *status); } utext_close(&patText); if (U_FAILURE(*status)) { goto ErrorExit; } // // Create the matcher object // re->fMatcher = re->fPat->matcher(*status); if (U_SUCCESS(*status)) { return (URegularExpression*)re; } ErrorExit: delete re; return NULL; } //---------------------------------------------------------------------------------------- // // uregex_openUText // //---------------------------------------------------------------------------------------- U_CAPI URegularExpression * U_EXPORT2 uregex_openUText(UText *pattern, uint32_t flags, UParseError *pe, UErrorCode *status) { if (U_FAILURE(*status)) { return NULL; } if (pattern == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } int64_t patternNativeLength = utext_nativeLength(pattern); if (patternNativeLength == 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } RegularExpression *re = new RegularExpression; UErrorCode lengthStatus = U_ZERO_ERROR; int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); if (re == NULL || refC == NULL || patBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; uprv_free(refC); uprv_free(patBuf); return NULL; } re->fPatRefCount = refC; *re->fPatRefCount = 1; // // Make a copy of the pattern string, so we can return it later if asked. // For compiling the pattern, we will use a read-only UText wrapper // around this local copy, to avoid making even more copies. 
// re->fPatString = patBuf; re->fPatStringLen = pattern16Length; utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); UText patText = UTEXT_INITIALIZER; utext_openUChars(&patText, patBuf, pattern16Length, status); // // Compile the pattern // if (pe != NULL) { re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); } else { re->fPat = RegexPattern::compile(&patText, flags, *status); } utext_close(&patText); if (U_FAILURE(*status)) { goto ErrorExit; } // // Create the matcher object // re->fMatcher = re->fPat->matcher(*status); if (U_SUCCESS(*status)) { return (URegularExpression*)re; } ErrorExit: delete re; return NULL; } //---------------------------------------------------------------------------------------- // // uregex_close // //---------------------------------------------------------------------------------------- U_CAPI void U_EXPORT2 uregex_close(URegularExpression *re2) { RegularExpression *re = (RegularExpression*)re2; UErrorCode status = U_ZERO_ERROR; if (validateRE(re, &status, FALSE) == FALSE) { return; } delete re; } //---------------------------------------------------------------------------------------- // // uregex_clone // //---------------------------------------------------------------------------------------- U_CAPI URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *source2, UErrorCode *status) { RegularExpression *source = (RegularExpression*)source2; if (validateRE(source, status, FALSE) == FALSE) { return NULL; } RegularExpression *clone = new RegularExpression; if (clone == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; } clone->fMatcher = source->fPat->matcher(*status); if (U_FAILURE(*status)) { delete clone; return NULL; } clone->fPat = source->fPat; clone->fPatRefCount = source->fPatRefCount; clone->fPatString = source->fPatString; clone->fPatStringLen = source->fPatStringLen; umtx_atomic_inc(source->fPatRefCount); // Note: fText is not cloned. 
return (URegularExpression*)clone; } //------------------------------------------------------------------------------ // // uregex_pattern // //------------------------------------------------------------------------------ U_CAPI const UChar * U_EXPORT2 uregex_pattern(const URegularExpression *regexp2, int32_t *patLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return NULL; } if (patLength != NULL) { *patLength = regexp->fPatStringLen; } return regexp->fPatString; } //------------------------------------------------------------------------------ // // uregex_patternUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_patternUText(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; (void)status; return regexp->fPat->patternText(); } //------------------------------------------------------------------------------ // // uregex_flags // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return 0; } int32_t flags = regexp->fPat->flags(); return flags; } //------------------------------------------------------------------------------ // // uregex_setText // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setText(URegularExpression *regexp2, const UChar *text, int32_t textLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return; } if (text == NULL || textLength < -1) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (regexp->fOwnsText && regexp->fText != NULL) { uprv_free((void 
*)regexp->fText); } regexp->fText = text; regexp->fTextLength = textLength; regexp->fOwnsText = FALSE; UText input = UTEXT_INITIALIZER; utext_openUChars(&input, text, textLength, status); regexp->fMatcher->reset(&input); utext_close(&input); // reset() made a shallow clone, so we don't need this copy } //------------------------------------------------------------------------------ // // uregex_setUText // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setUText(URegularExpression *regexp2, UText *text, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return; } if (text == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (regexp->fOwnsText && regexp->fText != NULL) { uprv_free((void *)regexp->fText); } regexp->fText = NULL; // only fill it in on request regexp->fTextLength = -1; regexp->fOwnsText = TRUE; regexp->fMatcher->reset(text); } //------------------------------------------------------------------------------ // // uregex_getText // //------------------------------------------------------------------------------ U_CAPI const UChar * U_EXPORT2 uregex_getText(URegularExpression *regexp2, int32_t *textLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return NULL; } if (regexp->fText == NULL) { // need to fill in the text UText *inputText = regexp->fMatcher->inputText(); int64_t inputNativeLength = utext_nativeLength(inputText); if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { regexp->fText = inputText->chunkContents; regexp->fTextLength = (int32_t)inputNativeLength; regexp->fOwnsText = FALSE; // because the UText owns it } else { UErrorCode lengthStatus = U_ZERO_ERROR; regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error UChar *inputChars = (UChar 
*)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); regexp->fText = inputChars; regexp->fOwnsText = TRUE; // should already be set but just in case } } if (textLength != NULL) { *textLength = regexp->fTextLength; } return regexp->fText; } //------------------------------------------------------------------------------ // // uregex_getUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_getUText(URegularExpression *regexp2, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return dest; } return regexp->fMatcher->getInput(dest); } // BEGIN android-added // Removed this function after Android upgrade to ICU4.6. //------------------------------------------------------------------------------ // // uregex_refreshUText // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_refreshUText(URegularExpression *regexp2, UText *text, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status, FALSE) == FALSE) { return; } regexp->fMatcher->refreshInputText(text, *status); } // END android-added //------------------------------------------------------------------------------ // // uregex_matches // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_matches(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = FALSE; if (validateRE(regexp, status) == FALSE) { return result; } if (startIndex == -1) { result = regexp->fMatcher->matches(*status); } else { result = regexp->fMatcher->matches(startIndex, *status); } return result; } 
//------------------------------------------------------------------------------ // // uregex_lookingAt // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_lookingAt(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = FALSE; if (validateRE(regexp, status) == FALSE) { return result; } if (startIndex == -1) { result = regexp->fMatcher->lookingAt(*status); } else { result = regexp->fMatcher->lookingAt(startIndex, *status); } return result; } //------------------------------------------------------------------------------ // // uregex_find // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_find(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = FALSE; if (validateRE(regexp, status) == FALSE) { return result; } if (startIndex == -1) { regexp->fMatcher->resetPreserveRegion(); result = regexp->fMatcher->find(); } else { result = regexp->fMatcher->find(startIndex, *status); } return result; } //------------------------------------------------------------------------------ // // uregex_findNext // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_findNext(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return FALSE; } UBool result = regexp->fMatcher->find(); return result; } //------------------------------------------------------------------------------ // // uregex_groupCount // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_groupCount(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if 
(validateRE(regexp, status, FALSE) == FALSE) { return 0; } int32_t result = regexp->fMatcher->groupCount(); return result; } //------------------------------------------------------------------------------ // // uregex_group // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_group(URegularExpression *regexp2, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } if (destCapacity == 0 || regexp->fText != NULL) { // If preflighting or if we already have the text as UChars, // this is a little cheaper than going through uregex_groupUText() // // Pick up the range of characters from the matcher // int32_t startIx = regexp->fMatcher->start(groupNum, *status); int32_t endIx = regexp->fMatcher->end (groupNum, *status); if (U_FAILURE(*status)) { return 0; } // // Trim length based on buffer capacity // int32_t fullLength = endIx - startIx; int32_t copyLength = fullLength; if (copyLength < destCapacity) { dest[copyLength] = 0; } else if (copyLength == destCapacity) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { copyLength = destCapacity; *status = U_BUFFER_OVERFLOW_ERROR; } // // Copy capture group to user's buffer // if (copyLength > 0) { u_memcpy(dest, ®exp->fText[startIx], copyLength); } return fullLength; } else { UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status); int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); utext_close(groupText); return result; } } //------------------------------------------------------------------------------ // // uregex_groupUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 
uregex_groupUText(URegularExpression *regexp2, int32_t groupNum, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { UErrorCode emptyTextStatus = U_ZERO_ERROR; return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); } if (regexp->fText != NULL) { // // Pick up the range of characters from the matcher // and use our already-extracted characters // int32_t startIx = regexp->fMatcher->start(groupNum, *status); int32_t endIx = regexp->fMatcher->end (groupNum, *status); if (U_FAILURE(*status)) { UErrorCode emptyTextStatus = U_ZERO_ERROR; return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); } if (dest) { utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); } else { UText groupText = UTEXT_INITIALIZER; utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); utext_close(&groupText); } return dest; } else { return regexp->fMatcher->group(groupNum, dest, *status); } } //------------------------------------------------------------------------------ // // uregex_start // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_start(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } int32_t result = regexp->fMatcher->start(groupNum, *status); return result; } //------------------------------------------------------------------------------ // // uregex_end // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_end(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } 
int32_t result = regexp->fMatcher->end(groupNum, *status); return result; } //------------------------------------------------------------------------------ // // uregex_reset // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_reset(URegularExpression *regexp2, int32_t index, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return; } regexp->fMatcher->reset(index, *status); } //------------------------------------------------------------------------------ // // uregex_setRegion // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setRegion(URegularExpression *regexp2, int32_t regionStart, int32_t regionLimit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return; } regexp->fMatcher->region(regionStart, regionLimit, *status); } //------------------------------------------------------------------------------ // // uregex_regionStart // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_regionStart(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } return regexp->fMatcher->regionStart(); } //------------------------------------------------------------------------------ // // uregex_regionEnd // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_regionEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } return regexp->fMatcher->regionEnd(); } //------------------------------------------------------------------------------ // // 
uregex_hasTransparentBounds // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_hasTransparentBounds(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return FALSE; } return regexp->fMatcher->hasTransparentBounds(); } //------------------------------------------------------------------------------ // // uregex_useTransparentBounds // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_useTransparentBounds(URegularExpression *regexp2, UBool b, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return; } regexp->fMatcher->useTransparentBounds(b); } //------------------------------------------------------------------------------ // // uregex_hasAnchoringBounds // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_hasAnchoringBounds(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return FALSE; } return regexp->fMatcher->hasAnchoringBounds(); } //------------------------------------------------------------------------------ // // uregex_useAnchoringBounds // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_useAnchoringBounds(URegularExpression *regexp2, UBool b, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return; } regexp->fMatcher->useAnchoringBounds(b); } //------------------------------------------------------------------------------ // // uregex_hitEnd // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 
uregex_hitEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return FALSE; } return regexp->fMatcher->hitEnd(); } //------------------------------------------------------------------------------ // // uregex_requireEnd // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_requireEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return FALSE; } return regexp->fMatcher->requireEnd(); } //------------------------------------------------------------------------------ // // uregex_setTimeLimit // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setTimeLimit(URegularExpression *regexp2, int32_t limit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { regexp->fMatcher->setTimeLimit(limit, *status); } } //------------------------------------------------------------------------------ // // uregex_getTimeLimit // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_getTimeLimit(const URegularExpression *regexp2, UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { retVal = regexp->fMatcher->getTimeLimit(); } return retVal; } //------------------------------------------------------------------------------ // // uregex_setStackLimit // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setStackLimit(URegularExpression *regexp2, int32_t limit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { 
regexp->fMatcher->setStackLimit(limit, *status); } } //------------------------------------------------------------------------------ // // uregex_getStackLimit // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_getStackLimit(const URegularExpression *regexp2, UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { retVal = regexp->fMatcher->getStackLimit(); } return retVal; } //------------------------------------------------------------------------------ // // uregex_setMatchCallback // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setMatchCallback(URegularExpression *regexp2, URegexMatchCallback *callback, const void *context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { regexp->fMatcher->setMatchCallback(callback, context, *status); } } //------------------------------------------------------------------------------ // // uregex_getMatchCallback // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_getMatchCallback(const URegularExpression *regexp2, URegexMatchCallback **callback, const void **context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status)) { regexp->fMatcher->getMatchCallback(*callback, *context, *status); } } //------------------------------------------------------------------------------ // // uregex_replaceAll // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_replaceAll(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, UChar *destBuf, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, 
status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || destBuf == NULL && destCapacity > 0 || destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } int32_t len = 0; uregex_reset(regexp2, 0, status); // Note: Seperate error code variables for findNext() and appendReplacement() // are used so that destination buffer overflow errors // in appendReplacement won't stop findNext() from working. // appendReplacement() and appendTail() special case incoming buffer // overflow errors, continuing to return the correct length. UErrorCode findStatus = *status; while (uregex_findNext(regexp2, &findStatus)) { len += uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); if (U_FAILURE(findStatus)) { // If anything went wrong with the findNext(), make that error trump // whatever may have happened with the append() operations. // Errors in findNext() are not expected. 
*status = findStatus; } return len; } //------------------------------------------------------------------------------ // // uregex_replaceAllUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_replaceAllUText(URegularExpression *regexp2, UText *replacementText, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } if (replacementText == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); return dest; } //------------------------------------------------------------------------------ // // uregex_replaceFirst // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_replaceFirst(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, UChar *destBuf, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || destBuf == NULL && destCapacity > 0 || destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } int32_t len = 0; UBool findSucceeded; uregex_reset(regexp2, 0, status); findSucceeded = uregex_find(regexp2, 0, status); if (findSucceeded) { len = uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); return len; } //------------------------------------------------------------------------------ // // uregex_replaceFirstUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_replaceFirstUText(URegularExpression *regexp2, UText *replacementText, UText *dest, UErrorCode *status) { RegularExpression *regexp = 
(RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } if (replacementText == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); return dest; } //------------------------------------------------------------------------------ // // uregex_appendReplacement // //------------------------------------------------------------------------------ U_NAMESPACE_BEGIN // // Dummy class, because these functions need to be friends of class RegexMatcher, // and stand-alone C functions don't work as friends // class RegexCImpl { public: inline static int32_t appendReplacement(RegularExpression *regexp, const UChar *replacementText, int32_t replacementLength, UChar **destBuf, int32_t *destCapacity, UErrorCode *status); inline static int32_t appendTail(RegularExpression *regexp, UChar **destBuf, int32_t *destCapacity, UErrorCode *status); inline static int32_t split(RegularExpression *regexp, UChar *destBuf, int32_t destCapacity, int32_t *requiredCapacity, UChar *destFields[], int32_t destFieldsCapacity, UErrorCode *status); }; U_NAMESPACE_END static const UChar BACKSLASH = 0x5c; static const UChar DOLLARSIGN = 0x24; // // Move a character to an output buffer, with bounds checking on the index. // Index advances even if capacity is exceeded, for preflight size computations. // This little sequence is used a LOT. // static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { if (*idx < bufCapacity) { buf[*idx] = c; } (*idx)++; } // // appendReplacement, the actual implementation. // int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, const UChar *replacementText, int32_t replacementLength, UChar **destBuf, int32_t *destCapacity, UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. 
// A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle. UBool pendingBufferOverflow = FALSE; if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { pendingBufferOverflow = TRUE; *status = U_ZERO_ERROR; } // // Validate all paramters // if (validateRE(regexp, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || destCapacity == NULL || destBuf == NULL || *destBuf == NULL && *destCapacity > 0 || *destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } RegexMatcher *m = regexp->fMatcher; if (m->fMatch == FALSE) { *status = U_REGEX_INVALID_STATE; return 0; } UChar *dest = *destBuf; int32_t capacity = *destCapacity; int32_t destIdx = 0; int32_t i; // If it wasn't supplied by the caller, get the length of the replacement text. // TODO: slightly smarter logic in the copy loop could watch for the NUL on // the fly and avoid this step. if (replacementLength == -1) { replacementLength = u_strlen(replacementText); } // Copy input string from the end of previous match to start of current match if (regexp->fText != NULL) { int32_t matchStart; int32_t lastMatchEnd; if (UTEXT_USES_U16(m->fInputText)) { lastMatchEnd = (int32_t)m->fLastMatchEnd; matchStart = (int32_t)m->fMatchStart; } else { // !!!: Would like a better way to do this! 
UErrorCode status = U_ZERO_ERROR; lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); status = U_ZERO_ERROR; matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); } for (i=lastMatchEnd; i<matchStart; i++) { appendToBuf(regexp->fText[i], &destIdx, dest, capacity); } } else { UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); } // scan the replacement text, looking for substitutions ($n) and \escapes. int32_t replIdx = 0; while (replIdx < replacementLength) { UChar c = replacementText[replIdx]; replIdx++; if (c != DOLLARSIGN && c != BACKSLASH) { // Common case, no substitution, no escaping, // just copy the char to the dest buf. appendToBuf(c, &destIdx, dest, capacity); continue; } if (c == BACKSLASH) { // Backslash Escape. Copy the following char out without further checks. // Note: Surrogate pairs don't need any special handling // The second half wont be a '$' or a '\', and // will move to the dest normally on the next // loop iteration. if (replIdx >= replacementLength) { break; } c = replacementText[replIdx]; if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. UChar32 escapedChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &replIdx, // Index is updated by unescapeAt replacementLength, // Length of replacement text (void *)replacementText); if (escapedChar != (UChar32)0xFFFFFFFF) { if (escapedChar <= 0xffff) { appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); } else { appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); } continue; } // Note: if the \u escape was invalid, just fall through and // treat it as a plain \<anything> escape. } // Plain backslash escape. Just put out the escaped character. 
appendToBuf(c, &destIdx, dest, capacity); replIdx++; continue; } // We've got a $. Pick up a capture group number if one follows. // Consume at most the number of digits necessary for the largest capture // number that is valid for this pattern. int32_t numDigits = 0; int32_t groupNum = 0; UChar32 digitC; for (;;) { if (replIdx >= replacementLength) { break; } U16_GET(replacementText, 0, replIdx, replacementLength, digitC); if (u_isdigit(digitC) == FALSE) { break; } U16_FWD_1(replacementText, replIdx, replacementLength); groupNum=groupNum*10 + u_charDigitValue(digitC); numDigits++; if (numDigits >= m->fPattern->fMaxCaptureDigits) { break; } } if (numDigits == 0) { // The $ didn't introduce a group number at all. // Treat it as just part of the substitution text. appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); continue; } // Finally, append the capture group data to the destination. destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); if (*status == U_BUFFER_OVERFLOW_ERROR) { // Ignore buffer overflow when extracting the group. We need to // continue on to get full size of the untruncated result. We will // raise our own buffer overflow error at the end. *status = U_ZERO_ERROR; } if (U_FAILURE(*status)) { // Can fail if group number is out of range. break; } } // // Nul Terminate the dest buffer if possible. // Set the appropriate buffer overflow or not terminated error, if needed. // if (destIdx < capacity) { dest[destIdx] = 0; } else if (destIdx == *destCapacity) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { *status = U_BUFFER_OVERFLOW_ERROR; } // // Return an updated dest buffer and capacity to the caller. // if (destIdx > 0 && *destCapacity > 0) { if (destIdx < capacity) { *destBuf += destIdx; *destCapacity -= destIdx; } else { *destBuf += capacity; *destCapacity = 0; } } // If we came in with a buffer overflow, make sure we go out with one also. 
// (A zero length match right at the end of the previous match could // make this function succeed even though a previous call had overflowed the buf) if (pendingBufferOverflow && U_SUCCESS(*status)) { *status = U_BUFFER_OVERFLOW_ERROR; } return destIdx; } // // appendReplacement the actual API function, // U_CAPI int32_t U_EXPORT2 uregex_appendReplacement(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, UChar **destBuf, int32_t *destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendReplacement( regexp, replacementText, replacementLength,destBuf, destCapacity, status); } // // uregex_appendReplacementUText...can just use the normal C++ method // U_CAPI void U_EXPORT2 uregex_appendReplacementUText(URegularExpression *regexp2, UText *replText, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; regexp->fMatcher->appendReplacement(dest, replText, *status); } //------------------------------------------------------------------------------ // // uregex_appendTail // //------------------------------------------------------------------------------ int32_t RegexCImpl::appendTail(RegularExpression *regexp, UChar **destBuf, int32_t *destCapacity, UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle. 
UBool pendingBufferOverflow = FALSE; if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { pendingBufferOverflow = TRUE; *status = U_ZERO_ERROR; } if (validateRE(regexp, status) == FALSE) { return 0; } if (destCapacity == NULL || destBuf == NULL || *destBuf == NULL && *destCapacity > 0 || *destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } RegexMatcher *m = regexp->fMatcher; int32_t destIdx = 0; int32_t destCap = *destCapacity; UChar *dest = *destBuf; if (regexp->fText != NULL) { int32_t srcIdx; int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); if (nativeIdx == -1) { srcIdx = 0; } else if (UTEXT_USES_U16(m->fInputText)) { srcIdx = (int32_t)nativeIdx; } else { UErrorCode status = U_ZERO_ERROR; srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); } for (;;) { if (srcIdx == regexp->fTextLength) { break; } UChar c = regexp->fText[srcIdx]; if (c == 0 && regexp->fTextLength == -1) { regexp->fTextLength = srcIdx; break; } if (destIdx < destCap) { dest[destIdx] = c; } else { // We've overflowed the dest buffer. // If the total input string length is known, we can // compute the total buffer size needed without scanning through the string. if (regexp->fTextLength > 0) { destIdx += (regexp->fTextLength - srcIdx); break; } } srcIdx++; destIdx++; } } else { int64_t srcIdx; if (m->fMatch) { // The most recent call to find() succeeded. srcIdx = m->fMatchEnd; } else { // The last call to find() on this matcher failed(). // Look back to the end of the last find() that succeeded for src index. srcIdx = m->fLastMatchEnd; if (srcIdx == -1) { // There has been no successful match with this matcher. // We want to copy the whole string. srcIdx = 0; } } destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); } // // NUL terminate the output string, if possible, otherwise issue the // appropriate error or warning. 
// if (destIdx < destCap) { dest[destIdx] = 0; } else if (destIdx == destCap) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { *status = U_BUFFER_OVERFLOW_ERROR; } // // Update the user's buffer ptr and capacity vars to reflect the // amount used. // if (destIdx < destCap) { *destBuf += destIdx; *destCapacity -= destIdx; } else { *destBuf += destCap; *destCapacity = 0; } if (pendingBufferOverflow && U_SUCCESS(*status)) { *status = U_BUFFER_OVERFLOW_ERROR; } return destIdx; } // // appendTail the actual API function // U_CAPI int32_t U_EXPORT2 uregex_appendTail(URegularExpression *regexp2, UChar **destBuf, int32_t *destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); } // // uregex_appendTailUText...can just use the normal C++ method // U_CAPI UText * U_EXPORT2 uregex_appendTailUText(URegularExpression *regexp2, UText *dest) { RegularExpression *regexp = (RegularExpression*)regexp2; return regexp->fMatcher->appendTail(dest); } //------------------------------------------------------------------------------ // // copyString Internal utility to copy a string to an output buffer, // while managing buffer overflow and preflight size // computation. NUL termination is added to destination, // and the NUL is counted in the output size. // //------------------------------------------------------------------------------ #if 0 static void copyString(UChar *destBuffer, // Destination buffer. int32_t destCapacity, // Total capacity of dest buffer int32_t *destIndex, // Index into dest buffer. Updated on return. // Update not clipped to destCapacity. const UChar *srcPtr, // Pointer to source string int32_t srcLen) // Source string len. 
{ int32_t si; int32_t di = *destIndex; UChar c; for (si=0; si<srcLen; si++) { c = srcPtr[si]; if (di < destCapacity) { destBuffer[di] = c; di++; } else { di += srcLen - si; break; } } if (di<destCapacity) { destBuffer[di] = 0; } di++; *destIndex = di; } #endif //------------------------------------------------------------------------------ // // uregex_split // //------------------------------------------------------------------------------ int32_t RegexCImpl::split(RegularExpression *regexp, UChar *destBuf, int32_t destCapacity, int32_t *requiredCapacity, UChar *destFields[], int32_t destFieldsCapacity, UErrorCode *status) { // // Reset for the input text // regexp->fMatcher->reset(); UText *inputText = regexp->fMatcher->fInputText; int64_t nextOutputStringStart = 0; int64_t inputLen = regexp->fMatcher->fInputLength; if (inputLen == 0) { return 0; } // // Loop through the input text, searching for the delimiter pattern // int32_t i; // Index of the field being processed. int32_t destIdx = 0; // Next available position in destBuf; int32_t numCaptureGroups = regexp->fMatcher->groupCount(); UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted for (i=0; ; i++) { if (i>=destFieldsCapacity-1) { // There are one or zero output strings left. // Fill the last output string with whatever is left from the input, then exit the loop. // ( i will be == destFieldsCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) if (inputLen > nextOutputStringStart) { if (i != destFieldsCapacity-1) { // No fields are left. Recycle the last one for holding the trailing part of // the input string. 
i = destFieldsCapacity-1; destIdx = (int32_t)(destFields[i] - destFields[0]); } destFields[i] = &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); } break; } if (regexp->fMatcher->find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. destFields[i] = &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); if (tStatus == U_BUFFER_OVERFLOW_ERROR) { tStatus = U_ZERO_ERROR; } else { *status = tStatus; } nextOutputStringStart = regexp->fMatcher->fMatchEnd; // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { // If we've run out of output string slots, bail out. if (i==destFieldsCapacity-1) { break; } i++; // Set up to extract the capture group contents into the dest buffer. destFields[i] = &destBuf[destIdx]; tStatus = U_ZERO_ERROR; int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); destIdx += t + 1; // Record the space used in the output string buffer. // +1 for the NUL that terminates the string. if (tStatus == U_BUFFER_OVERFLOW_ERROR) { tStatus = U_ZERO_ERROR; } else { *status = tStatus; } } if (nextOutputStringStart == inputLen) { // The delimiter was at the end of the string. We're done. break; } } else { // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. 
destFields[i] = &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); break; } } // Zero out any unused portion of the destFields array int j; for (j=i+1; j<destFieldsCapacity; j++) { destFields[j] = NULL; } if (requiredCapacity != NULL) { *requiredCapacity = destIdx; } if (destIdx > destCapacity) { *status = U_BUFFER_OVERFLOW_ERROR; } return i+1; } // // uregex_split The actual API function // U_CAPI int32_t U_EXPORT2 uregex_split(URegularExpression *regexp2, UChar *destBuf, int32_t destCapacity, int32_t *requiredCapacity, UChar *destFields[], int32_t destFieldsCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, status) == FALSE) { return 0; } if (destBuf == NULL && destCapacity > 0 || destCapacity < 0 || destFields == NULL || destFieldsCapacity < 1 ) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); } // // uregex_splitUText...can just use the normal C++ method // U_CAPI int32_t U_EXPORT2 uregex_splitUText(URegularExpression *regexp2, UText *destFields[], int32_t destFieldsCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); } #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS