C++程序  |  1798行  |  59.76 KB

/*
*******************************************************************************
*   Copyright (C) 2004-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  regex.cpp
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/regex.h"
#include "unicode/uregex.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "umutex.h"
#include "uassert.h"
#include "cmemory.h"

#include "regextxt.h"

#include <stdio.h>

U_NAMESPACE_BEGIN

#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)

struct RegularExpression: public UMemory {
public:
    RegularExpression();
    ~RegularExpression();
    int32_t           fMagic;
    RegexPattern     *fPat;
    int32_t          *fPatRefCount;
    UChar            *fPatString;
    int32_t           fPatStringLen;
    RegexMatcher     *fMatcher;
    const UChar      *fText;         // Text from setText()
    int32_t           fTextLength;   // Length provided by user with setText(), which
                                     //  may be -1.
    UBool             fOwnsText;
};

static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII

RegularExpression::RegularExpression() {
    fMagic        = REXP_MAGIC;
    fPat          = NULL;
    fPatRefCount  = NULL;
    fPatString    = NULL;
    fPatStringLen = 0;
    fMatcher      = NULL;
    fText         = NULL;
    fTextLength   = 0;
    fOwnsText     = FALSE;
}

RegularExpression::~RegularExpression() {
    delete fMatcher;
    fMatcher = NULL;
    if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
        delete fPat;
        uprv_free(fPatString);
        uprv_free(fPatRefCount);
    }
    if (fOwnsText && fText!=NULL) {
        uprv_free((void *)fText);
    }
    fMagic = 0;
}

U_NAMESPACE_END

U_NAMESPACE_USE

//----------------------------------------------------------------------------------------
//
//   validateRE    Do boilerplate style checks on API function parameters.
//                 Return TRUE if they look OK.
//----------------------------------------------------------------------------------------
static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
    if (U_FAILURE(*status)) {
        return FALSE;
    }
    if (re == NULL || re->fMagic != REXP_MAGIC) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }
    // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
    if (requiresText && re->fText == NULL && !re->fOwnsText) {
        *status = U_REGEX_INVALID_STATE;
        return FALSE;
    }
    return TRUE;
}

//----------------------------------------------------------------------------------------
//
//    uregex_open
//
//----------------------------------------------------------------------------------------
U_CAPI URegularExpression *  U_EXPORT2
uregex_open( const  UChar          *pattern,
                    int32_t         patternLength,
                    uint32_t        flags,
                    UParseError    *pe,
                    UErrorCode     *status) {

    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    int32_t actualPatLen = patternLength;
    if (actualPatLen == -1) {
        actualPatLen = u_strlen(pattern);
    }

    RegularExpression *re     = new RegularExpression;
    int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
    if (re == NULL || refC == NULL || patBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        delete re;
        uprv_free(refC);
        uprv_free(patBuf);
        return NULL;
    }
    re->fPatRefCount = refC;
    *re->fPatRefCount = 1;

    //
    // Make a copy of the pattern string, so we can return it later if asked.
    //    For compiling the pattern, we will use a UText wrapper around
    //    this local copy, to avoid making even more copies.
    //
    re->fPatString    = patBuf;
    re->fPatStringLen = patternLength;
    u_memcpy(patBuf, pattern, actualPatLen);
    patBuf[actualPatLen] = 0;
    
    UText patText = UTEXT_INITIALIZER;
    utext_openUChars(&patText, patBuf, patternLength, status);

    //
    // Compile the pattern
    //
    if (pe != NULL) {
        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    } else {
        re->fPat = RegexPattern::compile(&patText, flags, *status);
    }
    utext_close(&patText);
    
    if (U_FAILURE(*status)) {
        goto ErrorExit;
    }

    //
    // Create the matcher object
    //
    re->fMatcher = re->fPat->matcher(*status);
    if (U_SUCCESS(*status)) {
        return (URegularExpression*)re;
    }

ErrorExit:
    delete re;
    return NULL;

}

//----------------------------------------------------------------------------------------
//
//    uregex_openUText
//
//----------------------------------------------------------------------------------------
U_CAPI URegularExpression *  U_EXPORT2
uregex_openUText(UText          *pattern,
                 uint32_t        flags,
                 UParseError    *pe,
                 UErrorCode     *status) {
    
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (pattern == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    
    int64_t patternNativeLength = utext_nativeLength(pattern);
    
    if (patternNativeLength == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    
    RegularExpression *re     = new RegularExpression;
    
    UErrorCode lengthStatus = U_ZERO_ERROR;
    int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
    
    int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
    if (re == NULL || refC == NULL || patBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        delete re;
        uprv_free(refC);
        uprv_free(patBuf);
        return NULL;
    }
    re->fPatRefCount = refC;
    *re->fPatRefCount = 1;
    
    //
    // Make a copy of the pattern string, so we can return it later if asked.
    //    For compiling the pattern, we will use a read-only UText wrapper
    //    around this local copy, to avoid making even more copies.
    //
    re->fPatString    = patBuf;
    re->fPatStringLen = pattern16Length;
    utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
    
    UText patText = UTEXT_INITIALIZER;
    utext_openUChars(&patText, patBuf, pattern16Length, status);
    
    //
    // Compile the pattern
    //
    if (pe != NULL) {
        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    } else {
        re->fPat = RegexPattern::compile(&patText, flags, *status);
    }
    utext_close(&patText);
    
    if (U_FAILURE(*status)) {
        goto ErrorExit;
    }
    
    //
    // Create the matcher object
    //
    re->fMatcher = re->fPat->matcher(*status);
    if (U_SUCCESS(*status)) {
        return (URegularExpression*)re;
    }
    
ErrorExit:
    delete re;
    return NULL;
    
}

//----------------------------------------------------------------------------------------
//
//    uregex_close
//
//----------------------------------------------------------------------------------------
U_CAPI void  U_EXPORT2
uregex_close(URegularExpression  *re2) {
    RegularExpression *re = (RegularExpression*)re2;
    UErrorCode  status = U_ZERO_ERROR;
    if (validateRE(re, &status, FALSE) == FALSE) {
        return;
    }
    delete re;
}


//----------------------------------------------------------------------------------------
//
//    uregex_clone
//
//----------------------------------------------------------------------------------------
U_CAPI URegularExpression * U_EXPORT2 
uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
    RegularExpression *source = (RegularExpression*)source2;
    if (validateRE(source, status, FALSE) == FALSE) {
        return NULL;
    }

    RegularExpression *clone = new RegularExpression;
    if (clone == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    clone->fMatcher = source->fPat->matcher(*status);
    if (U_FAILURE(*status)) {
        delete clone;
        return NULL;
    }

    clone->fPat          = source->fPat;
    clone->fPatRefCount  = source->fPatRefCount; 
    clone->fPatString    = source->fPatString;
    clone->fPatStringLen = source->fPatStringLen;
    umtx_atomic_inc(source->fPatRefCount);
    // Note:  fText is not cloned.

    return (URegularExpression*)clone;
}




//------------------------------------------------------------------------------
//
//    uregex_pattern
//
//------------------------------------------------------------------------------
U_CAPI const UChar * U_EXPORT2 
uregex_pattern(const  URegularExpression *regexp2,
                      int32_t            *patLength,
                      UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return NULL;
    }
    if (patLength != NULL) {
        *patLength = regexp->fPatStringLen;
    }
    return regexp->fPatString;
}


//------------------------------------------------------------------------------
//
//    uregex_patternUText
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression *regexp2,
                          UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    (void)status;
    return regexp->fPat->patternText();
}


//------------------------------------------------------------------------------
//
//    uregex_flags
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return 0;
    }
    int32_t flags = regexp->fPat->flags();
    return flags;
}


//------------------------------------------------------------------------------
//
//    uregex_setText
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_setText(URegularExpression *regexp2,
               const UChar        *text,
               int32_t             textLength,
               UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return;
    }
    if (text == NULL || textLength < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    
    if (regexp->fOwnsText && regexp->fText != NULL) {
        uprv_free((void *)regexp->fText);
    }
    
    regexp->fText       = text;
    regexp->fTextLength = textLength;
    regexp->fOwnsText   = FALSE;
    
    UText input = UTEXT_INITIALIZER;
    utext_openUChars(&input, text, textLength, status);
    regexp->fMatcher->reset(&input);
    utext_close(&input); // reset() made a shallow clone, so we don't need this copy
}


//------------------------------------------------------------------------------
//
//    uregex_setUText
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_setUText(URegularExpression *regexp2,
                UText              *text,
                UErrorCode         *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return;
    }
    if (text == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    
    if (regexp->fOwnsText && regexp->fText != NULL) {
        uprv_free((void *)regexp->fText);
    }
    
    regexp->fText       = NULL; // only fill it in on request
    regexp->fTextLength = -1;
    regexp->fOwnsText   = TRUE;
    regexp->fMatcher->reset(text);
}



//------------------------------------------------------------------------------
//
//    uregex_getText
//
//------------------------------------------------------------------------------
U_CAPI const UChar * U_EXPORT2 
uregex_getText(URegularExpression *regexp2,
               int32_t            *textLength,
               UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return NULL;
    }
    
    if (regexp->fText == NULL) {
        // need to fill in the text
        UText *inputText = regexp->fMatcher->inputText();
        int64_t inputNativeLength = utext_nativeLength(inputText);
        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
            regexp->fText = inputText->chunkContents;
            regexp->fTextLength = (int32_t)inputNativeLength;
            regexp->fOwnsText = FALSE; // because the UText owns it
        } else {
            UErrorCode lengthStatus = U_ZERO_ERROR;
            regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
            
            utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
            regexp->fText = inputChars;
            regexp->fOwnsText = TRUE; // should already be set but just in case
        }
    }
    
    if (textLength != NULL) {
        *textLength = regexp->fTextLength;
    }
    return regexp->fText;
}


//------------------------------------------------------------------------------
//
//    uregex_getUText
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression *regexp2,
                UText              *dest,
                UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return dest;
    }
    return regexp->fMatcher->getInput(dest);
}

// BEGIN android-added
// Removed this function after Android upgrade to ICU4.6.
//------------------------------------------------------------------------------
//
//    uregex_refreshUText
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_refreshUText(URegularExpression *regexp2,
                    UText              *text,
                    UErrorCode         *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return;
    }
    regexp->fMatcher->refreshInputText(text, *status);
}
// END android-added


//------------------------------------------------------------------------------
//
//    uregex_matches
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_matches(URegularExpression *regexp2,
                int32_t            startIndex,
                UErrorCode        *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    UBool result = FALSE;
    if (validateRE(regexp, status) == FALSE) {
        return result;
    }
    if (startIndex == -1) {
        result = regexp->fMatcher->matches(*status);
    } else {
        result = regexp->fMatcher->matches(startIndex, *status);
    }
    return result;
}



//------------------------------------------------------------------------------
//
//    uregex_lookingAt
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_lookingAt(URegularExpression *regexp2,
                 int32_t             startIndex,
                 UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    UBool result = FALSE;
    if (validateRE(regexp, status) == FALSE) {
        return result;
    }
    if (startIndex == -1) {
        result = regexp->fMatcher->lookingAt(*status);
    } else {
        result = regexp->fMatcher->lookingAt(startIndex, *status);
    }
    return result;
}



//------------------------------------------------------------------------------
//
//    uregex_find
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_find(URegularExpression *regexp2,
            int32_t             startIndex, 
            UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    UBool result = FALSE;
    if (validateRE(regexp, status) == FALSE) {
        return result;
    }
    if (startIndex == -1) {
        regexp->fMatcher->resetPreserveRegion();
        result = regexp->fMatcher->find();
    } else {
        result = regexp->fMatcher->find(startIndex, *status);
    }
    return result;
}

//------------------------------------------------------------------------------
//
//    uregex_findNext
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_findNext(URegularExpression *regexp2,
                UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return FALSE;
    }
    UBool result = regexp->fMatcher->find();
    return result;
}

//------------------------------------------------------------------------------
//
//    uregex_groupCount
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_groupCount(URegularExpression *regexp2,
                  UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status, FALSE) == FALSE) {
        return 0;
    }
    int32_t  result = regexp->fMatcher->groupCount();
    return result;
}


//------------------------------------------------------------------------------
//
//    uregex_group
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_group(URegularExpression *regexp2,
             int32_t             groupNum,
             UChar              *dest,
             int32_t             destCapacity,
             UErrorCode          *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    
    if (destCapacity == 0 || regexp->fText != NULL) {
        // If preflighting or if we already have the text as UChars,
        // this is a little cheaper than going through uregex_groupUText()
        
        //
        // Pick up the range of characters from the matcher
        //
        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
        if (U_FAILURE(*status)) {
            return 0;
        }

        //
        // Trim length based on buffer capacity
        // 
        int32_t fullLength = endIx - startIx;
        int32_t copyLength = fullLength;
        if (copyLength < destCapacity) {
            dest[copyLength] = 0;
        } else if (copyLength == destCapacity) {
            *status = U_STRING_NOT_TERMINATED_WARNING;
        } else {
            copyLength = destCapacity;
            *status = U_BUFFER_OVERFLOW_ERROR;
        }
        
        //
        // Copy capture group to user's buffer
        //
        if (copyLength > 0) {
            u_memcpy(dest, &regexp->fText[startIx], copyLength);
        }
        return fullLength;
    } else {
        UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status);
        int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
        utext_close(groupText);
        return result;
    }
}


//------------------------------------------------------------------------------
//
//    uregex_groupUText
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2 
uregex_groupUText(URegularExpression *regexp2,
                  int32_t             groupNum,
                  UText              *dest,
                  UErrorCode         *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        UErrorCode emptyTextStatus = U_ZERO_ERROR;
        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    }

    if (regexp->fText != NULL) {
        //
        // Pick up the range of characters from the matcher
        // and use our already-extracted characters
        //
        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
        if (U_FAILURE(*status)) {
            UErrorCode emptyTextStatus = U_ZERO_ERROR;
            return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
        }
        
        if (dest) {
            utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
        } else {
            UText groupText = UTEXT_INITIALIZER;
            utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
            dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
            utext_close(&groupText);
        }
        
        return dest;
    } else {
        return regexp->fMatcher->group(groupNum, dest, *status);
    }
}


//------------------------------------------------------------------------------
//
//    uregex_start
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_start(URegularExpression *regexp2,
             int32_t             groupNum,
             UErrorCode          *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    int32_t result = regexp->fMatcher->start(groupNum, *status);
    return result;
}


//------------------------------------------------------------------------------
//
//    uregex_end
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_end(URegularExpression   *regexp2,
           int32_t               groupNum,
           UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    int32_t result = regexp->fMatcher->end(groupNum, *status);
    return result;
}

//------------------------------------------------------------------------------
//
//    uregex_reset
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_reset(URegularExpression    *regexp2,
             int32_t               index,
             UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return;
    }
    regexp->fMatcher->reset(index, *status);
}


//------------------------------------------------------------------------------
//
//    uregex_setRegion
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_setRegion(URegularExpression   *regexp2,
                 int32_t               regionStart,
                 int32_t               regionLimit,
                 UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return;
    }
    regexp->fMatcher->region(regionStart, regionLimit, *status);
}


//------------------------------------------------------------------------------
//
//    uregex_regionStart
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_regionStart(const  URegularExpression   *regexp2,
                          UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    return regexp->fMatcher->regionStart();
}


//------------------------------------------------------------------------------
//
//    uregex_regionEnd
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_regionEnd(const  URegularExpression   *regexp2,
                        UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    return regexp->fMatcher->regionEnd();
}


//------------------------------------------------------------------------------
//
//    uregex_hasTransparentBounds
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
                                   UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return FALSE;
    }
    return regexp->fMatcher->hasTransparentBounds();
}


//------------------------------------------------------------------------------
//
//    uregex_useTransparentBounds
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_useTransparentBounds(URegularExpression    *regexp2,
                            UBool                  b,
                            UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return;
    }
    regexp->fMatcher->useTransparentBounds(b);
}


//------------------------------------------------------------------------------
//
//    uregex_hasAnchoringBounds
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
                                 UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return FALSE;
    }
    return regexp->fMatcher->hasAnchoringBounds();
}


//------------------------------------------------------------------------------
//
//    uregex_useAnchoringBounds
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_useAnchoringBounds(URegularExpression    *regexp2,
                          UBool                  b,
                          UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return;
    }
    regexp->fMatcher->useAnchoringBounds(b);
}


//------------------------------------------------------------------------------
//
//    uregex_hitEnd
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_hitEnd(const  URegularExpression   *regexp2,
                     UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return FALSE;
    }
    return regexp->fMatcher->hitEnd();
}


//------------------------------------------------------------------------------
//
//    uregex_requireEnd
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2 
uregex_requireEnd(const  URegularExpression   *regexp2,
                         UErrorCode           *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return FALSE;
    }
    return regexp->fMatcher->requireEnd();
}


//------------------------------------------------------------------------------
//
//    uregex_setTimeLimit
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_setTimeLimit(URegularExpression   *regexp2,
                    int32_t               limit,
                    UErrorCode           *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status)) {
        regexp->fMatcher->setTimeLimit(limit, *status);
    }
}



//------------------------------------------------------------------------------
//
//    uregex_getTimeLimit
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_getTimeLimit(const  URegularExpression   *regexp2,
                           UErrorCode           *status) {
    int32_t retVal = 0;
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status)) {
        retVal = regexp->fMatcher->getTimeLimit();
    }
    return retVal;
}



//------------------------------------------------------------------------------
//
//    uregex_setStackLimit
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_setStackLimit(URegularExpression   *regexp2,
                     int32_t               limit,
                     UErrorCode           *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status)) {
        regexp->fMatcher->setStackLimit(limit, *status);
    }
}



//------------------------------------------------------------------------------
//
//    uregex_getStackLimit
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_getStackLimit(const  URegularExpression   *regexp2,
                            UErrorCode           *status) {
    int32_t retVal = 0;
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status)) {
        retVal = regexp->fMatcher->getStackLimit();
    }
    return retVal;
}


//------------------------------------------------------------------------------
//
//    uregex_setMatchCallback
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_setMatchCallback(URegularExpression      *regexp2,
                        URegexMatchCallback     *callback,
                        const void              *context,
                        UErrorCode              *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status)) {
        regexp->fMatcher->setMatchCallback(callback, context, *status);
    }
}


//------------------------------------------------------------------------------
//
//    uregex_getMatchCallback
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2 
uregex_getMatchCallback(const URegularExpression    *regexp2,
                        URegexMatchCallback        **callback,
                        const void                 **context,
                        UErrorCode                  *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
     if (validateRE(regexp, status)) {
         regexp->fMatcher->getMatchCallback(*callback, *context, *status);
     }
}


//------------------------------------------------------------------------------
//
//    uregex_replaceAll
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_replaceAll(URegularExpression    *regexp2,
                  const UChar           *replacementText,
                  int32_t                replacementLength,
                  UChar                 *destBuf,
                  int32_t                destCapacity,
                  UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL || replacementLength < -1 ||
        destBuf == NULL && destCapacity > 0 ||
        destCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t   len = 0;

    uregex_reset(regexp2, 0, status);

    // Note: Seperate error code variables for findNext() and appendReplacement()
    //       are used so that destination buffer overflow errors
    //       in appendReplacement won't stop findNext() from working.
    //       appendReplacement() and appendTail() special case incoming buffer
    //       overflow errors, continuing to return the correct length.
    UErrorCode  findStatus = *status;
    while (uregex_findNext(regexp2, &findStatus)) {
        len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
                                        &destBuf, &destCapacity, status);
    }
    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
    
    if (U_FAILURE(findStatus)) {
        // If anything went wrong with the findNext(), make that error trump
        //   whatever may have happened with the append() operations.
        //   Errors in findNext() are not expected.
        *status = findStatus;
    }

    return len;
}


//------------------------------------------------------------------------------
//
//    uregex_replaceAllUText
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2 
uregex_replaceAllUText(URegularExpression    *regexp2,
                       UText                 *replacementText,
                       UText                 *dest,
                       UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    
    dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
    return dest;
}
    

//------------------------------------------------------------------------------
//
//    uregex_replaceFirst
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2 
uregex_replaceFirst(URegularExpression  *regexp2,
                    const UChar         *replacementText,
                    int32_t              replacementLength,
                    UChar               *destBuf,
                    int32_t              destCapacity,
                    UErrorCode          *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL || replacementLength < -1 ||
        destBuf == NULL && destCapacity > 0 ||
        destCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t   len = 0;
    UBool     findSucceeded;
    uregex_reset(regexp2, 0, status);
    findSucceeded = uregex_find(regexp2, 0, status);
    if (findSucceeded) {
        len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 
                                       &destBuf, &destCapacity, status);
    }
    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);

    return len;
}


//------------------------------------------------------------------------------
//
//    uregex_replaceFirstUText
//
//------------------------------------------------------------------------------
U_CAPI UText * U_EXPORT2 
uregex_replaceFirstUText(URegularExpression  *regexp2,
                         UText                 *replacementText,
                         UText                 *dest,
                         UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    
    dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
    return dest;
}


//------------------------------------------------------------------------------
//
//    uregex_appendReplacement
//
//------------------------------------------------------------------------------

U_NAMESPACE_BEGIN
//
//  Dummy class, because these functions need to be friends of class RegexMatcher,
//               and stand-alone C functions don't work as friends
//
class RegexCImpl {
 public:
   inline static  int32_t appendReplacement(RegularExpression    *regexp,
                      const UChar           *replacementText,
                      int32_t                replacementLength,
                      UChar                **destBuf,
                      int32_t               *destCapacity,
                      UErrorCode            *status);

   inline static int32_t appendTail(RegularExpression    *regexp,
        UChar                **destBuf,
        int32_t               *destCapacity,
        UErrorCode            *status);
                  
    inline static int32_t split(RegularExpression    *regexp,
        UChar                 *destBuf,
        int32_t                destCapacity,
        int32_t               *requiredCapacity,
        UChar                 *destFields[],
        int32_t                destFieldsCapacity,
        UErrorCode            *status);
};

U_NAMESPACE_END



static const UChar BACKSLASH  = 0x5c;
static const UChar DOLLARSIGN = 0x24;

//
//  Move a character to an output buffer, with bounds checking on the index.
//      Index advances even if capacity is exceeded, for preflight size computations.
//      This little sequence is used a LOT.
//
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    if (*idx < bufCapacity) {
        buf[*idx] = c;
    }
    (*idx)++;
}


//
//  appendReplacement, the actual implementation.
//
int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
                                      const UChar           *replacementText,
                                      int32_t                replacementLength,
                                      UChar                **destBuf,
                                      int32_t               *destCapacity,
                                      UErrorCode            *status)  {

    // If we come in with a buffer overflow error, don't suppress the operation.
    //  A series of appendReplacements, appendTail need to correctly preflight
    //  the buffer size when an overflow happens somewhere in the middle.
    UBool pendingBufferOverflow = FALSE;
    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
        pendingBufferOverflow = TRUE;
        *status = U_ZERO_ERROR;
    }

    //
    // Validate all paramters
    //
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL || replacementLength < -1 ||
        destCapacity == NULL || destBuf == NULL || 
        *destBuf == NULL && *destCapacity > 0 ||
        *destCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    RegexMatcher *m = regexp->fMatcher;
    if (m->fMatch == FALSE) {
        *status = U_REGEX_INVALID_STATE;
        return 0;
    }

    UChar    *dest             = *destBuf;
    int32_t   capacity         = *destCapacity;
    int32_t   destIdx          =  0;
    int32_t   i;
    
    // If it wasn't supplied by the caller,  get the length of the replacement text.
    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
    //          the fly and avoid this step.
    if (replacementLength == -1) {
        replacementLength = u_strlen(replacementText);
    }

    // Copy input string from the end of previous match to start of current match
    if (regexp->fText != NULL) {
        int32_t matchStart;
        int32_t lastMatchEnd;
        if (UTEXT_USES_U16(m->fInputText)) {
            lastMatchEnd = (int32_t)m->fLastMatchEnd;
            matchStart = (int32_t)m->fMatchStart;
        } else {
            // !!!: Would like a better way to do this!
            UErrorCode status = U_ZERO_ERROR;
            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
            status = U_ZERO_ERROR;
            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
        }
        for (i=lastMatchEnd; i<matchStart; i++) {
            appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
        }        
    } else {
        UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
        destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
                                 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
    }
    

    // scan the replacement text, looking for substitutions ($n) and \escapes.
    int32_t  replIdx = 0;
    while (replIdx < replacementLength) {
        UChar  c = replacementText[replIdx];
        replIdx++;
        if (c != DOLLARSIGN && c != BACKSLASH) {
            // Common case, no substitution, no escaping, 
            //  just copy the char to the dest buf.
            appendToBuf(c, &destIdx, dest, capacity);
            continue;
        }

        if (c == BACKSLASH) {
            // Backslash Escape.  Copy the following char out without further checks.
            //                    Note:  Surrogate pairs don't need any special handling
            //                           The second half wont be a '$' or a '\', and
            //                           will move to the dest normally on the next
            //                           loop iteration.
            if (replIdx >= replacementLength) {
                break;
            }
            c = replacementText[replIdx];

            if (c==0x55/*U*/ || c==0x75/*u*/) {
                // We have a \udddd or \Udddddddd escape sequence.
                UChar32 escapedChar = 
                    u_unescapeAt(uregex_ucstr_unescape_charAt,
                       &replIdx,                   // Index is updated by unescapeAt 
                       replacementLength,          // Length of replacement text
                       (void *)replacementText);

                if (escapedChar != (UChar32)0xFFFFFFFF) {
                    if (escapedChar <= 0xffff) {
                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
                    } else {
                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
                    }
                    continue;
                }
                // Note:  if the \u escape was invalid, just fall through and
                //        treat it as a plain \<anything> escape.
            }

            // Plain backslash escape.  Just put out the escaped character.
            appendToBuf(c, &destIdx, dest, capacity);

            replIdx++;
            continue;
        }



        // We've got a $.  Pick up a capture group number if one follows.
        // Consume at most the number of digits necessary for the largest capture
        // number that is valid for this pattern.

        int32_t numDigits = 0;
        int32_t groupNum  = 0;
        UChar32 digitC;
        for (;;) {
            if (replIdx >= replacementLength) {
                break;
            }
            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
            if (u_isdigit(digitC) == FALSE) {
                break;
            }

            U16_FWD_1(replacementText, replIdx, replacementLength);
            groupNum=groupNum*10 + u_charDigitValue(digitC);
            numDigits++;
            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
                break;
            }
        }


        if (numDigits == 0) {
            // The $ didn't introduce a group number at all.
            // Treat it as just part of the substitution text.
            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
            continue;
        }

        // Finally, append the capture group data to the destination.
        destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
        if (*status == U_BUFFER_OVERFLOW_ERROR) {
            // Ignore buffer overflow when extracting the group.  We need to
            //   continue on to get full size of the untruncated result.  We will
            //   raise our own buffer overflow error at the end.
            *status = U_ZERO_ERROR;
        }

        if (U_FAILURE(*status)) {
            // Can fail if group number is out of range.
            break;
        }

    }

    //
    //  Nul Terminate the dest buffer if possible.
    //  Set the appropriate buffer overflow or not terminated error, if needed.
    //
    if (destIdx < capacity) {
        dest[destIdx] = 0;
    } else if (destIdx == *destCapacity) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    
    //
    // Return an updated dest buffer and capacity to the caller.
    //
    if (destIdx > 0 &&  *destCapacity > 0) {
        if (destIdx < capacity) {
            *destBuf      += destIdx;
            *destCapacity -= destIdx;
        } else {
            *destBuf      += capacity;
            *destCapacity =  0;
        }
    }

    // If we came in with a buffer overflow, make sure we go out with one also.
    //   (A zero length match right at the end of the previous match could
    //    make this function succeed even though a previous call had overflowed the buf)
    if (pendingBufferOverflow && U_SUCCESS(*status)) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    return destIdx;
}

//
//   appendReplacement   the actual API function,
//
U_CAPI int32_t U_EXPORT2 
uregex_appendReplacement(URegularExpression    *regexp2,
                         const UChar           *replacementText,
                         int32_t                replacementLength,
                         UChar                **destBuf,
                         int32_t               *destCapacity,
                         UErrorCode            *status) {
    
    RegularExpression *regexp = (RegularExpression*)regexp2;
    return RegexCImpl::appendReplacement(
        regexp, replacementText, replacementLength,destBuf, destCapacity, status);
}

//
//   uregex_appendReplacementUText...can just use the normal C++ method
//
U_CAPI void U_EXPORT2 
uregex_appendReplacementUText(URegularExpression    *regexp2,
                              UText                 *replText,
                              UText                 *dest,
                              UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    regexp->fMatcher->appendReplacement(dest, replText, *status);
}


//------------------------------------------------------------------------------
//
//    uregex_appendTail
//
//------------------------------------------------------------------------------
int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
                               UChar                **destBuf,
                               int32_t               *destCapacity,
                               UErrorCode            *status)
{

    // If we come in with a buffer overflow error, don't suppress the operation.
    //  A series of appendReplacements, appendTail need to correctly preflight
    //  the buffer size when an overflow happens somewhere in the middle.
    UBool pendingBufferOverflow = FALSE;
    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
        pendingBufferOverflow = TRUE;
        *status = U_ZERO_ERROR;
    }

    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    
    if (destCapacity == NULL || destBuf == NULL || 
        *destBuf == NULL && *destCapacity > 0 ||
        *destCapacity < 0)
    {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    RegexMatcher *m = regexp->fMatcher;

    int32_t  destIdx     = 0;
    int32_t  destCap     = *destCapacity;
    UChar    *dest       = *destBuf;
    
    if (regexp->fText != NULL) {
        int32_t srcIdx;
        int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
        if (nativeIdx == -1) {
            srcIdx = 0;
        } else if (UTEXT_USES_U16(m->fInputText)) {
            srcIdx = (int32_t)nativeIdx;
        } else {
            UErrorCode status = U_ZERO_ERROR;
            srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
        }
            
        for (;;) {
            if (srcIdx == regexp->fTextLength) {
                break;
            }
            UChar c = regexp->fText[srcIdx];
            if (c == 0 && regexp->fTextLength == -1) {
                regexp->fTextLength = srcIdx;
                break;
            }
            if (destIdx < destCap) {
                dest[destIdx] = c;
            } else {
                // We've overflowed the dest buffer.
                //  If the total input string length is known, we can
                //    compute the total buffer size needed without scanning through the string.
                if (regexp->fTextLength > 0) {
                    destIdx += (regexp->fTextLength - srcIdx);
                    break;
                }
            }
            srcIdx++;
            destIdx++;
        }            
    } else {
        int64_t  srcIdx;
        if (m->fMatch) {
            // The most recent call to find() succeeded.  
            srcIdx = m->fMatchEnd;
        } else {
            // The last call to find() on this matcher failed().
            //   Look back to the end of the last find() that succeeded for src index.
            srcIdx = m->fLastMatchEnd;
            if (srcIdx == -1)  {
                // There has been no successful match with this matcher.
                //   We want to copy the whole string.
                srcIdx = 0;
            }
        }

        destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
    }

    //
    //  NUL terminate the output string, if possible, otherwise issue the
    //   appropriate error or warning.
    //
    if (destIdx < destCap) {
        dest[destIdx] = 0;
    } else  if (destIdx == destCap) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    //
    // Update the user's buffer ptr and capacity vars to reflect the
    //   amount used.
    //
    if (destIdx < destCap) {
        *destBuf      += destIdx;
        *destCapacity -= destIdx;
    } else {
        *destBuf      += destCap;
        *destCapacity  = 0;
    }

    if (pendingBufferOverflow && U_SUCCESS(*status)) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    return destIdx;
}


//
//   appendTail   the actual API function
//
U_CAPI int32_t U_EXPORT2 
uregex_appendTail(URegularExpression    *regexp2,
                  UChar                **destBuf,
                  int32_t               *destCapacity,
                  UErrorCode            *status)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
}


//
//   uregex_appendTailUText...can just use the normal C++ method
//
U_CAPI UText * U_EXPORT2 
uregex_appendTailUText(URegularExpression    *regexp2,
                       UText                 *dest)  {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    return regexp->fMatcher->appendTail(dest);
}


//------------------------------------------------------------------------------
//
//    copyString     Internal utility to copy a string to an output buffer,
//                   while managing buffer overflow and preflight size
//                   computation.  NUL termination is added to destination,
//                   and the NUL is counted in the output size.
//
//------------------------------------------------------------------------------
#if 0
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t  si;
    int32_t  di = *destIndex;
    UChar    c;

    for (si=0; si<srcLen;  si++) {
        c = srcPtr[si];
        if (di < destCapacity) {
            destBuffer[di] = c;
            di++;
        } else {
            di += srcLen - si;
            break;
        }
    }
    if (di<destCapacity) {
        destBuffer[di] = 0;
    }
    di++;
    *destIndex = di;
}
#endif

//------------------------------------------------------------------------------
//
//    uregex_split
//
//------------------------------------------------------------------------------
int32_t RegexCImpl::split(RegularExpression     *regexp,
                          UChar                 *destBuf,
                          int32_t                destCapacity,
                          int32_t               *requiredCapacity,
                          UChar                 *destFields[],
                          int32_t                destFieldsCapacity,
                          UErrorCode            *status) {
    //
    // Reset for the input text
    //
    regexp->fMatcher->reset();
    UText *inputText = regexp->fMatcher->fInputText;
    int64_t   nextOutputStringStart = 0;
    int64_t   inputLen = regexp->fMatcher->fInputLength;
    if (inputLen == 0) {
        return 0;
    }

    //
    // Loop through the input text, searching for the delimiter pattern
    //
    int32_t   i;             // Index of the field being processed.
    int32_t   destIdx = 0;   // Next available position in destBuf;
    int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
    UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
    for (i=0; ; i++) {
        if (i>=destFieldsCapacity-1) {
            // There are one or zero output strings left.
            // Fill the last output string with whatever is left from the input, then exit the loop.
            //  ( i will be == destFieldsCapacity if we filled the output array while processing
            //    capture groups of the delimiter expression, in which case we will discard the
            //    last capture group saved in favor of the unprocessed remainder of the
            //    input string.)
            if (inputLen > nextOutputStringStart) {
                if (i != destFieldsCapacity-1) {
                    // No fields are left.  Recycle the last one for holding the trailing part of
                    //   the input string.
                    i = destFieldsCapacity-1;
                    destIdx = (int32_t)(destFields[i] - destFields[0]);
                }
                
                destFields[i] = &destBuf[destIdx];
                destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
                                             &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
            }
            break;
        }
        
        if (regexp->fMatcher->find()) {
            // We found another delimiter.  Move everything from where we started looking
            //  up until the start of the delimiter into the next output string.
            destFields[i] = &destBuf[destIdx];
            
            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
            if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
                tStatus = U_ZERO_ERROR;
            } else {
                *status = tStatus;
            }
            nextOutputStringStart = regexp->fMatcher->fMatchEnd;
            
            // If the delimiter pattern has capturing parentheses, the captured
            //  text goes out into the next n destination strings.
            int32_t groupNum;
            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
                // If we've run out of output string slots, bail out.
                if (i==destFieldsCapacity-1) {
                    break;
                }
                i++;
                
                // Set up to extract the capture group contents into the dest buffer.
                destFields[i] = &destBuf[destIdx];
                tStatus = U_ZERO_ERROR;
                int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
                destIdx += t + 1;    // Record the space used in the output string buffer.
                                     //  +1 for the NUL that terminates the string.
                if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
                    tStatus = U_ZERO_ERROR;
                } else {
                    *status = tStatus;
                }
            }

            if (nextOutputStringStart == inputLen) {
                // The delimiter was at the end of the string.  We're done.
                break;
            }

        }
        else
        {
            // We ran off the end of the input while looking for the next delimiter.
            // All the remaining text goes into the current output string.
            destFields[i] = &destBuf[destIdx];
            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
            break;
        }
    }

    // Zero out any unused portion of the destFields array
    int j;
    for (j=i+1; j<destFieldsCapacity; j++) {
        destFields[j] = NULL;
    }

    if (requiredCapacity != NULL) {
        *requiredCapacity = destIdx;
    }
    if (destIdx > destCapacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    return i+1;
}

//
//   uregex_split   The actual API function
//
U_CAPI int32_t U_EXPORT2 
uregex_split(URegularExpression      *regexp2,
             UChar                   *destBuf,
             int32_t                  destCapacity,
             int32_t                 *requiredCapacity,
             UChar                   *destFields[],
             int32_t                  destFieldsCapacity,
             UErrorCode              *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, status) == FALSE) {
        return 0;
    }
    if (destBuf == NULL && destCapacity > 0 ||
        destCapacity < 0 ||
        destFields == NULL ||
        destFieldsCapacity < 1 ) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    
    return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
}
    

//
//   uregex_splitUText...can just use the normal C++ method
//
U_CAPI int32_t U_EXPORT2 
uregex_splitUText(URegularExpression    *regexp2,
                  UText                 *destFields[],
                  int32_t                destFieldsCapacity,
                  UErrorCode            *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
}


#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS