/*
**********************************************************************
*   Copyright (c) 2002-2004, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/21/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "strrepl.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"

U_NAMESPACE_BEGIN

static const UChar EMPTY[] = { 0 }; // empty string: ""

UnicodeReplacer::~UnicodeReplacer() {}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)

/**
 * Construct a StringReplacer that emits the given output text and
 * sets the cursor to the given position.
 * @param theOutput text that will replace input text when the
 * replace() method is called.  May contain stand-in characters
 * that represent nested replacers.
 * @param theCursorPos cursor position that will be returned by
 * the replace() method
 * @param theData transliterator context object that translates
 * stand-in characters to UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               int32_t theCursorPos,
                               const TransliterationRuleData* theData) {
    output = theOutput;
    cursorPos = theCursorPos;
    hasCursor = TRUE;
    data = theData;
    // Start out on the complex path; replace() clears this flag once it
    // has scanned 'output' and found no nested replacers.
    isComplex = TRUE;
}

/**
 * Construct a StringReplacer that emits the given output text and
 * does not modify the cursor.
 * @param theOutput text that will replace input text when the
 * replace() method is called.  May contain stand-in characters
 * that represent nested replacers.
 * @param theData transliterator context object that translates
 * stand-in characters to UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               const TransliterationRuleData* theData) {
    output = theOutput;
    cursorPos = 0;
    hasCursor = FALSE;
    data = theData;
    // Start out on the complex path; replace() clears this flag once it
    // has scanned 'output' and found no nested replacers.
    isComplex = TRUE;
}

/**
 * Copy constructor.  Copies every field of 'other'; the 'data' pointer
 * itself is copied, so both objects refer to the same rule data.
 */
StringReplacer::StringReplacer(const StringReplacer& other) :
    UnicodeFunctor(other),
    UnicodeReplacer(other)
{
    output = other.output;
    cursorPos = other.cursorPos;
    hasCursor = other.hasCursor;
    data = other.data;
    isComplex = other.isComplex;
}

/**
 * Destructor
 */
StringReplacer::~StringReplacer() {
}

/**
 * Implement UnicodeFunctor
 * @return a newly allocated copy of this object
 */
UnicodeFunctor* StringReplacer::clone() const {
    return new StringReplacer(*this);
}

/**
 * Implement UnicodeFunctor
 * @return this object, viewed through its UnicodeReplacer base
 */
UnicodeReplacer* StringReplacer::toReplacer() const {
    // Only constness is cast away explicitly; the conversion to the
    // UnicodeReplacer base is the ordinary implicit derived-to-base one.
    return const_cast<StringReplacer*>(this);
}

/**
 * UnicodeReplacer API
 *
 * Replaces text[start, limit) with this object's output, expanding any
 * stand-in characters through their nested replacers.
 * @param text the text to be modified
 * @param start start of the range to replace, inclusive
 * @param limit end of the range to replace, exclusive
 * @param cursor set to the new cursor index if this object has a
 * cursor; also handed to nested replacers.  Not written by this
 * object itself when it has no cursor.
 * @return the length of the text that replaced [start, limit)
 */
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'

        // Assume the simple case; the flag is set again below as soon as
        // a nested replacer is actually found in 'output'.
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when additional characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        int32_t destLimit = destStart;

        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        text.copy(destStart, destLimit, start);
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
    }

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            // Any count left over after reaching the start of the text
            // is applied as a raw offset.
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
                --n;
            }
            // Any count left over after reaching the end of the text
            // is applied as a raw offset.
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}

/**
 * UnicodeReplacer API
 *
 * Rebuilds the rule-pattern text for this replacer into 'rule',
 * replacing whatever 'rule' held before.
 * @param rule receives the pattern
 * @param escapeUnprintable passed through to ICU_Utility::appendToRule
 * and to nested replacers
 * @return a reference to 'rule'
 */
UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
                                                 UBool escapeUnprintable) const {
    rule.truncate(0);
    UnicodeString quoteBuf;

    int32_t cursor = cursorPos;

    // TRUE while a '|' still has to be written at or after the start of
    // the output text.
    UBool cursorPending = hasCursor;

    // Handle a cursor preceding the output.  The '|' is the outermost
    // mark, followed by one '@' per position, e.g. "|@@abc"; this
    // mirrors the trailing form "abc@@|" written further below.
    if (hasCursor && cursor < 0) {
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        for (; cursor < 0; ++cursor) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        // The cursor has been written in full; do not emit '|' again.
        cursorPending = FALSE;
    }

    for (int32_t i=0; i<output.length(); ++i) {
        if (cursorPending && i == cursor) {
            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        }
        UChar c = output.charAt(i); // Ok to use 16-bits here

        UnicodeReplacer* r = data->lookupReplacer(c);
        if (r == NULL) {
            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
            // Stand-in for a nested replacer: append that replacer's own
            // pattern, set off by a space on either side.
            UnicodeString buf;
            r->toReplacerPattern(buf, escapeUnprintable);
            buf.insert(0, (UChar)0x20);
            buf.append((UChar)0x20);
            ICU_Utility::appendToRule(rule, buf, TRUE, escapeUnprintable, quoteBuf);
        }
    }

    // Handle a cursor after the output.  Use > rather than >= because
    // if cursor == output.length() it is at the end of the output,
    // which is the default position, so we need not emit it.
    if (cursorPending && cursor > output.length()) {
        cursor -= output.length();
        while (cursor-- > 0) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    }
    // Flush quoteBuf out to result
    ICU_Utility::appendToRule(rule, -1, TRUE, escapeUnprintable, quoteBuf);

    return rule;
}

/**
 * Implement UnicodeReplacer
 *
 * Walks 'output' one code point at a time.  A code point with no
 * replacer registered in 'data' is added to the set directly; otherwise
 * the nested replacer adds its own replacement set.
 * @param toUnionTo the set to add to
 */
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    UChar32 ch;
    for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
        ch = output.char32At(i);
        UnicodeReplacer* r = data->lookupReplacer(ch);
        if (r == NULL) {
            toUnionTo.add(ch);
        } else {
            r->addReplacementSetTo(toUnionTo);
        }
    }
}

/**
 * UnicodeFunctor API
 *
 * Stores 'd' as this object's rule data, then passes it on to every
 * functor that 'd' maps from a code point of 'output'.
 * @param d the rule data to use from now on
 */
void StringReplacer::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<output.length()) {
        UChar32 c = output.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += UTF_CHAR_LENGTH(c);
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof