// // rbbisetb.cpp // /* *************************************************************************** * Copyright (C) 2002-2008 International Business Machines Corporation * * and others. All rights reserved. * *************************************************************************** */ // // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules // (part of the rule building process.) // // Starting with the rules parse tree from the scanner, // // - Enumerate the set of UnicodeSets that are referenced // by the RBBI rules. // - compute a set of non-overlapping character ranges // with all characters within a range belonging to the same // set of input uniocde sets. // - Derive a set of non-overlapping UnicodeSet (like things) // that will correspond to columns in the state table for // the RBBI execution engine. All characters within one // of these sets belong to the same set of the original // UnicodeSets from the user's rules. // - construct the trie table that maps input characters // to the index of the matching non-overlapping set of set from // the previous step. // #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/uniset.h" #include "utrie.h" #include "uvector.h" #include "uassert.h" #include "cmemory.h" #include "cstring.h" #include "rbbisetb.h" #include "rbbinode.h" //------------------------------------------------------------------------ // // getFoldedRBBIValue Call-back function used during building of Trie table. // Folding value: just store the offset (16 bits) // if there is any non-0 entry. // (It'd really be nice if the Trie builder would provide a // simple default, so this function could go away from here.) // //------------------------------------------------------------------------ /* folding value: just store the offset (16 bits) if there is any non-0 entry */ U_CDECL_BEGIN static uint32_t U_CALLCONV getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) { uint32_t value; UChar32 limit; UBool inBlockZero; limit=start+0x400; while(start<limit) { value=utrie_get32(trie, start, &inBlockZero); if(inBlockZero) { start+=UTRIE_DATA_BLOCK_LENGTH; } else if(value!=0) { return (uint32_t)(offset|0x8000); } else { ++start; } } return 0; } U_CDECL_END U_NAMESPACE_BEGIN //------------------------------------------------------------------------ // // Constructor // //------------------------------------------------------------------------ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) { fRB = rb; fStatus = rb->fStatus; fRangeList = 0; fTrie = 0; fTrieSize = 0; fGroupCount = 0; fSawBOF = FALSE; } //------------------------------------------------------------------------ // // Destructor // //------------------------------------------------------------------------ RBBISetBuilder::~RBBISetBuilder() { RangeDescriptor *nextRangeDesc; // Walk through & delete the linked list of RangeDescriptors for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { RangeDescriptor *r = nextRangeDesc; nextRangeDesc = r->fNext; delete r; } utrie_close(fTrie); } //------------------------------------------------------------------------ // // build Build the list of non-overlapping character ranges // from the Unicode Sets. // //------------------------------------------------------------------------ void RBBISetBuilder::build() { RBBINode *usetNode; RangeDescriptor *rlRange; if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} // // Initialize the process by creating a single range encompassing all characters // that is in no sets. // fRangeList = new RangeDescriptor(*fStatus); // will check for status here if (fRangeList == NULL) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } fRangeList->fStartChar = 0; fRangeList->fEndChar = 0x10ffff; if (U_FAILURE(*fStatus)) { return; } // // Find the set of non-overlapping ranges of characters // int ni; for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; int32_t inputSetRangeCount = inputSet->getRangeCount(); int inputSetRangeIndex = 0; rlRange = fRangeList; for (;;) { if (inputSetRangeIndex >= inputSetRangeCount) { break; } UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); // skip over ranges from the range list that are completely // below the current range from the input unicode set. while (rlRange->fEndChar < inputSetRangeBegin) { rlRange = rlRange->fNext; } // If the start of the range from the range list is before with // the start of the range from the unicode set, split the range list range // in two, with one part being before (wholly outside of) the unicode set // and the other containing the rest. // Then continue the loop; the post-split current range will then be skipped // over if (rlRange->fStartChar < inputSetRangeBegin) { rlRange->split(inputSetRangeBegin, *fStatus); if (U_FAILURE(*fStatus)) { return; } continue; } // Same thing at the end of the ranges... // If the end of the range from the range list doesn't coincide with // the end of the range from the unicode set, split the range list // range in two. The first part of the split range will be // wholly inside the Unicode set. if (rlRange->fEndChar > inputSetRangeEnd) { rlRange->split(inputSetRangeEnd+1, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // The current rlRange is now entirely within the UnicodeSet range. // Add this unicode set to the list of sets for this rlRange if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { rlRange->fIncludesSets->addElement(usetNode, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // Advance over ranges that we are finished with. if (inputSetRangeEnd == rlRange->fEndChar) { inputSetRangeIndex++; } rlRange = rlRange->fNext; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} // // Group the above ranges, with each group consisting of one or more // ranges that are in exactly the same set of original UnicodeSets. // The groups are numbered, and these group numbers are the set of // input symbols recognized by the run-time state machine. // // Numbering: # 0 (state table column 0) is unused. // # 1 is reserved - table column 1 is for end-of-input // # 2 is reserved - table column 2 is for beginning-in-input // # 3 is the first range list. // RangeDescriptor *rlSearchRange; for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { rlRange->fNum = rlSearchRange->fNum; break; } } if (rlRange->fNum == 0) { fGroupCount ++; rlRange->fNum = fGroupCount+2; rlRange->setDictionaryFlag(); addValToSets(rlRange->fIncludesSets, fGroupCount+2); } } // Handle input sets that contain the special string {eof}. // Column 1 of the state table is reserved for EOF on input. // Column 2 is reserved for before-the-start-input. // (This column can be optimized away later if there are no rule // references to {bof}.) // Add this column value (1 or 2) to the equivalent expression // subtree for each UnicodeSet that contains the string {eof} // Because {bof} and {eof} are not a characters in the normal sense, // they doesn't affect the computation of ranges or TRIE. static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; UnicodeString eofString(eofUString); UnicodeString bofString(bofUString); for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; if (inputSet->contains(eofString)) { addValToSet(usetNode, 1); } if (inputSet->contains(bofString)) { addValToSet(usetNode, 2); fSawBOF = TRUE; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} // // Build the Trie table for mapping UChar32 values to the corresponding // range group number // fTrie = utrie_open(NULL, // Pre-existing trie to be filled in NULL, // Data array (utrie will allocate one) 100000, // Max Data Length 0, // Initial value for all code points 0, // Lead surrogate unit value TRUE); // Keep Latin 1 in separately for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); } } //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. // //----------------------------------------------------------------------------------- int32_t RBBISetBuilder::getTrieSize() /*const*/ { fTrieSize = utrie_serialize(fTrie, NULL, // Buffer 0, // Capacity getFoldedRBBIValue, TRUE, // Reduce to 16 bits fStatus); // RBBIDebugPrintf("Trie table size is %d\n", trieSize); return fTrieSize; } //----------------------------------------------------------------------------------- // // serializeTrie() Put the serialized trie at the specified address. // Trust the caller to have given us enough memory. // getTrieSize() MUST be called first. // //----------------------------------------------------------------------------------- void RBBISetBuilder::serializeTrie(uint8_t *where) { utrie_serialize(fTrie, where, // Buffer fTrieSize, // Capacity getFoldedRBBIValue, TRUE, // Reduce to 16 bits fStatus); } //------------------------------------------------------------------------ // // addValToSets Add a runtime-mapped input value to each uset from a // list of uset nodes. (val corresponds to a state table column.) // For each of the original Unicode sets - which correspond // directly to uset nodes - a logically equivalent expression // is constructed in terms of the remapped runtime input // symbol set. This function adds one runtime input symbol to // a list of sets. // // The "logically equivalent expression" is the tree for an // or-ing together of all of the symbols that go into the set. // //------------------------------------------------------------------------ void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { int32_t ix; for (ix=0; ix<sets->size(); ix++) { RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); addValToSet(usetNode, val); } } void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { RBBINode *leafNode = new RBBINode(RBBINode::leafChar); if (leafNode == NULL) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } leafNode->fVal = (unsigned short)val; if (usetNode->fLeftChild == NULL) { usetNode->fLeftChild = leafNode; leafNode->fParent = usetNode; } else { // There are already input symbols present for this set. // Set up an OR node, with the previous stuff as the left child // and the new value as the right child. RBBINode *orNode = new RBBINode(RBBINode::opOr); if (orNode == NULL) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } orNode->fLeftChild = usetNode->fLeftChild; orNode->fRightChild = leafNode; orNode->fLeftChild->fParent = orNode; orNode->fRightChild->fParent = orNode; usetNode->fLeftChild = orNode; orNode->fParent = usetNode; } } //------------------------------------------------------------------------ // // getNumCharCategories // //------------------------------------------------------------------------ int32_t RBBISetBuilder::getNumCharCategories() const { return fGroupCount + 3; } //------------------------------------------------------------------------ // // sawBOF // //------------------------------------------------------------------------ UBool RBBISetBuilder::sawBOF() const { return fSawBOF; } //------------------------------------------------------------------------ // // getFirstChar Given a runtime RBBI character category, find // the first UChar32 that is in the set of chars // in the category. //------------------------------------------------------------------------ UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { RangeDescriptor *rlRange; UChar32 retVal = (UChar32)-1; for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { if (rlRange->fNum == category) { retVal = rlRange->fStartChar; break; } } return retVal; } //------------------------------------------------------------------------ // // printRanges A debugging function. // dump out all of the range definitions. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printRanges() { RangeDescriptor *rlRange; int i; RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode *setRef = usetNode->fParent; if (setRef != NULL) { RBBINode *varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } RBBIDebugPrintf("\n"); } } #endif //------------------------------------------------------------------------ // // printRangeGroups A debugging function. // dump out all of the range groups. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printRangeGroups() { RangeDescriptor *rlRange; RangeDescriptor *tRange; int i; int lastPrintedGroupNum = 0; RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { int groupNum = rlRange->fNum & 0xbfff; if (groupNum > lastPrintedGroupNum) { lastPrintedGroupNum = groupNum; RBBIDebugPrintf("%2i ", groupNum); if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");} for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode *setRef = usetNode->fParent; if (setRef != NULL) { RBBINode *varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } i = 0; for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { if (tRange->fNum == rlRange->fNum) { if (i++ % 5 == 0) { RBBIDebugPrintf("\n "); } RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); } } RBBIDebugPrintf("\n"); } } RBBIDebugPrintf("\n"); } #endif //------------------------------------------------------------------------ // // printSets A debugging function. // dump out all of the set definitions. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printSets() { int i; RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); for (i=0; ; i++) { RBBINode *usetNode; RBBINode *setRef; RBBINode *varRef; UnicodeString setName; usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); if (usetNode == NULL) { break; } RBBIDebugPrintf("%3d ", i); setName = UNICODE_STRING("anonymous", 9); setRef = usetNode->fParent; if (setRef != NULL) { varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); RBBI_DEBUG_printUnicodeString(usetNode->fText); RBBIDebugPrintf("\n"); if (usetNode->fLeftChild != NULL) { usetNode->fLeftChild->printTree(TRUE); } } RBBIDebugPrintf("\n"); } #endif //------------------------------------------------------------------------------------- // // RangeDescriptor copy constructor // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { int i; this->fStartChar = other.fStartChar; this->fEndChar = other.fEndChar; this->fNum = other.fNum; this->fNext = NULL; UErrorCode oldstatus = status; this->fIncludesSets = new UVector(status); if (U_FAILURE(oldstatus)) { status = oldstatus; } if (U_FAILURE(status)) { return; } /* test for NULL */ if (this->fIncludesSets == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (i=0; i<other.fIncludesSets->size(); i++) { this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); } } //------------------------------------------------------------------------------------- // // RangeDesriptor default constructor // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(UErrorCode &status) { this->fStartChar = 0; this->fEndChar = 0; this->fNum = 0; this->fNext = NULL; UErrorCode oldstatus = status; this->fIncludesSets = new UVector(status); if (U_FAILURE(oldstatus)) { status = oldstatus; } if (U_FAILURE(status)) { return; } /* test for NULL */ if(this->fIncludesSets == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } } //------------------------------------------------------------------------------------- // // RangeDesriptor Destructor // //------------------------------------------------------------------------------------- RangeDescriptor::~RangeDescriptor() { delete fIncludesSets; fIncludesSets = NULL; } //------------------------------------------------------------------------------------- // // RangeDesriptor::split() // //------------------------------------------------------------------------------------- void RangeDescriptor::split(UChar32 where, UErrorCode &status) { U_ASSERT(where>fStartChar && where<=fEndChar); RangeDescriptor *nr = new RangeDescriptor(*this, status); if(nr == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(status)) { delete nr; return; } // RangeDescriptor copy constructor copies all fields. // Only need to update those that are different after the split. nr->fStartChar = where; this->fEndChar = where-1; nr->fNext = this->fNext; this->fNext = nr; } //------------------------------------------------------------------------------------- // // RangeDescriptor::setDictionaryFlag // // Character Category Numbers that include characters from // the original Unicode Set named "dictionary" have bit 14 // set to 1. The RBBI runtime engine uses this to trigger // use of the word dictionary. // // This function looks through the Unicode Sets that it // (the range) includes, and sets the bit in fNum when // "dictionary" is among them. // // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. // //------------------------------------------------------------------------------------- void RangeDescriptor::setDictionaryFlag() { int i; for (i=0; i<this->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); UnicodeString setName; RBBINode *setRef = usetNode->fParent; if (setRef != NULL) { RBBINode *varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals. this->fNum |= 0x4000; break; } } } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */