xmlparser.cpp - Android社区 - https://www.androidos.net.cn/

/*
*******************************************************************************
*
*   Copyright (C) 2004-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  xmlparser.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004jul21
*   created by: Andy Heninger
*/

#include <stdio.h>
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/regex.h"
#include "filestrm.h"
#include "xmlparser.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

// character constants
enum {
    x_QUOT=0x22,
    x_AMP=0x26,
    x_APOS=0x27,
    x_LT=0x3c,
    x_GT=0x3e,
    x_l=0x6c
};

#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)

//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
//
UXMLParser::UXMLParser(UErrorCode &status) :
      //  XML Declaration.  XML Production #23.
      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
      //            allow for a possible leading BOM.
      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
      
      //  XML Comment   production #15
      //     example:  "
      //       note, does not detect an illegal "--" within comments
      mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status),
      
      //  XML Spaces
      //      production [3]
      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
      
      //  XML Doctype decl  production #28
      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
      //       or      "<!DOCTYPE foo [internal dtd]>
      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
      //           Some internal dtd subsets could confuse this simple-minded
      //           attempt at skipping over them, specifically, occcurences
      //           of closeing square brackets.  These could appear in comments, 
      //           or in parameter entity declarations, for example.
      mXMLDoctype(UnicodeString(
           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
           ), 0, status),
      
      //  XML PI     production #16
      //     example   "<?target stuff?>
      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
      
      //  XML Element Start   Productions #40, #41
      //          example   <foo att1='abc'  att2="d e f" >
      //      capture #1:  the tag name
      //
      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:" 
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
      
      //  XML Element End     production #42
      //     example   </foo>
      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
      
      // XML Element Empty    production #44
      //     example   <foo att1="abc"   att2="d e f" />
      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:" 
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"

// XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

// Attribute name = "value".  XML Productions 10, 40/41
      //  Capture group 1 is name, 
      //                2 is the attribute value, including the quotes.
      //
      //   Note that attributes are scanned twice.  The first time is with
      //        the regex for an entire element start.  There, the attributes
      //        are checked syntactically, but not separted out one by one.
      //        Here, we match a single attribute, and make its name and
      //        attribute value available to the parser code.
      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),

mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

// Match any of the new-line sequences in content.
      //   All are changed to \u000a.
      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

// & char references
      //   We will figure out what we've got based on which capture group has content.
      //   The last one is a catchall for unrecognized entity references..
      //             1     2     3      4      5           6                    7          8
      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                0, status),

fNames(status),
      fElementStack(status),
      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
      {
      }

UXMLParser *
UXMLParser::createParser(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    } else {
        return new UXMLParser(errorCode);
    }
}

UXMLParser::~UXMLParser() {}

UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    UConverter *cnv;
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

if(U_FAILURE(errorCode)) {
        return NULL;
    }

f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

/*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charser"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
        ucnv_close(cnv);
        cnv=NULL;
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

// parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf((UChar)x_l)+1;

mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

if(attName==UNICODE_STRING("encoding", 8)) {
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }

if(charset==NULL) {
                // default to UTF-8
                charset="UTF-8";
            }
            cnv=ucnv_open(charset, &errorCode);
        }
    }

if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

// convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

pu=buffer+length;
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, FALSE, &errorCode);
            src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                break;
            }
        }

if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

if(flush) {
            break; // completely converted the file
        }

// read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    };

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}

UXMLElement *
UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return NULL;
    }

UXMLElement   *root = NULL;
    fPos = 0; // TODO use just a local pos variable and pass it into functions
              // where necessary?

// set all matchers to work on the input string
    mXMLDecl.reset(src);
    mXMLComment.reset(src);
    mXMLSP.reset(src);
    mXMLDoctype.reset(src);
    mXMLPI.reset(src);
    mXMLElemStart.reset(src);
    mXMLElemEnd.reset(src);
    mXMLElemEmpty.reset(src);
    mXMLCharData.reset(src);
    mAttrValue.reset(src);
    mAttrNormalizer.reset(src);
    mNewLineNormalizer.reset(src);
    mAmps.reset(src);

// Consume the XML Declaration, if present.
    if (mXMLDecl.lookingAt(fPos, status)) {
        fPos = mXMLDecl.end(status);
    }

// Consume "misc" [XML production 27] appearing before DocType
    parseMisc(status);

// Consume a DocType declaration, if present.
    if (mXMLDoctype.lookingAt(fPos, status)) {
        fPos = mXMLDoctype.end(status);
    }

// Consume additional "misc" [XML production 27] appearing after the DocType
    parseMisc(status);

// Get the root element
    if (mXMLElemEmpty.lookingAt(fPos, status)) {
        // Root is an empty element (no nested elements or content)
        root = createElement(mXMLElemEmpty, status);
        fPos = mXMLElemEmpty.end(status);
    } else {
        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
            error("Root Element expected", status);
            goto errorExit;
        }
        root = createElement(mXMLElemStart, status);
        UXMLElement  *el = root;

//
        // This is the loop that consumes the root element of the document,
        //      including all nested content.   Nested elements are handled by
        //      explicit pushes/pops of the element stack; there is no recursion
        //      in the control flow of this code.
        //      "el" always refers to the current element, the one to which content
        //      is being added.  It is above the top of the element stack.
        for (;;) {
            // Nested Element Start
            if (mXMLElemStart.lookingAt(fPos, status)) {
                UXMLElement *t = createElement(mXMLElemStart, status);
                el->fChildren.addElement(t, status);
                t->fParent = el;
                fElementStack.push(el, status);
                el = t;
                continue;
            }

// Text Content.  String is concatenated onto the current node's content,
            //                but only if it contains something other than spaces.
            UnicodeString s = scanContent(status);
            if (s.length() > 0) {
                mXMLSP.reset(s);
                if (mXMLSP.matches(status) == FALSE) {
                    // This chunk of text contains something other than just
                    //  white space. Make a child node for it.
                    replaceCharRefs(s, status);
                    el->fChildren.addElement(s.clone(), status);
                }
                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
                continue;
            }

// Comments.  Discard.
            if (mXMLComment.lookingAt(fPos, status)) {
                fPos = mXMLComment.end(status);
                continue;
            }

// PIs.  Discard.
            if (mXMLPI.lookingAt(fPos, status)) {
                fPos = mXMLPI.end(status);
                continue;
            }

// Element End
            if (mXMLElemEnd.lookingAt(fPos, status)) {
                fPos = mXMLElemEnd.end(0, status);
                const UnicodeString name = mXMLElemEnd.group(1, status);
                if (name != *el->fName) {
                    error("Element start / end tag mismatch", status);
                    goto errorExit;
                }
                if (fElementStack.empty()) {
                    // Close of the root element.  We're done with the doc.
                    el = NULL;
                    break;
                }
                el = (UXMLElement *)fElementStack.pop();
                continue;
            }

// Empty Element.  Stored as a child of the current element, but not stacked.
            if (mXMLElemEmpty.lookingAt(fPos, status)) {
                UXMLElement *t = createElement(mXMLElemEmpty, status);
                el->fChildren.addElement(t, status);
                continue;
            }

// Hit something within the document that doesn't match anything.
            //   It's an error.
            error("Unrecognized markup", status);
            break;
        }

if (el != NULL || !fElementStack.empty()) {
            // We bailed out early, for some reason.
            error("Root element not closed.", status);
            goto errorExit;
        }
    }

// Root Element parse is complete.
    // Consume the annoying xml "Misc" that can appear at the end of the doc.
    parseMisc(status);

// We should have reached the end of the input
    if (fPos != src.length()) {
        error("Extra content at the end of the document", status);
        goto errorExit;
    }

// Success!
    return root;

errorExit:
    delete root;
    return NULL;
}

//
//  createElement
//      We've just matched an element start tag.  Create and fill in a UXMLElement object
//      for it.
//
UXMLElement *
UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
    // First capture group is the element's name.
    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);

// Scan for attributes.
    int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name

while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
        UnicodeString attName  = mAttrValue.group(1, status);
        UnicodeString attValue = mAttrValue.group(2, status);

// Trim the quotes from the att value.  These are left over from the original regex
        //   that parsed the attribue, which couldn't conveniently strip them.
        attValue.remove(0,1);                    // one char from the beginning
        attValue.truncate(attValue.length()-1);  // and one from the end.
        
        // XML Attribue value normalization. 
        // This is one of the really screwy parts of the XML spec.
        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
        // Note that non-validating parsers must treat all entities as type CDATA
        //   which simplifies things some.

// Att normalization step 1:  normalize any newlines in the attribute value
        mNewLineNormalizer.reset(attValue);
        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);

// Next change all xml white space chars to plain \u0020 spaces.
        mAttrNormalizer.reset(attValue);
        UnicodeString oneSpace((UChar)0x0020);
        attValue = mAttrNormalizer.replaceAll(oneSpace, status);

// Replace character entities.
        replaceCharRefs(attValue, status);

// Save the attribute name and value in our document structure.
        el->fAttNames.addElement((void *)intern(attName, status), status);
        el->fAttValues.addElement(attValue.clone(), status);
        pos = mAttrValue.end(2, status);
    }
    fPos = mEl.end(0, status);
    return el;
}

//
//  parseMisc
//     Consume XML "Misc" [production #27]
//        which is any combination of space, PI and comments
//      Need to watch end-of-input because xml MISC stuff is allowed after
//        the document element, so we WILL scan off the end in this function
//
void
UXMLParser::parseMisc(UErrorCode &status)  {
    for (;;) {
        if (fPos >= mXMLPI.input().length()) {
            break;
        }
        if (mXMLPI.lookingAt(fPos, status)) {
            fPos = mXMLPI.end(status);
            continue;
        }
        if (mXMLSP.lookingAt(fPos, status)) {
            fPos = mXMLSP.end(status);
            continue;
        }
        if (mXMLComment.lookingAt(fPos, status)) {
            fPos = mXMLComment.end(status);
            continue;
        }
        break;
    }
}

//
//  Scan for document content.
//
UnicodeString
UXMLParser::scanContent(UErrorCode &status) {
    UnicodeString  result;
    if (mXMLCharData.lookingAt(fPos, status)) {
        result = mXMLCharData.group(0, status);
        // Normalize the new-lines.  (Before char ref substitution)
        mNewLineNormalizer.reset(result);
        result = mNewLineNormalizer.replaceAll(fOneLF, status);
        
        // TODO:  handle CDATA
        fPos = mXMLCharData.end(0, status);
    }

return result;
}

//
//   replaceCharRefs
//
//      replace the char entities <  & { ካ etc. in a string
//       with the corresponding actual character.
//
void
UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
    UnicodeString result;
    UnicodeString replacement;
    int     i;

mAmps.reset(s);
    // See the initialization for the regex matcher mAmps.
    //    Which entity we've matched is determined by which capture group has content,
    //      which is flaged by start() of that group not being -1.
    while (mAmps.find()) {
        if (mAmps.start(1, status) != -1) {
            replacement.setTo((UChar)x_AMP);
        } else if (mAmps.start(2, status) != -1) {
            replacement.setTo((UChar)x_LT);
        } else if (mAmps.start(3, status) != -1) {
            replacement.setTo((UChar)x_GT);
        } else if (mAmps.start(4, status) != -1) {
            replacement.setTo((UChar)x_APOS);
        } else if (mAmps.start(5, status) != -1) {
            replacement.setTo((UChar)x_QUOT);
        } else if (mAmps.start(6, status) != -1) {
            UnicodeString hexString = mAmps.group(6, status);
            UChar32 val = 0;
            for (i=0; i<hexString.length(); i++) {
                val = (val << 4) + u_digit(hexString.charAt(i), 16);
            }
            // TODO:  some verification that the character is valid
            replacement.setTo(val);
        } else if (mAmps.start(7, status) != -1) {
            UnicodeString decimalString = mAmps.group(7, status);
            UChar32 val = 0;
            for (i=0; i<decimalString.length(); i++) {
                val = val*10 + u_digit(decimalString.charAt(i), 10);
            }
            // TODO:  some verification that the character is valid
            replacement.setTo(val);
        } else {
            // An unrecognized &entity;  Leave it alone.
            //  TODO:  check that it really looks like an entity, and is not some
            //         random & in the text.
            replacement = mAmps.group(0, status);
        }
        mAmps.appendReplacement(result, replacement, status);
    }
    mAmps.appendTail(result);
    s = result;
}

void
UXMLParser::error(const char *message, UErrorCode &status) {
    // TODO:  something better here...
    const UnicodeString &src=mXMLDecl.input();
    int  line = 0;
    int  ci = 0;
    while (ci < fPos && ci>=0) {
        ci = src.indexOf((UChar)0x0a, ci+1);
        line++;
    }
    fprintf(stderr, "Error: %s at line %d\n", message, line);
    if (U_SUCCESS(status)) {
        status = U_PARSE_ERROR;
    }
}

// intern strings like in Java

const UnicodeString *
UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
    const UHashElement *he=fNames.find(s);
    if(he!=NULL) {
        // already a known name, return its hashed key pointer
        return (const UnicodeString *)he->key.pointer;
    } else {
        // add this new name and return its hashed key pointer
        fNames.puti(s, 0, errorCode);
        he=fNames.find(s);
        return (const UnicodeString *)he->key.pointer;
    }
}

const UnicodeString *
UXMLParser::findName(const UnicodeString &s) const {
    const UHashElement *he=fNames.find(s);
    if(he!=NULL) {
        // a known name, return its hashed key pointer
        return (const UnicodeString *)he->key.pointer;
    } else {
        // unknown name
        return NULL;
    }
}

// UXMLElement ------------------------------------------------------------- ***

UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}

UXMLElement::~UXMLElement() {
    int   i;
    // attribute names are owned by the UXMLParser, don't delete them here
    for (i=fAttValues.size()-1; i>=0; i--) {
        delete (UObject *)fAttValues.elementAt(i);
    }
    for (i=fChildren.size()-1; i>=0; i--) {
        delete (UObject *)fChildren.elementAt(i);
    }
}

const UnicodeString &
UXMLElement::getTagName() const {
    return *fName;
}

UnicodeString
UXMLElement::getText(UBool recurse) const {
    UnicodeString text;
    appendText(text, recurse);
    return text;
}

void
UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
    const UObject *node;
    int32_t i, count=fChildren.size();
    for(i=0; i<count; ++i) {
        node=(const UObject *)fChildren.elementAt(i);
        if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
            text.append(*(const UnicodeString *)node);
        } else if(recurse) /* must be a UXMLElement */ {
            ((const UXMLElement *)node)->appendText(text, recurse);
        }
    }
}

int32_t
UXMLElement::countAttributes() const {
    return fAttNames.size();
}

const UnicodeString *
UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
    if(0<=i && i<fAttNames.size()) {
        name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
        value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
    } else {
        return NULL;
    }
}

const UnicodeString *
UXMLElement::getAttribute(const UnicodeString &name) const {
    // search for the attribute name by comparing the interned pointer,
    // not the string contents
    const UnicodeString *p=fParser->findName(name);
    if(p==NULL) {
        return NULL; // no such attribute seen by the parser at all
    }

int32_t i, count=fAttNames.size();
    for(i=0; i<count; ++i) {
        if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
            return (const UnicodeString *)fAttValues.elementAt(i);
        }
    }
    return NULL;
}

int32_t
UXMLElement::countChildren() const {
    return fChildren.size();
}

const UObject *
UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
    if(0<=i && i<fChildren.size()) {
        const UObject *node=(const UObject *)fChildren.elementAt(i);
        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            type=UXML_NODE_TYPE_ELEMENT;
        } else {
            type=UXML_NODE_TYPE_STRING;
        }
        return node;
    } else {
        return NULL;
    }
}

const UXMLElement *
UXMLElement::nextChildElement(int32_t &i) const {
    if(i<0) {
        return NULL;
    }

const UObject *node;
    int32_t count=fChildren.size();
    while(i<count) {
        node=(const UObject *)fChildren.elementAt(i++);
        // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
        // if(node instanceof UXMLElement) {
        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            return (const UXMLElement *)node;
        }
    }
    return NULL;
}

const UXMLElement *
UXMLElement::getChildElement(const UnicodeString &name) const {
    // search for the element name by comparing the interned pointer,
    // not the string contents
    const UnicodeString *p=fParser->findName(name);
    if(p==NULL) {
        return NULL; // no such element seen by the parser at all
    }

const UObject *node;
    int32_t i, count=fChildren.size();
    for(i=0; i<count; ++i) {
        node=(const UObject *)fChildren.elementAt(i);
        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            const UXMLElement *elem=(const UXMLElement *)node;
            if(p==elem->fName) {
                return elem;
            }
        }
    }
    return NULL;
}

U_NAMESPACE_END

#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

C++程序 | 826行 | 27.75 KB

/*
*******************************************************************************
*
*   Copyright (C) 2004-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  xmlparser.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004jul21
*   created by: Andy Heninger
*/

#include <stdio.h>
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/regex.h"
#include "filestrm.h"
#include "xmlparser.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

// character constants
enum {
    x_QUOT=0x22,
    x_AMP=0x26,
    x_APOS=0x27,
    x_LT=0x3c,
    x_GT=0x3e,
    x_l=0x6c
};

#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)

//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
//
UXMLParser::UXMLParser(UErrorCode &status) :
      //  XML Declaration.  XML Production #23.
      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
      //            allow for a possible leading BOM.
      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
      
      //  XML Comment   production #15
      //     example:  "<!-- whatever -->
      //       note, does not detect an illegal "--" within comments
      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
      
      //  XML Spaces
      //      production [3]
      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
      
      //  XML Doctype decl  production #28
      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
      //       or      "<!DOCTYPE foo [internal dtd]>
      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
      //           Some internal dtd subsets could confuse this simple-minded
      //           attempt at skipping over them, specifically, occcurences
      //           of closeing square brackets.  These could appear in comments, 
      //           or in parameter entity declarations, for example.
      mXMLDoctype(UnicodeString(
           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
           ), 0, status),
      
      //  XML PI     production #16
      //     example   "<?target stuff?>
      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
      
      //  XML Element Start   Productions #40, #41
      //          example   <foo att1='abc'  att2="d e f" >
      //      capture #1:  the tag name
      //
      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:" 
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
      
      //  XML Element End     production #42
      //     example   </foo>
      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
      
      // XML Element Empty    production #44
      //     example   <foo att1="abc"   att2="d e f" />
      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:" 
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
      

      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

      // Attribute name = "value".  XML Productions 10, 40/41
      //  Capture group 1 is name, 
      //                2 is the attribute value, including the quotes.
      //
      //   Note that attributes are scanned twice.  The first time is with
      //        the regex for an entire element start.  There, the attributes
      //        are checked syntactically, but not separted out one by one.
      //        Here, we match a single attribute, and make its name and
      //        attribute value available to the parser code.
      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),


      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

      // Match any of the new-line sequences in content.
      //   All are changed to \u000a.
      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

      // & char references
      //   We will figure out what we've got based on which capture group has content.
      //   The last one is a catchall for unrecognized entity references..
      //             1     2     3      4      5           6                    7          8
      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                0, status),

      fNames(status),
      fElementStack(status),
      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
      {
      }

UXMLParser *
UXMLParser::createParser(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    } else {
        return new UXMLParser(errorCode);
    }
}

UXMLParser::~UXMLParser() {}

UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    UConverter *cnv;
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

    if(U_FAILURE(errorCode)) {
        return NULL;
    }

    f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

    /*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charser"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

        buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
        ucnv_close(cnv);
        cnv=NULL;
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

        // parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf((UChar)x_l)+1;

            mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

                // Trim the quotes from the att value.  These are left over from the original regex
                //   that parsed the attribue, which couldn't conveniently strip them.
                attValue.remove(0,1);                    // one char from the beginning
                attValue.truncate(attValue.length()-1);  // and one from the end.

                if(attName==UNICODE_STRING("encoding", 8)) {
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }

            if(charset==NULL) {
                // default to UTF-8
                charset="UTF-8";
            }
            cnv=ucnv_open(charset, &errorCode);
        }
    }

    if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

    // convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

            pu=buffer+length;
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, FALSE, &errorCode);
            src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                break;
            }
        }

        if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

        if(flush) {
            break; // completely converted the file
        }

        // read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    };

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

    if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}

UXMLElement *
UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return NULL;
    }

    UXMLElement   *root = NULL;
    fPos = 0; // TODO use just a local pos variable and pass it into functions
              // where necessary?

    // set all matchers to work on the input string
    mXMLDecl.reset(src);
    mXMLComment.reset(src);
    mXMLSP.reset(src);
    mXMLDoctype.reset(src);
    mXMLPI.reset(src);
    mXMLElemStart.reset(src);
    mXMLElemEnd.reset(src);
    mXMLElemEmpty.reset(src);
    mXMLCharData.reset(src);
    mAttrValue.reset(src);
    mAttrNormalizer.reset(src);
    mNewLineNormalizer.reset(src);
    mAmps.reset(src);

    // Consume the XML Declaration, if present.
    if (mXMLDecl.lookingAt(fPos, status)) {
        fPos = mXMLDecl.end(status);
    }

    // Consume "misc" [XML production 27] appearing before DocType
    parseMisc(status);

    // Consume a DocType declaration, if present.
    if (mXMLDoctype.lookingAt(fPos, status)) {
        fPos = mXMLDoctype.end(status);
    }

    // Consume additional "misc" [XML production 27] appearing after the DocType
    parseMisc(status);

    // Get the root element
    if (mXMLElemEmpty.lookingAt(fPos, status)) {
        // Root is an empty element (no nested elements or content)
        root = createElement(mXMLElemEmpty, status);
        fPos = mXMLElemEmpty.end(status);
    } else {
        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
            error("Root Element expected", status);
            goto errorExit;
        }
        root = createElement(mXMLElemStart, status);
        UXMLElement  *el = root;

        //
        // This is the loop that consumes the root element of the document,
        //      including all nested content.   Nested elements are handled by
        //      explicit pushes/pops of the element stack; there is no recursion
        //      in the control flow of this code.
        //      "el" always refers to the current element, the one to which content
        //      is being added.  It is above the top of the element stack.
        for (;;) {
            // Nested Element Start
            if (mXMLElemStart.lookingAt(fPos, status)) {
                UXMLElement *t = createElement(mXMLElemStart, status);
                el->fChildren.addElement(t, status);
                t->fParent = el;
                fElementStack.push(el, status);
                el = t;
                continue;
            }

            // Text Content.  String is concatenated onto the current node's content,
            //                but only if it contains something other than spaces.
            UnicodeString s = scanContent(status);
            if (s.length() > 0) {
                mXMLSP.reset(s);
                if (mXMLSP.matches(status) == FALSE) {
                    // This chunk of text contains something other than just
                    //  white space. Make a child node for it.
                    replaceCharRefs(s, status);
                    el->fChildren.addElement(s.clone(), status);
                }
                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
                continue;
            }

            // Comments.  Discard.
            if (mXMLComment.lookingAt(fPos, status)) {
                fPos = mXMLComment.end(status);
                continue;
            }

            // PIs.  Discard.
            if (mXMLPI.lookingAt(fPos, status)) {
                fPos = mXMLPI.end(status);
                continue;
            }

            // Element End
            if (mXMLElemEnd.lookingAt(fPos, status)) {
                fPos = mXMLElemEnd.end(0, status);
                const UnicodeString name = mXMLElemEnd.group(1, status);
                if (name != *el->fName) {
                    error("Element start / end tag mismatch", status);
                    goto errorExit;
                }
                if (fElementStack.empty()) {
                    // Close of the root element.  We're done with the doc.
                    el = NULL;
                    break;
                }
                el = (UXMLElement *)fElementStack.pop();
                continue;
            }

            // Empty Element.  Stored as a child of the current element, but not stacked.
            if (mXMLElemEmpty.lookingAt(fPos, status)) {
                UXMLElement *t = createElement(mXMLElemEmpty, status);
                el->fChildren.addElement(t, status);
                continue;
            }

            // Hit something within the document that doesn't match anything.
            //   It's an error.
            error("Unrecognized markup", status);
            break;
        }

        if (el != NULL || !fElementStack.empty()) {
            // We bailed out early, for some reason.
            error("Root element not closed.", status);
            goto errorExit;
        }
    }

    // Root Element parse is complete.
    // Consume the annoying xml "Misc" that can appear at the end of the doc.
    parseMisc(status);

    // We should have reached the end of the input
    if (fPos != src.length()) {
        error("Extra content at the end of the document", status);
        goto errorExit;
    }

    // Success!
    return root;

errorExit:
    delete root;
    return NULL;
}

//
//  createElement
//      We've just matched an element start tag.  Create and fill in a UXMLElement object
//      for it.
//
UXMLElement *
UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
    // First capture group is the element's name.
    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);

    // Scan for attributes.
    int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name

    while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
        UnicodeString attName  = mAttrValue.group(1, status);
        UnicodeString attValue = mAttrValue.group(2, status);

        // Trim the quotes from the att value.  These are left over from the original regex
        //   that parsed the attribue, which couldn't conveniently strip them.
        attValue.remove(0,1);                    // one char from the beginning
        attValue.truncate(attValue.length()-1);  // and one from the end.
        
        // XML Attribue value normalization. 
        // This is one of the really screwy parts of the XML spec.
        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
        // Note that non-validating parsers must treat all entities as type CDATA
        //   which simplifies things some.

        // Att normalization step 1:  normalize any newlines in the attribute value
        mNewLineNormalizer.reset(attValue);
        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);

        // Next change all xml white space chars to plain \u0020 spaces.
        mAttrNormalizer.reset(attValue);
        UnicodeString oneSpace((UChar)0x0020);
        attValue = mAttrNormalizer.replaceAll(oneSpace, status);

        // Replace character entities.
        replaceCharRefs(attValue, status);

        // Save the attribute name and value in our document structure.
        el->fAttNames.addElement((void *)intern(attName, status), status);
        el->fAttValues.addElement(attValue.clone(), status);
        pos = mAttrValue.end(2, status);
    }
    fPos = mEl.end(0, status);
    return el;
}

//
//  parseMisc
//     Consume XML "Misc" [production #27]
//        which is any combination of space, PI and comments
//      Need to watch end-of-input because xml MISC stuff is allowed after
//        the document element, so we WILL scan off the end in this function
//
void
UXMLParser::parseMisc(UErrorCode &status)  {
    for (;;) {
        if (fPos >= mXMLPI.input().length()) {
            break;
        }
        if (mXMLPI.lookingAt(fPos, status)) {
            fPos = mXMLPI.end(status);
            continue;
        }
        if (mXMLSP.lookingAt(fPos, status)) {
            fPos = mXMLSP.end(status);
            continue;
        }
        if (mXMLComment.lookingAt(fPos, status)) {
            fPos = mXMLComment.end(status);
            continue;
        }
        break;
    }
}

//
//  Scan for document content.
//
UnicodeString
UXMLParser::scanContent(UErrorCode &status) {
    UnicodeString  result;
    if (mXMLCharData.lookingAt(fPos, status)) {
        result = mXMLCharData.group(0, status);
        // Normalize the new-lines.  (Before char ref substitution)
        mNewLineNormalizer.reset(result);
        result = mNewLineNormalizer.replaceAll(fOneLF, status);
        
        // TODO:  handle CDATA
        fPos = mXMLCharData.end(0, status);
    }

    return result;
}

//
//   replaceCharRefs
//
//      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
//       with the corresponding actual character.
//
void
UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
    UnicodeString result;
    UnicodeString replacement;
    int     i;

    mAmps.reset(s);
    // See the initialization for the regex matcher mAmps.
    //    Which entity we've matched is determined by which capture group has content,
    //      which is flaged by start() of that group not being -1.
    while (mAmps.find()) {
        if (mAmps.start(1, status) != -1) {
            replacement.setTo((UChar)x_AMP);
        } else if (mAmps.start(2, status) != -1) {
            replacement.setTo((UChar)x_LT);
        } else if (mAmps.start(3, status) != -1) {
            replacement.setTo((UChar)x_GT);
        } else if (mAmps.start(4, status) != -1) {
            replacement.setTo((UChar)x_APOS);
        } else if (mAmps.start(5, status) != -1) {
            replacement.setTo((UChar)x_QUOT);
        } else if (mAmps.start(6, status) != -1) {
            UnicodeString hexString = mAmps.group(6, status);
            UChar32 val = 0;
            for (i=0; i<hexString.length(); i++) {
                val = (val << 4) + u_digit(hexString.charAt(i), 16);
            }
            // TODO:  some verification that the character is valid
            replacement.setTo(val);
        } else if (mAmps.start(7, status) != -1) {
            UnicodeString decimalString = mAmps.group(7, status);
            UChar32 val = 0;
            for (i=0; i<decimalString.length(); i++) {
                val = val*10 + u_digit(decimalString.charAt(i), 10);
            }
            // TODO:  some verification that the character is valid
            replacement.setTo(val);
        } else {
            // An unrecognized &entity;  Leave it alone.
            //  TODO:  check that it really looks like an entity, and is not some
            //         random & in the text.
            replacement = mAmps.group(0, status);
        }
        mAmps.appendReplacement(result, replacement, status);
    }
    mAmps.appendTail(result);
    s = result;
}

void
UXMLParser::error(const char *message, UErrorCode &status) {
    // TODO:  something better here...
    const UnicodeString &src=mXMLDecl.input();
    int  line = 0;
    int  ci = 0;
    while (ci < fPos && ci>=0) {
        ci = src.indexOf((UChar)0x0a, ci+1);
        line++;
    }
    fprintf(stderr, "Error: %s at line %d\n", message, line);
    if (U_SUCCESS(status)) {
        status = U_PARSE_ERROR;
    }
}

// intern strings like in Java

const UnicodeString *
UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
    const UHashElement *he=fNames.find(s);
    if(he!=NULL) {
        // already a known name, return its hashed key pointer
        return (const UnicodeString *)he->key.pointer;
    } else {
        // add this new name and return its hashed key pointer
        fNames.puti(s, 0, errorCode);
        he=fNames.find(s);
        return (const UnicodeString *)he->key.pointer;
    }
}

const UnicodeString *
UXMLParser::findName(const UnicodeString &s) const {
    const UHashElement *he=fNames.find(s);
    if(he!=NULL) {
        // a known name, return its hashed key pointer
        return (const UnicodeString *)he->key.pointer;
    } else {
        // unknown name
        return NULL;
    }
}

// UXMLElement ------------------------------------------------------------- ***

UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}

UXMLElement::~UXMLElement() {
    int   i;
    // attribute names are owned by the UXMLParser, don't delete them here
    for (i=fAttValues.size()-1; i>=0; i--) {
        delete (UObject *)fAttValues.elementAt(i);
    }
    for (i=fChildren.size()-1; i>=0; i--) {
        delete (UObject *)fChildren.elementAt(i);
    }
}

const UnicodeString &
UXMLElement::getTagName() const {
    return *fName;
}

UnicodeString
UXMLElement::getText(UBool recurse) const {
    UnicodeString text;
    appendText(text, recurse);
    return text;
}

void
UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
    const UObject *node;
    int32_t i, count=fChildren.size();
    for(i=0; i<count; ++i) {
        node=(const UObject *)fChildren.elementAt(i);
        if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
            text.append(*(const UnicodeString *)node);
        } else if(recurse) /* must be a UXMLElement */ {
            ((const UXMLElement *)node)->appendText(text, recurse);
        }
    }
}

int32_t
UXMLElement::countAttributes() const {
    return fAttNames.size();
}

const UnicodeString *
UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
    if(0<=i && i<fAttNames.size()) {
        name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
        value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
    } else {
        return NULL;
    }
}

const UnicodeString *
UXMLElement::getAttribute(const UnicodeString &name) const {
    // search for the attribute name by comparing the interned pointer,
    // not the string contents
    const UnicodeString *p=fParser->findName(name);
    if(p==NULL) {
        return NULL; // no such attribute seen by the parser at all
    }

    int32_t i, count=fAttNames.size();
    for(i=0; i<count; ++i) {
        if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
            return (const UnicodeString *)fAttValues.elementAt(i);
        }
    }
    return NULL;
}

int32_t
UXMLElement::countChildren() const {
    return fChildren.size();
}

const UObject *
UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
    if(0<=i && i<fChildren.size()) {
        const UObject *node=(const UObject *)fChildren.elementAt(i);
        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            type=UXML_NODE_TYPE_ELEMENT;
        } else {
            type=UXML_NODE_TYPE_STRING;
        }
        return node;
    } else {
        return NULL;
    }
}

const UXMLElement *
UXMLElement::nextChildElement(int32_t &i) const {
    if(i<0) {
        return NULL;
    }

    const UObject *node;
    int32_t count=fChildren.size();
    while(i<count) {
        node=(const UObject *)fChildren.elementAt(i++);
        // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
        // if(node instanceof UXMLElement) {
        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            return (const UXMLElement *)node;
        }
    }
    return NULL;
}

const UXMLElement *
UXMLElement::getChildElement(const UnicodeString &name) const {
    // search for the element name by comparing the interned pointer,
    // not the string contents
    const UnicodeString *p=fParser->findName(name);
    if(p==NULL) {
        return NULL; // no such element seen by the parser at all
    }

    const UObject *node;
    int32_t i, count=fChildren.size();
    for(i=0; i<count; ++i) {
        node=(const UObject *)fChildren.elementAt(i);
        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            const UXMLElement *elem=(const UXMLElement *)node;
            if(p==elem->fName) {
                return elem;
            }
        }
    }
    return NULL;
}

U_NAMESPACE_END

#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

登录后可以享受更多权益