// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2004-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: xmlparser.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2004jul21
* created by: Andy Heninger
*
* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
* Not suitable for production use. Not supported.
* Not conformant. Not efficient.
* But very small.
*/
#ifndef __XMLPARSER_H__
#define __XMLPARSER_H__
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "uvector.h"
#include "hash.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
enum UXMLNodeType {
/** Node type string (text contents), stored as a UnicodeString. */
UXML_NODE_TYPE_STRING,
/** Node type element, stored as a UXMLElement. */
UXML_NODE_TYPE_ELEMENT,
UXML_NODE_TYPE_COUNT
};
U_NAMESPACE_BEGIN
class UXMLParser;
/**
* This class represents an element node in a parsed XML tree.
*/
class U_TOOLUTIL_API UXMLElement : public UObject {
public:
/**
* Destructor.
*/
virtual ~UXMLElement();
/**
* Get the tag name of this element.
*/
const UnicodeString &getTagName() const;
/**
* Get the text contents of the element.
* Append the contents of all text child nodes.
* @param recurse If TRUE, also recursively appends the contents of all
* text child nodes of element children.
* @return The text contents.
*/
UnicodeString getText(UBool recurse) const;
/**
* Get the number of attributes.
*/
int32_t countAttributes() const;
/**
* Get the i-th attribute.
* @param i Index of the attribute.
* @param name Output parameter, receives the attribute name.
* @param value Output parameter, receives the attribute value.
* @return A pointer to the attribute value (may be &value or a pointer to an
* internal string object), or NULL if i is out of bounds.
*/
const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
/**
* Get the value of the attribute with the given name.
* @param name Attribute name to be looked up.
* @return A pointer to the attribute value, or NULL if this element
* does not have this attribute.
*/
const UnicodeString *getAttribute(const UnicodeString &name) const;
/**
* Get the number of child nodes.
*/
int32_t countChildren() const;
/**
* Get the i-th child node.
* @param i Index of the child node.
* @param type The child node type.
* @return A pointer to the child node object, or NULL if i is out of bounds.
*/
const UObject *getChild(int32_t i, UXMLNodeType &type) const;
/**
* Get the next child element node, skipping non-element child nodes.
* @param i Enumeration index; initialize to 0 before getting the first child element.
* @return A pointer to the next child element, or NULL if there is none.
*/
const UXMLElement *nextChildElement(int32_t &i) const;
/**
* Get the immediate child element with the given name.
* If there are multiple child elements with this name, then return
* the first one.
* @param name Element name to be looked up.
* @return A pointer to the element node, or NULL if this element
* does not have this immediate child element.
*/
const UXMLElement *getChildElement(const UnicodeString &name) const;
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*/
virtual UClassID getDynamicClassID() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/
static UClassID U_EXPORT2 getStaticClassID();
private:
// prevent default construction etc.
UXMLElement();
UXMLElement(const UXMLElement &other);
UXMLElement &operator=(const UXMLElement &other);
void appendText(UnicodeString &text, UBool recurse) const;
friend class UXMLParser;
UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
const UXMLParser *fParser;
const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
UnicodeString fContent; // The text content of this node. All element content is
// concatenated even when there are intervening nested elements
// (which doesn't happen with most xml files we care about)
// Sections of content containing only white space are dropped,
// which gets rid the bogus white space content from
// elements which are primarily containers for nested elements.
UVector fAttNames; // A vector containing the names of this element's attributes
// The names are UnicodeString objects, owned by the UXMLParser.
UVector fAttValues; // A vector containing the attribute values for
// this element's attributes. The order is the same
// as that of the attribute name vector.
UVector fChildren; // The child nodes of this element (a Vector)
UXMLElement *fParent; // A pointer to the parent element of this element.
};
/**
* A simple XML parser; it is neither efficient nor conformant and only useful for
* restricted types of XML documents.
*
* The parse methods parse whole documents and return the parse trees via their
* root elements.
*/
class U_TOOLUTIL_API UXMLParser : public UObject {
public:
/**
* Create an XML parser.
*/
static UXMLParser *createParser(UErrorCode &errorCode);
/**
* Destructor.
*/
virtual ~UXMLParser();
/**
* Parse an XML document, create the entire document tree, and
* return a pointer to the root element of the parsed tree.
* The caller must delete the element.
*/
UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
/**
* Parse an XML file, create the entire document tree, and
* return a pointer to the root element of the parsed tree.
* The caller must delete the element.
*/
UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*/
virtual UClassID getDynamicClassID() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/
static UClassID U_EXPORT2 getStaticClassID();
private:
// prevent default construction etc.
UXMLParser();
UXMLParser(const UXMLParser &other);
UXMLParser &operator=(const UXMLParser &other);
// constructor
UXMLParser(UErrorCode &status);
void parseMisc(UErrorCode &status);
UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
void error(const char *message, UErrorCode &status);
UnicodeString scanContent(UErrorCode &status);
void replaceCharRefs(UnicodeString &s, UErrorCode &status);
const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
public:
// public for UXMLElement only
const UnicodeString *findName(const UnicodeString &s) const;
private:
// There is one ICU regex matcher for each of the major XML syntax items
// that are recognized.
RegexMatcher mXMLDecl;
RegexMatcher mXMLComment;
RegexMatcher mXMLSP;
RegexMatcher mXMLDoctype;
RegexMatcher mXMLPI;
RegexMatcher mXMLElemStart;
RegexMatcher mXMLElemEnd;
RegexMatcher mXMLElemEmpty;
RegexMatcher mXMLCharData;
RegexMatcher mAttrValue;
RegexMatcher mAttrNormalizer;
RegexMatcher mNewLineNormalizer;
RegexMatcher mAmps;
Hashtable fNames; // interned element/attribute name strings
UStack fElementStack; // Stack holds the parent elements when nested
// elements are being parsed. All items on this
// stack are of type UXMLElement.
int32_t fPos; // String index of the current scan position in
// xml source (in fSrc).
UnicodeString fOneLF;
};
U_NAMESPACE_END
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
#endif