/*-------------------------------------------------------------------------
 * drawElements Quality Program Test Executor
 * ------------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief XML Parser.
 *//*--------------------------------------------------------------------*/

#include "xeXMLParser.hpp"
#include "deInt32.h"

namespace xe
{
namespace xml
{

enum
{
	//! Capacity the tokenizer buffer is constructed with.
	TOKENIZER_INITIAL_BUFFER_SIZE = 1024
};

//! True for ASCII letters, the only characters that may start an identifier.
static inline bool isIdentifierStartChar (int ch)
{
	return de::inRange<int>(ch, 'a', 'z') || de::inRange<int>(ch, 'A', 'Z');
}

//! True for characters allowed inside an identifier: letters, digits, '-' and '_'.
static inline bool isIdentifierChar (int ch)
{
	return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
}

//! True for space, tab, carriage return and line feed.
static inline bool isWhitespaceChar (int ch)
{
	return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}

/*--------------------------------------------------------------------*//*!
 * \brief Choose new buffer capacity.
 *
 * Returns the larger of twice the current size and
 * 1<<deLog2Ceil32(minNewSize).
 *
 * \note deLog2Ceil32() is declared outside this file; presumably the
 *       second term is minNewSize rounded up to a power of two.
 *//*--------------------------------------------------------------------*/
static int getNextBufferSize (int curSize, int minNewSize)
{
	return de::max(curSize*2, 1<<deLog2Ceil32(minNewSize));
}

//! Construct tokenizer in data state with no token parsed and an empty buffer.
Tokenizer::Tokenizer (void)
	: m_curToken	(TOKEN_INCOMPLETE)
	, m_curTokenLen	(0)
	, m_state		(STATE_DATA)
	, m_buf			(TOKENIZER_INITIAL_BUFFER_SIZE)
{
}

Tokenizer::~Tokenizer (void)
{
}

//! Reset tokenizer to its initial state and drop all buffered input.
void Tokenizer::clear (void)
{
	m_curToken		= TOKEN_INCOMPLETE;
	m_curTokenLen	= 0;
	m_state			= STATE_DATA;
	m_buf.clear();
}

//! Report tokenization failure by throwing ParseError.
void Tokenizer::error (const std::string& what)
{
	throw ParseError(what);
}

/*--------------------------------------------------------------------*//*!
 * \brief Append input bytes to the tokenizer.
 *
 * The buffer is grown first if it does not have room for numBytes more
 * elements. If the current token is still incomplete, tokenization is
 * re-attempted with the new data.
 *
 * \param bytes		Input data.
 * \param numBytes	Number of bytes in input.
 *//*--------------------------------------------------------------------*/
void Tokenizer::feed (const deUint8* bytes, int numBytes)
{
	// Grow buffer if necessary.
	if (m_buf.getNumFree() < numBytes)
	{
		m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements()+numBytes));
	}

	// Append to front.
	m_buf.pushFront(bytes, numBytes);

	// If we haven't parsed complete token, re-try after data feed.
	if (m_curToken == TOKEN_INCOMPLETE)
		advance();
}

/*--------------------------------------------------------------------*//*!
 * \brief Peek buffered character.
 *
 * \param offset	Offset from the back of the buffer, i.e. from the first
 *					byte that has not been consumed yet. Must be in range
 *					[0, number of buffered elements].
 * \return Character at offset, or END_OF_BUFFER if offset is one past
 *		   the last buffered element.
 *//*--------------------------------------------------------------------*/
int Tokenizer::getChar (int offset) const
{
	DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));

	if (offset < m_buf.getNumElements())
		return m_buf.peekBack(offset);
	else
		return END_OF_BUFFER;
}

/*--------------------------------------------------------------------*//*!
 * \brief Advance to the next token.
 *
 * If a complete token is current, it is first consumed from the buffer.
 * Tokenization then proceeds one character at a time until a complete
 * token has been recognized or buffered data runs out, in which case the
 * current token stays TOKEN_INCOMPLETE and parsing resumes on next feed().
 *
 * Throws ParseError (via error()) on malformed input.
 *//*--------------------------------------------------------------------*/
void Tokenizer::advance (void)
{
	if (m_curToken != TOKEN_INCOMPLETE)
	{
		// Parser should not try to advance beyond end of string.
		DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);

		// If current token is tag end, change state to data.
		if (m_curToken == TOKEN_TAG_END						||
			m_curToken == TOKEN_EMPTY_ELEMENT_END			||
			m_curToken == TOKEN_PROCESSING_INSTRUCTION_END	||
			m_curToken == TOKEN_COMMENT						||
			m_curToken == TOKEN_ENTITY)
			m_state = STATE_DATA;

		// Advance buffer by length of last token.
		m_buf.popBack(m_curTokenLen);

		// Reset state.
		m_curToken		= TOKEN_INCOMPLETE;
		m_curTokenLen	= 0;

		// If we hit end of string here, report it as end of string.
		if (getChar(0) == END_OF_STRING)
		{
			m_curToken		= TOKEN_END_OF_STRING;
			m_curTokenLen	= 1;
			return;
		}
	}

	// Resume from where the previous (incomplete) attempt stopped.
	int curChar = getChar(m_curTokenLen);

	for (;;)
	{
		if (m_state == STATE_DATA)
		{
			// Advance until we hit end of buffer or tag start and treat that as data token.
			if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
			{
				if (curChar == '<')
					m_state = STATE_TAG;
				else if (curChar == '&')
					m_state = STATE_ENTITY;

				if (m_curTokenLen > 0)
				{
					// Report data token.
					m_curToken = TOKEN_DATA;
					return;
				}
				else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
				{
					// Just return incomplete token, no data parsed.
					return;
				}
				else
				{
					// No data preceded the '<' or '&': re-process the same
					// character in the newly selected state.
					DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
					continue;
				}
			}
		}
		else
		{
			// Eat all whitespace if present.
			if (m_curTokenLen == 0)
			{
				while (isWhitespaceChar(curChar))
				{
					m_buf.popBack();
					curChar = getChar(0);
				}
			}

			// Handle end of string / buffer.
			if (curChar == END_OF_STRING)
				error("Unexpected end of string");
			else if (curChar == (int)END_OF_BUFFER)
			{
				DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
				return;
			}

			if (m_curTokenLen == 0)
			{
				// Expect start of identifier, value or special tag token.
				if (curChar == '\'' || curChar == '"')
					m_state = STATE_VALUE;
				else if (isIdentifierStartChar(curChar))
					m_state = STATE_IDENTIFIER;
				else if (curChar == '<' || curChar == '?' || curChar == '/')
					m_state = STATE_TAG;
				else if (curChar == '&')
					DE_ASSERT(m_state == STATE_ENTITY);
				else if (curChar == '=')
				{
					m_curToken		= TOKEN_EQUAL;
					m_curTokenLen	= 1;
					return;
				}
				else if (curChar == '>')
				{
					m_curToken		= TOKEN_TAG_END;
					m_curTokenLen	= 1;
					return;
				}
				else
					error("Unexpected character");
			}
			else if (m_state == STATE_IDENTIFIER)
			{
				// Identifier ends at the first non-identifier character,
				// which is not part of the token.
				if (!isIdentifierChar(curChar))
				{
					m_curToken = TOKEN_IDENTIFIER;
					return;
				}
			}
			else if (m_state == STATE_VALUE)
			{
				// \todo [2012-06-07 pyry] Escapes.
				if (curChar == '\'' || curChar == '"')
				{
					// \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
					// \note As written, any quote character inside the value that differs
					//		 from the opening quote is reported as an error.
					if (curChar != getChar(0))
						error("Mismatched quote");
					m_curToken		 = TOKEN_STRING;
					m_curTokenLen	+= 1;
					return;
				}
			}
			else if (m_state == STATE_COMMENT)
			{
				DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.

				if (m_curTokenLen <= 3)
				{
					// Characters at offsets 2 and 3 must complete the "<!--" opener.
					if (curChar != '-')
						error("Invalid comment start");
				}
				else
				{
					// The length guards keep the dashes of the opener from being
					// interpreted as the start of the "-->" terminator.
					int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen-2) : 0;
					int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen-1) : 0;

					if (prev2 == '-' && prev1 == '-')
					{
						if (curChar != '>')
							error("Invalid comment end");
						m_curToken		 = TOKEN_COMMENT;
						m_curTokenLen	+= 1;
						return;
					}
				}
			}
			else if (m_state == STATE_ENTITY)
			{
				if (m_curTokenLen >= 1)
				{
					if (curChar == ';')
					{
						// Entity token spans from '&' up to and including ';'.
						m_curToken		 = TOKEN_ENTITY;
						m_curTokenLen	+= 1;
						return;
					}
					else if (!de::inRange<int>(curChar, '0', '9')	&&
							 !de::inRange<int>(curChar, 'a', 'z')	&&
							 !de::inRange<int>(curChar, 'A', 'Z'))
						error("Invalid entity");
				}
			}
			else
			{
				// Special tokens are at most 2 characters.
				DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);

				int prevChar = getChar(m_curTokenLen-1);

				if (prevChar == '<')
				{
					// Tag start.
					if (curChar == '/')
					{
						m_curToken		= TOKEN_END_TAG_START;
						m_curTokenLen	= 2;
						return;
					}
					else if (curChar == '?')
					{
						m_curToken		= TOKEN_PROCESSING_INSTRUCTION_START;
						m_curTokenLen	= 2;
						return;
					}
					else if (curChar == '!')
					{
						m_state = STATE_COMMENT;
					}
					else
					{
						// Plain '<': current character belongs to the next token.
						m_curToken		= TOKEN_TAG_START;
						m_curTokenLen	= 1;
						return;
					}
				}
				else if (prevChar == '?')
				{
					if (curChar != '>')
						error("Invalid processing instruction end");
					m_curToken		= TOKEN_PROCESSING_INSTRUCTION_END;
					m_curTokenLen	= 2;
					return;
				}
				else if (prevChar == '/')
				{
					if (curChar != '>')
						error("Invalid empty element end");
					m_curToken		= TOKEN_EMPTY_ELEMENT_END;
					m_curTokenLen	= 2;
					return;
				}
				else
					error("Could not parse special token");
			}
		}

		// Current character accepted as part of the token; move to the next one.
		m_curTokenLen	+= 1;
		curChar			 = getChar(m_curTokenLen);
	}
}

/*--------------------------------------------------------------------*//*!
 * \brief Get contents of current string token without the quotes.
 *
 * Current token must be TOKEN_STRING.
 *
 * \param dst	Receives token text with the first and last character
 *				(the quotes) stripped.
 *//*--------------------------------------------------------------------*/
void Tokenizer::getString (std::string& dst) const
{
	DE_ASSERT(m_curToken == TOKEN_STRING);
	dst.resize(m_curTokenLen-2);
	for (int ndx = 0; ndx < m_curTokenLen-2; ndx++)
		dst[ndx] = m_buf.peekBack(ndx+1);
}

//! Construct parser in data state with no element parsed.
Parser::Parser (void)
	: m_element	(ELEMENT_INCOMPLETE)
	, m_state	(STATE_DATA)
{
}

Parser::~Parser (void)
{
}

//! Reset parser and its tokenizer to initial state.
void Parser::clear (void)
{
	m_tokenizer.clear();
	m_elementName.clear();
	m_attributes.clear();
	m_attribName.clear();
	m_entityValue.clear();

	m_element	= ELEMENT_INCOMPLETE;
	m_state		= STATE_DATA;
}

//! Report parse failure by throwing ParseError.
void Parser::error (const std::string& what)
{
	throw ParseError(what);
}

/*--------------------------------------------------------------------*//*!
 * \brief Feed input bytes to the parser.
 *
 * Data is passed on to the tokenizer. If the current element is still
 * incomplete, parsing is re-attempted with the new data.
 *
 * \param bytes		Input data.
 * \param numBytes	Number of bytes in input.
 *//*--------------------------------------------------------------------*/
void Parser::feed (const deUint8* bytes, int numBytes)
{
	m_tokenizer.feed(bytes, numBytes);

	if (m_element == ELEMENT_INCOMPLETE)
		advance();
}

/*--------------------------------------------------------------------*//*!
 * \brief Advance to the next element.
 *
 * Consumes tokens until a complete element (start, end, data or end of
 * string) has been recognized, or until the tokenizer reports an
 * incomplete token, in which case the current element stays
 * ELEMENT_INCOMPLETE. Comments are skipped.
 *
 * Throws ParseError (via error()) on unexpected tokens, duplicate
 * attributes and unknown entities.
 *//*--------------------------------------------------------------------*/
void Parser::advance (void)
{
	// Attributes belong to the start element that is being left.
	if (m_element == ELEMENT_START)
		m_attributes.clear();

	// \note No token is advanced when element end is reported.
	if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
	{
		DE_ASSERT(m_element == ELEMENT_START);
		m_element	= ELEMENT_END;
		m_state		= STATE_DATA;
		return;
	}

	if (m_element != ELEMENT_INCOMPLETE)
	{
		// Consume the token that completed the previously reported element.
		m_tokenizer.advance();
		m_element = ELEMENT_INCOMPLETE;
	}

	for (;;)
	{
		Token curToken = m_tokenizer.getToken();

		// Skip comments.
		while (curToken == TOKEN_COMMENT)
		{
			m_tokenizer.advance();
			curToken = m_tokenizer.getToken();
		}

		if (curToken == TOKEN_INCOMPLETE)
		{
			DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
			return;
		}

		switch (m_state)
		{
			case STATE_ENTITY:
				m_state = STATE_DATA;
				// Fall-through to STATE_DATA processing.

			case STATE_DATA:
				switch (curToken)
				{
					case TOKEN_DATA:
						m_element = ELEMENT_DATA;
						return;

					case TOKEN_END_OF_STRING:
						m_element = ELEMENT_END_OF_STRING;
						return;

					case TOKEN_TAG_START:
						m_state = STATE_START_TAG_OPEN;
						break;

					case TOKEN_END_TAG_START:
						m_state = STATE_END_TAG_OPEN;
						break;

					case TOKEN_PROCESSING_INSTRUCTION_START:
						m_state = STATE_IN_PROCESSING_INSTRUCTION;
						break;

					case TOKEN_ENTITY:
						// Entity is reported as a data element holding the decoded character.
						m_state		= STATE_ENTITY;
						m_element	= ELEMENT_DATA;
						parseEntityValue();
						return;

					default:
						error("Unexpected token");
				}
				break;

			case STATE_IN_PROCESSING_INSTRUCTION:
				// Processing instruction contents are validated but otherwise ignored.
				if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
					m_state = STATE_DATA;
				else if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
					error("Unexpected token in processing instruction");
				break;

			case STATE_START_TAG_OPEN:
				if (curToken != TOKEN_IDENTIFIER)
					error("Expected identifier");
				m_tokenizer.getTokenStr(m_elementName);
				m_state = STATE_ATTRIBUTE_LIST;
				break;

			case STATE_END_TAG_OPEN:
				if (curToken != TOKEN_IDENTIFIER)
					error("Expected identifier");
				m_tokenizer.getTokenStr(m_elementName);
				m_state = STATE_EXPECTING_END_TAG_CLOSE;
				break;

			case STATE_EXPECTING_END_TAG_CLOSE:
				if (curToken != TOKEN_TAG_END)
					error("Expected tag end");
				m_state		= STATE_DATA;
				m_element	= ELEMENT_END;
				return;

			case STATE_ATTRIBUTE_LIST:
				if (curToken == TOKEN_IDENTIFIER)
				{
					m_tokenizer.getTokenStr(m_attribName);
					m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
				}
				else if (curToken == TOKEN_EMPTY_ELEMENT_END)
				{
					// Report start now; the matching end is yielded by the next advance().
					m_state		= STATE_YIELD_EMPTY_ELEMENT_END;
					m_element	= ELEMENT_START;
					return;
				}
				else if (curToken == TOKEN_TAG_END)
				{
					m_state		= STATE_DATA;
					m_element	= ELEMENT_START;
					return;
				}
				else
					error("Unexpected token");
				break;

			case STATE_EXPECTING_ATTRIBUTE_EQ:
				if (curToken != TOKEN_EQUAL)
					error("Expected '='");
				m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
				break;

			case STATE_EXPECTING_ATTRIBUTE_VALUE:
				if (curToken != TOKEN_STRING)
					error("Expected value");
				if (hasAttribute(m_attribName.c_str()))
					error("Duplicate attribute");

				m_tokenizer.getString(m_attributes[m_attribName]);
				m_state = STATE_ATTRIBUTE_LIST;
				break;

			default:
				DE_ASSERT(false);
		}

		m_tokenizer.advance();
	}
}

/*--------------------------------------------------------------------*//*!
 * \brief Map entity reference to the character it stands for.
 *
 * Only the five predefined XML entities are supported.
 *
 * \param entity	Complete entity reference text, including the leading
 *					'&' and the trailing ';' (the tokenizer's entity
 *					token covers both delimiters).
 * \return Decoded character, or 0 if entity is not recognized.
 *//*--------------------------------------------------------------------*/
static char getEntityValue (const std::string& entity)
{
	static const struct
	{
		const char*		name;
		char			value;
	} s_entities[] =
	{
		{ "&lt;",		'<' },
		{ "&gt;",		'>' },
		{ "&amp;",		'&' },
		{ "&apos;",		'\''},
		{ "&quot;",		'"' },
	};

	for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(s_entities); ndx++)
	{
		if (entity == s_entities[ndx].name)
			return s_entities[ndx].value;
	}

	return 0;
}

/*--------------------------------------------------------------------*//*!
 * \brief Decode current entity token into m_entityValue.
 *
 * Must be called in STATE_ENTITY with TOKEN_ENTITY as the current token.
 * On success m_entityValue holds exactly one character. Throws ParseError
 * (via error()) if the entity is not recognized.
 *//*--------------------------------------------------------------------*/
void Parser::parseEntityValue (void)
{
	DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);

	std::string entity;
	m_tokenizer.getTokenStr(entity);

	const char value = getEntityValue(entity);
	if (value == 0)
		error("Invalid entity '" + entity + "'");

	m_entityValue.resize(1);
	m_entityValue[0] = value;
}

} // xml
} // xe