// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#include <algorithm>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "core/fxcrt/cfx_utf8decoder.h"
#include "core/fxcrt/cfx_widetextbuf.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/xml/cxml_content.h"
#include "core/fxcrt/xml/cxml_element.h"
#include "core/fxcrt/xml/cxml_parser.h"
#include "third_party/base/ptr_util.h"
#include "third_party/base/stl_util.h"
namespace {
#define FXCRTM_XML_CHARTYPE_Normal 0x00
#define FXCRTM_XML_CHARTYPE_SpaceChar 0x01
#define FXCRTM_XML_CHARTYPE_Letter 0x02
#define FXCRTM_XML_CHARTYPE_Digital 0x04
#define FXCRTM_XML_CHARTYPE_NameIntro 0x08
#define FXCRTM_XML_CHARTYPE_NameChar 0x10
#define FXCRTM_XML_CHARTYPE_HexDigital 0x20
#define FXCRTM_XML_CHARTYPE_HexLowerLetter 0x40
#define FXCRTM_XML_CHARTYPE_HexUpperLetter 0x60
#define FXCRTM_XML_CHARTYPE_HexChar 0x60
const uint8_t g_FXCRT_XML_ByteTypes[256] = {
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x08, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
0x1A, 0x1A, 0x01, 0x01,
};
constexpr int kMaxDepth = 1024;
bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar);
}
bool g_FXCRT_XML_IsDigital(uint8_t ch) {
return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital);
}
bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro);
}
bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar);
}
} // namespace
CXML_Parser::CXML_Parser()
: m_nOffset(0),
m_pBuffer(nullptr),
m_dwBufferSize(0),
m_nBufferOffset(0),
m_dwIndex(0) {}
CXML_Parser::~CXML_Parser() {}
bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
m_pDataAcc = pdfium::MakeUnique<CXML_DataBufAcc>(pBuffer, size);
m_nOffset = 0;
return ReadNextBlock();
}
bool CXML_Parser::ReadNextBlock() {
if (!m_pDataAcc->ReadNextBlock())
return false;
m_pBuffer = m_pDataAcc->GetBlockBuffer();
m_dwBufferSize = m_pDataAcc->GetBlockSize();
m_nBufferOffset = 0;
m_dwIndex = 0;
return m_dwBufferSize > 0;
}
bool CXML_Parser::IsEOF() {
return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
}
void CXML_Parser::SkipWhiteSpaces() {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF())
return;
do {
while (m_dwIndex < m_dwBufferSize &&
g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
m_dwIndex++;
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
}
void CXML_Parser::GetName(ByteString* space, ByteString* name) {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF())
return;
std::ostringstream buf;
do {
while (m_dwIndex < m_dwBufferSize) {
uint8_t ch = m_pBuffer[m_dwIndex];
if (ch == ':') {
*space = ByteString(buf);
buf.str("");
} else if (g_FXCRT_XML_IsNameChar(ch)) {
buf << static_cast<char>(ch);
} else {
break;
}
m_dwIndex++;
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
*name = ByteString(buf);
}
void CXML_Parser::SkipLiterals(const ByteStringView& str) {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF()) {
return;
}
int32_t i = 0, iLen = str.GetLength();
do {
while (m_dwIndex < m_dwBufferSize) {
if (str[i] != m_pBuffer[m_dwIndex++]) {
i = 0;
continue;
}
i++;
if (i == iLen)
break;
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (i == iLen)
return;
if (m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
while (!m_pDataAcc->IsEOF()) {
ReadNextBlock();
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
}
m_dwIndex = m_dwBufferSize;
}
uint32_t CXML_Parser::GetCharRef() {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF())
return 0;
uint8_t ch;
int32_t iState = 0;
std::ostringstream buf;
uint32_t code = 0;
do {
while (m_dwIndex < m_dwBufferSize) {
ch = m_pBuffer[m_dwIndex];
switch (iState) {
case 0:
if (ch == '#') {
m_dwIndex++;
iState = 2;
break;
}
iState = 1;
case 1:
m_dwIndex++;
if (ch == ';') {
std::string ref = buf.str();
if (ref == "gt")
code = '>';
else if (ref == "lt")
code = '<';
else if (ref == "amp")
code = '&';
else if (ref == "apos")
code = '\'';
else if (ref == "quot")
code = '"';
iState = 10;
break;
}
buf << static_cast<char>(ch);
break;
case 2:
if (ch == 'x') {
m_dwIndex++;
iState = 4;
break;
}
iState = 3;
case 3:
m_dwIndex++;
if (ch == ';') {
iState = 10;
break;
}
if (g_FXCRT_XML_IsDigital(ch))
code = code * 10 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
break;
case 4:
m_dwIndex++;
if (ch == ';') {
iState = 10;
break;
}
uint8_t nHex =
g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
if (nHex) {
if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
code = (code << 4) +
FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
} else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
code = (code << 4) + ch - 87;
} else {
code = (code << 4) + ch - 55;
}
}
break;
}
if (iState == 10)
break;
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
break;
}
} while (ReadNextBlock());
return code;
}
WideString CXML_Parser::GetAttrValue() {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF())
return WideString();
CFX_UTF8Decoder decoder;
uint8_t mark = 0;
uint8_t ch = 0;
do {
while (m_dwIndex < m_dwBufferSize) {
ch = m_pBuffer[m_dwIndex];
if (mark == 0) {
if (ch != '\'' && ch != '"')
return WideString();
mark = ch;
m_dwIndex++;
ch = 0;
continue;
}
m_dwIndex++;
if (ch == mark)
break;
if (ch == '&') {
decoder.AppendCodePoint(GetCharRef());
if (IsEOF())
return WideString(decoder.GetResult());
} else {
decoder.Input(ch);
}
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
return WideString(decoder.GetResult());
}
void CXML_Parser::GetTagName(bool bStartTag,
bool* bEndTag,
ByteString* space,
ByteString* name) {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF())
return;
*bEndTag = false;
uint8_t ch;
int32_t iState = bStartTag ? 1 : 0;
do {
while (m_dwIndex < m_dwBufferSize) {
ch = m_pBuffer[m_dwIndex];
switch (iState) {
case 0:
m_dwIndex++;
if (ch != '<')
break;
iState = 1;
break;
case 1:
if (ch == '?') {
m_dwIndex++;
SkipLiterals("?>");
iState = 0;
break;
}
if (ch == '!') {
m_dwIndex++;
SkipLiterals("-->");
iState = 0;
break;
}
if (ch == '/') {
m_dwIndex++;
GetName(space, name);
*bEndTag = true;
} else {
GetName(space, name);
*bEndTag = false;
}
return;
}
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
}
std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
bool bStartTag) {
return ParseElementInternal(pParent, bStartTag, 0);
}
std::unique_ptr<CXML_Element> CXML_Parser::ParseElementInternal(
CXML_Element* pParent,
bool bStartTag,
int nDepth) {
if (nDepth > kMaxDepth)
return nullptr;
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (IsEOF())
return nullptr;
ByteString tag_name;
ByteString tag_space;
bool bEndTag;
GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
if (tag_name.IsEmpty() || bEndTag)
return nullptr;
auto pElement = pdfium::MakeUnique<CXML_Element>(
pParent, tag_space.AsStringView(), tag_name.AsStringView());
do {
ByteString attr_space;
ByteString attr_name;
while (m_dwIndex < m_dwBufferSize) {
SkipWhiteSpaces();
if (IsEOF())
break;
if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
break;
GetName(&attr_space, &attr_name);
SkipWhiteSpaces();
if (IsEOF())
break;
if (m_pBuffer[m_dwIndex] != '=')
break;
m_dwIndex++;
SkipWhiteSpaces();
if (IsEOF())
break;
WideString attr_value = GetAttrValue();
pElement->SetAttribute(attr_space, attr_name, attr_value);
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
SkipWhiteSpaces();
if (IsEOF())
return pElement;
uint8_t ch = m_pBuffer[m_dwIndex++];
if (ch == '/') {
m_dwIndex++;
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
return pElement;
}
if (ch != '>') {
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
return nullptr;
}
SkipWhiteSpaces();
if (IsEOF())
return pElement;
CFX_UTF8Decoder decoder;
CFX_WideTextBuf content;
bool bCDATA = false;
int32_t iState = 0;
do {
while (m_dwIndex < m_dwBufferSize) {
ch = m_pBuffer[m_dwIndex++];
switch (iState) {
case 0:
if (ch == '<') {
iState = 1;
} else if (ch == '&') {
decoder.ClearStatus();
decoder.AppendCodePoint(GetCharRef());
} else {
decoder.Input(ch);
}
break;
case 1:
if (ch == '!') {
iState = 2;
} else if (ch == '?') {
SkipLiterals("?>");
SkipWhiteSpaces();
iState = 0;
} else if (ch == '/') {
ByteString space;
ByteString name;
GetName(&space, &name);
SkipWhiteSpaces();
m_dwIndex++;
iState = 10;
} else {
content << decoder.GetResult();
WideString dataStr = content.MakeString();
if (!bCDATA)
dataStr.TrimRight(L" \t\r\n");
InsertContentSegment(bCDATA, dataStr.AsStringView(),
pElement.get());
content.Clear();
decoder.Clear();
bCDATA = false;
iState = 0;
m_dwIndex--;
std::unique_ptr<CXML_Element> pSubElement =
ParseElementInternal(pElement.get(), true, nDepth + 1);
if (!pSubElement)
break;
pElement->AppendChild(std::move(pSubElement));
SkipWhiteSpaces();
}
break;
case 2:
if (ch == '[') {
SkipLiterals("]]>");
} else if (ch == '-') {
m_dwIndex++;
SkipLiterals("-->");
} else {
SkipLiterals(">");
}
decoder.Clear();
SkipWhiteSpaces();
iState = 0;
break;
}
if (iState == 10) {
break;
}
}
m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
break;
} while (ReadNextBlock());
content << decoder.GetResult();
WideString dataStr = content.MakeString();
dataStr.TrimRight(L" \t\r\n");
InsertContentSegment(bCDATA, dataStr.AsStringView(), pElement.get());
content.Clear();
decoder.Clear();
bCDATA = false;
return pElement;
}
void CXML_Parser::InsertContentSegment(bool bCDATA,
const WideStringView& content,
CXML_Element* pElement) {
if (content.IsEmpty())
return;
pElement->AppendChild(pdfium::MakeUnique<CXML_Content>(bCDATA, content));
}