// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#include "core/fxcrt/xml/cfx_saxreader.h"
#include <algorithm>
#include <utility>
#include "core/fxcrt/fx_stream.h"
#include "core/fxcrt/xml/cfx_saxreaderhandler.h"
#include "third_party/base/ptr_util.h"
#include "third_party/base/stl_util.h"
// Parser states for CFX_SAXReader. CFX_SAXReader::ParseInternal() dispatches
// every input byte to the Parse*() routine that matches the current state.
enum class CFX_SaxMode {
  Text = 0,           // Handled by ParseText().
  NodeStart,          // Handled by ParseNodeStart().
  DeclOrComment,      // Handled by ParseDeclOrComment().
  DeclNode,           // Handled by ParseDeclNode().
  Comment,            // Handled by ParseComment().
  CommentContent,     // Handled by ParseCommentContent().
  TagName,            // Handled by ParseTagName().
  TagAttributeName,   // Handled by ParseTagAttributeName().
  TagAttributeEqual,  // Handled by ParseTagAttributeEqual().
  TagAttributeValue,  // Handled by ParseTagAttributeValue().
  TagMaybeClose,      // Handled by ParseMaybeClose().
  TagClose,           // Handled by ParseTagClose().
  TagEnd,             // Handled by ParseTagEnd().
  TargetData,         // Handled by ParseTargetData().
};
// Bookkeeping used while a comment node is being parsed. Both counters start
// at zero.
class CFX_SAXCommentContext {
 public:
  CFX_SAXCommentContext() = default;

  // Progress through the leading dashes of the comment opener; the parser
  // sets this to 1 and then 2.
  int32_t m_iHeaderCount = 0;
  // Number of consecutive '-' bytes seen that have not yet been copied into
  // the comment data.
  int32_t m_iTailCount = 0;
};
namespace {

// Upper bound, in bytes, on the size of the read buffer used by CFX_SAXFile.
constexpr uint32_t kSaxFileBufSize = 32768;

}  // namespace
// Builds an idle file wrapper: every offset and size is zero and no buffer is
// allocated until StartFile() is called.
CFX_SAXFile::CFX_SAXFile()
    : m_dwStart(0),
      m_dwEnd(0),
      m_dwCur(0),
      m_pBuf(nullptr),
      m_dwBufSize(0),
      m_dwBufIndex(0) {
}
// Releases the read buffer and the stream reference. StartFile() allocates
// |m_pBuf| with FX_Alloc(), so destroying the object without an explicit
// Reset() would otherwise leak it. Reset() nulls |m_pBuf| after freeing it,
// so it is safe even when the owner has already called Reset().
CFX_SAXFile::~CFX_SAXFile() {
  Reset();
}
bool CFX_SAXFile::StartFile(const RetainPtr<IFX_SeekableReadStream>& pFile,
uint32_t dwStart,
uint32_t dwLen) {
ASSERT(!m_pFile && pFile);
uint32_t dwSize = pFile->GetSize();
if (dwStart >= dwSize)
return false;
if (dwLen == static_cast<uint32_t>(-1) || dwStart + dwLen > dwSize)
dwLen = dwSize - dwStart;
if (dwLen == 0)
return false;
m_dwBufSize = std::min(dwLen, kSaxFileBufSize);
m_pBuf = FX_Alloc(uint8_t, m_dwBufSize);
if (!pFile->ReadBlock(m_pBuf, dwStart, m_dwBufSize))
return false;
m_dwStart = dwStart;
m_dwEnd = dwStart + dwLen;
m_dwCur = dwStart;
m_pFile = pFile;
m_dwBufIndex = 0;
return true;
}
// Loads the block that begins at |m_dwCur| into the existing buffer and
// rewinds the buffer index. Returns false when no bytes remain before
// |m_dwEnd| or when the underlying read fails.
bool CFX_SAXFile::ReadNextBlock() {
  ASSERT(m_pFile);

  const uint32_t dwRemaining = m_dwEnd - m_dwCur;
  if (!dwRemaining)
    return false;

  // Never read more than one buffer's worth at a time.
  m_dwBufSize = std::min(dwRemaining, kSaxFileBufSize);
  const bool bRead = m_pFile->ReadBlock(m_pBuf, m_dwCur, m_dwBufSize);
  if (!bRead)
    return false;

  m_dwBufIndex = 0;
  return true;
}
// Frees the read buffer, if one was allocated, and drops the stream reference.
void CFX_SAXFile::Reset() {
  if (m_pBuf)
    FX_Free(m_pBuf);
  m_pBuf = nullptr;
  m_pFile = nullptr;
}
// Creates a reader with no handler attached and a state of -1, for which
// ContinueParse() returns immediately. The data and name buffers reserve
// space up front.
CFX_SAXReader::CFX_SAXReader()
    : m_File(),
      m_pHandler(nullptr),
      m_iState(-1),
      m_dwItemID(0),
      m_dwParseMode(0) {
  constexpr size_t kInitialCapacity = 256;
  m_Data.reserve(kInitialCapacity);
  m_Name.reserve(kInitialCapacity);
}
// Tears the reader down through Reset(), which also resets |m_File| and so
// frees its read buffer.
CFX_SAXReader::~CFX_SAXReader() {
  Reset();
}
// Returns the reader to its idle state: the file is reset, the state is set
// back to -1, both stacks are emptied and every accumulated buffer is cleared.
void CFX_SAXReader::Reset() {
  m_File.Reset();
  m_iState = -1;

  // Replace the stacks with freshly constructed, empty ones.
  m_Stack = decltype(m_Stack)();
  m_dwItemID = 0;
  m_SkipStack = decltype(m_SkipStack)();
  m_SkipChar = 0;

  m_pCommentContext.reset();
  ClearData();
  ClearName();
}
void CFX_SAXReader::Push() {
std::unique_ptr<CFX_SAXItem> pNew =
pdfium::MakeUnique<CFX_SAXItem>(++m_dwItemID);
if (!m_Stack.empty())
pNew->m_bSkip = m_Stack.top()->m_bSkip;
m_Stack.push(std::move(pNew));
}
// Removes the top item from the stack; does nothing when the stack is empty.
void CFX_SAXReader::Pop() {
  if (m_Stack.empty())
    return;
  m_Stack.pop();
}
// Returns the item on top of the stack, or nullptr when the stack is empty.
// The stack keeps ownership of the returned item.
CFX_SAXItem* CFX_SAXReader::GetCurrentItem() const {
  if (m_Stack.empty())
    return nullptr;
  return m_Stack.top().get();
}
// Empties the data buffer and forgets any entity that was in progress, since
// its start index pointed into the buffer being discarded.
void CFX_SAXReader::ClearData() {
  m_iEntityStart = -1;
  m_Data.clear();
}
// Empties the name buffer.
void CFX_SAXReader::ClearName() {
  m_Name.clear();
}
// Appends the byte |ch| to the end of the data buffer.
void CFX_SAXReader::AppendToData(uint8_t ch) {
  m_Data.emplace_back(ch);
}
// Appends the byte |ch| to the end of the name buffer.
void CFX_SAXReader::AppendToName(uint8_t ch) {
  m_Name.emplace_back(ch);
}
// Discards everything in the data buffer from position |index| onward and
// appends |ch| in its place. |index| must be non-negative and must not exceed
// the current buffer size.
void CFX_SAXReader::BackUpAndReplaceDataAt(int32_t index, uint8_t ch) {
  ASSERT(index > -1);
  const auto itCut = m_Data.begin() + index;
  m_Data.erase(itCut, m_Data.end());
  m_Data.push_back(ch);
}
// Returns the index of the last byte in the data buffer, which is -1 when the
// buffer is empty.
int32_t CFX_SAXReader::CurrentDataIndex() const {
  const int32_t nCount = pdfium::CollectionSize<int32_t>(m_Data);
  return nCount - 1;
}
// True when |ch| is '&' and no entity is currently being collected.
bool CFX_SAXReader::IsEntityStart(uint8_t ch) const {
  if (m_iEntityStart != -1)
    return false;
  return ch == '&';
}
// True when |ch| is ';' and an entity is currently being collected.
bool CFX_SAXReader::IsEntityEnd(uint8_t ch) const {
  if (m_iEntityStart == -1)
    return false;
  return ch == ';';
}
// Reports whether |ch| should be dropped as leading whitespace: it must be
// below 0x21 and the parse mode must not have the NotSkipSpace flag set.
bool CFX_SAXReader::SkipSpace(uint8_t ch) {
  if (m_dwParseMode & CFX_SaxParseMode_NotSkipSpace)
    return false;
  return ch < 0x21;
}
// Resets the reader and prepares it to parse |dwLen| bytes of |pFile|
// starting at |dwStart|, using the |dwParseMode| flags. Returns 0 on success
// and -1 when the file range could not be opened.
int32_t CFX_SAXReader::StartParse(
    const RetainPtr<IFX_SeekableReadStream>& pFile,
    uint32_t dwStart,
    uint32_t dwLen,
    uint32_t dwParseMode) {
  Reset();
  const bool bOpened = m_File.StartFile(pFile, dwStart, dwLen);
  if (!bOpened)
    return -1;

  m_dwParseMode = dwParseMode;
  m_dwDataOffset = 0;
  m_bCharData = false;
  m_ePrevMode = CFX_SaxMode::Text;
  m_eMode = CFX_SaxMode::Text;
  m_iState = 0;

  // Seed the stack with the first item. Reset() left the stack empty, so
  // there is no parent whose skip flag would need to be inherited.
  m_Stack.push(pdfium::MakeUnique<CFX_SAXItem>(++m_dwItemID));
  return 0;
}
// Parses the remaining input, one buffered block at a time, and returns the
// resulting state:
//   * a negative value or a value above 99 already stored in |m_iState| is
//     returned unchanged without doing any work;
//   * otherwise the percentage of the range consumed so far (0..100), or -2
//     when reading the next block fails.
int32_t CFX_SAXReader::ContinueParse() {
  if (m_iState < 0 || m_iState > 99)
    return m_iState;

  while (m_File.m_dwCur < m_File.m_dwEnd) {
    // |index| aliases the buffer index on purpose: the Parse*() routines read
    // m_File.m_dwBufIndex to compute positions while a byte is handled.
    uint32_t& index = m_File.m_dwBufIndex;
    const uint32_t size = m_File.m_dwBufSize;
    const uint8_t* pBuf = m_File.m_pBuf;
    while (index < size) {
      m_CurByte = pBuf[index];
      ParseInternal();
      index++;
    }
    m_File.m_dwCur += index;

    // Compute the percentage in 64 bits. In 32 bits |consumed * 100| wraps
    // once roughly 42.9 MB have been consumed, which could leave the final
    // state below 100 even though all input had been parsed.
    const uint64_t consumed = m_File.m_dwCur - m_File.m_dwStart;
    const uint64_t total = m_File.m_dwEnd - m_File.m_dwStart;
    m_iState = static_cast<int32_t>(consumed * 100 / total);

    if (m_File.m_dwCur >= m_File.m_dwEnd)
      break;
    if (!m_File.ReadNextBlock()) {
      m_iState = -2;
      break;
    }
    m_dwDataOffset = 0;
  }
  return m_iState;
}
// Routes the current byte to the handler that belongs to the current parser
// mode.
void CFX_SAXReader::ParseInternal() {
  switch (m_eMode) {
    case CFX_SaxMode::Text:
      return ParseText();
    case CFX_SaxMode::NodeStart:
      return ParseNodeStart();
    case CFX_SaxMode::DeclOrComment:
      return ParseDeclOrComment();
    case CFX_SaxMode::DeclNode:
      return ParseDeclNode();
    case CFX_SaxMode::Comment:
      return ParseComment();
    case CFX_SaxMode::CommentContent:
      return ParseCommentContent();
    case CFX_SaxMode::TagName:
      return ParseTagName();
    case CFX_SaxMode::TagAttributeName:
      return ParseTagAttributeName();
    case CFX_SaxMode::TagAttributeEqual:
      return ParseTagAttributeEqual();
    case CFX_SaxMode::TagAttributeValue:
      return ParseTagAttributeValue();
    case CFX_SaxMode::TagMaybeClose:
      return ParseMaybeClose();
    case CFX_SaxMode::TagClose:
      return ParseTagClose();
    case CFX_SaxMode::TagEnd:
      return ParseTagEnd();
    case CFX_SaxMode::TargetData:
      return ParseTargetData();
  }
}
// Appends |ch| to the data buffer and performs entity substitution.
//
// An '&' marks the start of an entity and the next ';' marks its end. When a
// completed entity is a numeric reference ("#..." or "#x...") or one of
// amp/lt/gt/apos/quot, and the matching NotConvert flag is not set in the
// parse mode, the whole "&...;" run in the buffer is replaced by the single
// decoded byte. Anything else is left in the buffer untouched.
void CFX_SAXReader::ParseChar(uint8_t ch) {
  AppendToData(ch);
  if (IsEntityStart(ch)) {
    m_iEntityStart = CurrentDataIndex();
    return;
  }
  if (!IsEntityEnd(ch))
    return;

  // Whatever happens below, the entity is finished.
  ASSERT(m_iEntityStart > -1);
  const int32_t iAmpPos = m_iEntityStart;
  m_iEntityStart = -1;

  // The entity text sits between the '&' and the ';' that was just appended.
  ByteString csEntity(m_Data.data() + iAmpPos + 1,
                      CurrentDataIndex() - iAmpPos - 1);
  const int32_t iLen = csEntity.GetLength();
  if (iLen == 0)
    return;

  if (csEntity[0] != '#') {
    // Named entities: the name, the flag that suppresses its conversion, and
    // the byte it decodes to.
    struct NamedEntity {
      const char* szName;
      uint32_t dwKeepFlag;
      uint8_t chValue;
    };
    static const NamedEntity kNamedEntities[] = {
        {"amp", CFX_SaxParseMode_NotConvert_amp, '&'},
        {"lt", CFX_SaxParseMode_NotConvert_lt, '<'},
        {"gt", CFX_SaxParseMode_NotConvert_gt, '>'},
        {"apos", CFX_SaxParseMode_NotConvert_apos, '\''},
        {"quot", CFX_SaxParseMode_NotConvert_quot, '\"'},
    };
    for (const NamedEntity& entity : kNamedEntities) {
      if (csEntity == entity.szName) {
        if ((m_dwParseMode & entity.dwKeepFlag) == 0)
          BackUpAndReplaceDataAt(iAmpPos, entity.chValue);
        return;
      }
    }
    return;
  }

  // Numeric character reference.
  if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_sharp) != 0)
    return;

  // The value is accumulated in a single byte, so it wraps modulo 256.
  ch = 0;
  const bool bHex = iLen > 1 && csEntity[1] == 'x';
  if (bHex) {
    for (int32_t i = 2; i < iLen; ++i) {
      const uint8_t digit = csEntity[i];
      if (digit >= '0' && digit <= '9')
        ch = static_cast<uint8_t>((ch << 4) + digit - '0');
      else if (digit >= 'A' && digit <= 'F')
        ch = static_cast<uint8_t>((ch << 4) + digit - 'A' + 10);
      else if (digit >= 'a' && digit <= 'f')
        ch = static_cast<uint8_t>((ch << 4) + digit - 'a' + 10);
      else
        break;
    }
  } else {
    for (int32_t i = 1; i < iLen; ++i) {
      const uint8_t digit = csEntity[i];
      if (digit < '0' || digit > '9')
        break;
      ch = static_cast<uint8_t>(ch * 10 + digit - '0');
    }
  }

  // A decoded value of zero leaves the entity text as it was.
  if (ch != 0)
    BackUpAndReplaceDataAt(iAmpPos, ch);
}
// Text mode: collects character data until a '<' opens the next node.
void CFX_SAXReader::ParseText() {
  if (m_CurByte != '<') {
    // Drop skippable whitespace while no data has been collected yet.
    if (m_Data.empty() && SkipSpace(m_CurByte))
      return;
    ParseChar(m_CurByte);
    return;
  }

  // A node is starting: flush pending text, then open a new item for it.
  if (!m_Data.empty()) {
    NotifyData();
    ClearData();
  }
  Push();
  m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
  m_eMode = CFX_SaxMode::NodeStart;
}
// Handles the byte right after '<' and decides what kind of node follows.
void CFX_SAXReader::ParseNodeStart() {
  switch (m_CurByte) {
    case '?':
      GetCurrentItem()->m_eNode = CFX_SAXItem::Type::Instruction;
      m_eMode = CFX_SaxMode::TagName;
      return;
    case '!':
      m_eMode = CFX_SaxMode::DeclOrComment;
      return;
    case '/':
      m_eMode = CFX_SaxMode::TagEnd;
      return;
    case '>':
      // "<>": drop the item that was pushed for this node.
      Pop();
      m_eMode = CFX_SaxMode::Text;
      return;
    default:
      break;
  }

  // Bytes up to 0x20 are ignored; anything else is the first byte of a tag
  // name.
  if (m_CurByte <= 0x20)
    return;

  m_dwDataOffset = m_File.m_dwBufIndex;
  GetCurrentItem()->m_eNode = CFX_SAXItem::Type::Tag;
  m_eMode = CFX_SaxMode::TagName;
  AppendToData(m_CurByte);
}
// Handles the byte after "<!": a '-' begins a comment, anything else begins
// a declaration node that is consumed by SkipNode().
void CFX_SAXReader::ParseDeclOrComment() {
  if (m_CurByte != '-') {
    m_eMode = CFX_SaxMode::DeclNode;
    m_dwDataOffset = m_File.m_dwBufIndex;
    // The declaration ends at the matching '>'.
    m_SkipChar = '>';
    m_SkipStack.push('>');
    SkipNode();
    return;
  }

  m_eMode = CFX_SaxMode::Comment;
  GetCurrentItem()->m_eNode = CFX_SAXItem::Type::Comment;
  // The comment context is created on first use and reused afterwards.
  if (!m_pCommentContext)
    m_pCommentContext = pdfium::MakeUnique<CFX_SAXCommentContext>();
  m_pCommentContext->m_iHeaderCount = 1;
  m_pCommentContext->m_iTailCount = 0;
}
// Consumes the byte that follows "<!-" without inspecting it, records the
// node position and moves on to the comment content.
void CFX_SAXReader::ParseComment() {
  m_eMode = CFX_SaxMode::CommentContent;
  m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
  m_pCommentContext->m_iHeaderCount = 2;
}
// Collects comment text. Dashes are counted rather than stored right away so
// that a '>' preceded by exactly two of them can be recognised as the end of
// the comment; otherwise the counted dashes are emitted into the data buffer
// ahead of the current byte.
void CFX_SAXReader::ParseCommentContent() {
  int32_t& nPendingDashes = m_pCommentContext->m_iTailCount;

  if (m_CurByte == '-') {
    ++nPendingDashes;
    return;
  }

  if (nPendingDashes == 2 && m_CurByte == '>') {
    NotifyTargetData();
    ClearData();
    Pop();
    m_eMode = CFX_SaxMode::Text;
    return;
  }

  for (; nPendingDashes > 0; --nPendingDashes)
    AppendToData('-');
  AppendToData(m_CurByte);
}
// Declaration nodes are handled entirely by SkipNode().
void CFX_SAXReader::ParseDeclNode() {
  SkipNode();
}
// Accumulates the tag name until a byte below 0x21, '/', '?' or '>' ends it,
// then reports the tag and selects the follow-up mode.
void CFX_SAXReader::ParseTagName() {
  const bool bWhitespace = m_CurByte < 0x21;
  const bool bMaybeClose = m_CurByte == '/' || m_CurByte == '?';
  const bool bTagDone = m_CurByte == '>';

  if (!bWhitespace && !bMaybeClose && !bTagDone) {
    AppendToData(m_CurByte);
    return;
  }

  NotifyEnter();
  ClearData();

  if (bWhitespace) {
    // Attributes may follow.
    ClearName();
    m_eMode = CFX_SaxMode::TagAttributeName;
    return;
  }
  if (bMaybeClose) {
    m_ePrevMode = m_eMode;
    m_eMode = CFX_SaxMode::TagMaybeClose;
    return;
  }
  // '>' ends the opening tag.
  NotifyBreak();
  m_eMode = CFX_SaxMode::Text;
}
// Accumulates an attribute name. A byte below 0x21 or '=' ends the name,
// '/' or '?' may close the tag, and '>' ends the opening tag.
void CFX_SAXReader::ParseTagAttributeName() {
  const bool bWhitespace = m_CurByte < 0x21;

  // Whitespace ahead of the name is ignored.
  if (bWhitespace && m_Name.empty())
    return;

  if (bWhitespace || m_CurByte == '=') {
    m_SkipChar = 0;
    if (m_CurByte == '=')
      m_eMode = CFX_SaxMode::TagAttributeValue;
    else
      m_eMode = CFX_SaxMode::TagAttributeEqual;
    ClearData();
    return;
  }

  switch (m_CurByte) {
    case '/':
    case '?':
      m_ePrevMode = m_eMode;
      m_eMode = CFX_SaxMode::TagMaybeClose;
      return;
    case '>':
      NotifyBreak();
      m_eMode = CFX_SaxMode::Text;
      return;
    default:
      break;
  }

  // Record the buffer index of the first byte of the name.
  if (m_Name.empty())
    m_dwDataOffset = m_File.m_dwBufIndex;
  AppendToName(m_CurByte);
}
// Waits for the '=' that follows an attribute name. Inside an instruction
// item, any other byte switches to target-data collection instead.
void CFX_SAXReader::ParseTagAttributeEqual() {
  if (m_CurByte == '=') {
    m_SkipChar = 0;
    m_eMode = CFX_SaxMode::TagAttributeValue;
    return;
  }

  if (GetCurrentItem()->m_eNode != CFX_SAXItem::Type::Instruction)
    return;

  // Put a space after the name collected so far, then treat the current byte
  // as target data.
  AppendToName(0x20);
  m_eMode = CFX_SaxMode::TargetData;
  ParseTargetData();
}
// Parses a quoted attribute value. |m_SkipChar| holds the opening quote while
// inside the value and is zero before the opening quote has been seen.
void CFX_SAXReader::ParseTagAttributeValue() {
  if (!m_SkipChar) {
    // Still looking for the opening quote; bytes below 0x21 are ignored.
    if (m_CurByte < 0x21)
      return;
    const bool bQuote = m_CurByte == '\'' || m_CurByte == '\"';
    if (m_Data.empty() && bQuote)
      m_SkipChar = m_CurByte;
    return;
  }

  if (m_CurByte != m_SkipChar) {
    ParseChar(m_CurByte);
    return;
  }

  // Closing quote: report the attribute and go back to reading names.
  NotifyAttribute();
  ClearData();
  ClearName();
  m_SkipChar = 0;
  m_eMode = CFX_SaxMode::TagAttributeName;
}
// Handles the byte after a '/' or '?' seen inside a tag. A '>' closes the
// tag. Otherwise the earlier byte was not a closer after all: a '/' (or '?'
// when coming from target data) is put back and the current byte is
// re-parsed in the mode that was active before.
void CFX_SAXReader::ParseMaybeClose() {
  if (m_CurByte == '>') {
    if (GetCurrentItem()->m_eNode == CFX_SAXItem::Type::Instruction) {
      NotifyTargetData();
      ClearData();
      ClearName();
    }
    ParseTagClose();
    m_eMode = CFX_SaxMode::Text;
    return;
  }

  switch (m_ePrevMode) {
    case CFX_SaxMode::TagName:
      AppendToData('/');
      m_eMode = CFX_SaxMode::TagName;
      m_ePrevMode = CFX_SaxMode::Text;
      ParseTagName();
      break;
    case CFX_SaxMode::TagAttributeName:
      AppendToName('/');
      m_eMode = CFX_SaxMode::TagAttributeName;
      m_ePrevMode = CFX_SaxMode::Text;
      ParseTagAttributeName();
      break;
    case CFX_SaxMode::TargetData:
      AppendToName('?');
      m_eMode = CFX_SaxMode::TargetData;
      m_ePrevMode = CFX_SaxMode::Text;
      ParseTargetData();
      break;
    default:
      break;
  }
}
// Closes the current item: records the position, notifies the handler and
// pops the item. The position has to be stored first because NotifyClose()
// passes it on to the handler.
void CFX_SAXReader::ParseTagClose() {
  const uint32_t dwPos = m_File.m_dwCur + m_File.m_dwBufIndex;
  m_dwNodePos = dwPos;
  NotifyClose();
  Pop();
}
// Parses an end tag ("</name>"). Bytes below 0x21 are ignored, the name is
// accumulated through ParseChar(), and '>' finishes the tag.
void CFX_SAXReader::ParseTagEnd() {
  if (m_CurByte < 0x21)
    return;

  if (m_CurByte != '>') {
    ParseChar(m_CurByte);
    return;
  }

  // The first Pop() removes the item pushed when this end tag's '<' was seen;
  // the second one, after the notification, removes the item being ended.
  Pop();
  m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
  NotifyEnd();
  ClearData();
  Pop();
  m_eMode = CFX_SaxMode::Text;
}
// Collects instruction target data into the name buffer. A '?' may begin the
// closing "?>" and is handed to ParseMaybeClose() to decide.
void CFX_SAXReader::ParseTargetData() {
  if (m_CurByte != '?') {
    AppendToName(m_CurByte);
    return;
  }
  m_ePrevMode = m_eMode;
  m_eMode = CFX_SaxMode::TagMaybeClose;
}
// Consumes one byte of a declaration node ("<!...>"), tracking nested
// delimiters on |m_SkipStack|. |m_SkipChar| is the closing delimiter
// currently awaited (the top of the stack), or 0 when the stack is empty.
// When the outermost '>' is reached the node ends; if the collected data has
// the form "[CDATA[...]]" its payload is reported as character data.
void CFX_SAXReader::SkipNode() {
// Inside a quoted string only the matching quote is significant.
if (m_SkipChar == '\'' || m_SkipChar == '\"') {
if (m_CurByte != m_SkipChar)
return;
ASSERT(!m_SkipStack.empty());
m_SkipStack.pop();
m_SkipChar = !m_SkipStack.empty() ? m_SkipStack.top() : 0;
// Note: neither the bytes inside the quotes nor the closing quote are
// added to the data buffer, because every path through this branch returns
// before the ParseChar() call at the end of the function.
return;
}
switch (m_CurByte) {
// An opening delimiter pushes the closing delimiter that must match it.
case '<':
m_SkipChar = '>';
m_SkipStack.push('>');
break;
case '[':
m_SkipChar = ']';
m_SkipStack.push(']');
break;
case '(':
m_SkipChar = ')';
m_SkipStack.push(')');
break;
case '\'':
m_SkipChar = '\'';
m_SkipStack.push('\'');
break;
case '\"':
m_SkipChar = '\"';
m_SkipStack.push('\"');
break;
default:
// The awaited closing delimiter pops one nesting level.
if (m_CurByte == m_SkipChar) {
m_SkipStack.pop();
m_SkipChar = !m_SkipStack.empty() ? m_SkipStack.top() : 0;
// An empty stack on '>' means the whole declaration has been consumed.
if (m_SkipStack.empty() && m_CurByte == '>') {
// "[CDATA[" (7 bytes) plus "]]" (2 bytes) is the minimum of 9 bytes.
if (m_Data.size() >= 9 && memcmp(m_Data.data(), "[CDATA[", 7) == 0 &&
memcmp(m_Data.data() + m_Data.size() - 2, "]]", 2) == 0) {
// Pop the declaration's own item first so that NotifyData() reports the
// payload against the item beneath it.
Pop();
// Strip the "[CDATA[" prefix and the "]]" suffix, leaving the payload.
m_Data.erase(m_Data.begin(), m_Data.begin() + 7);
m_Data.erase(m_Data.end() - 2, m_Data.end());
m_bCharData = true;
NotifyData();
m_bCharData = false;
} else {
Pop();
}
ClearData();
m_eMode = CFX_SaxMode::Text;
}
}
break;
}
// While still inside the declaration, keep the byte; the final '>' is not
// stored because the stack is empty by then.
if (!m_SkipStack.empty())
ParseChar(m_CurByte);
}
void CFX_SAXReader::NotifyData() {
if (!m_pHandler)
return;
CFX_SAXItem* pItem = GetCurrentItem();
if (!pItem)
return;
if (pItem->m_eNode == CFX_SAXItem::Type::Tag)
m_pHandler->OnTagData(
pItem->m_pNode,
m_bCharData ? CFX_SAXItem::Type::CharData : CFX_SAXItem::Type::Text,
ByteStringView(m_Data), m_File.m_dwCur + m_dwDataOffset);
}
// Reports that a tag or instruction has been entered, passing its name from
// the data buffer, and stores the node returned by the handler on the current
// item.
void CFX_SAXReader::NotifyEnter() {
  if (!m_pHandler)
    return;

  CFX_SAXItem* pItem = GetCurrentItem();
  if (!pItem)
    return;

  const bool bReportable = pItem->m_eNode == CFX_SAXItem::Type::Tag ||
                           pItem->m_eNode == CFX_SAXItem::Type::Instruction;
  if (!bReportable)
    return;

  pItem->m_pNode = m_pHandler->OnTagEnter(ByteStringView(m_Data),
                                          pItem->m_eNode, m_dwNodePos);
}
// Reports one attribute (name buffer and data buffer) of the current tag or
// instruction to the handler.
void CFX_SAXReader::NotifyAttribute() {
  if (!m_pHandler)
    return;

  CFX_SAXItem* pItem = GetCurrentItem();
  if (!pItem)
    return;

  const bool bReportable = pItem->m_eNode == CFX_SAXItem::Type::Tag ||
                           pItem->m_eNode == CFX_SAXItem::Type::Instruction;
  if (!bReportable)
    return;

  m_pHandler->OnTagAttribute(pItem->m_pNode, ByteStringView(m_Name),
                             ByteStringView(m_Data));
}
// Calls the handler's OnTagBreak() for the current item when that item is a
// tag.
void CFX_SAXReader::NotifyBreak() {
  if (!m_pHandler)
    return;

  CFX_SAXItem* pItem = GetCurrentItem();
  if (!pItem || pItem->m_eNode != CFX_SAXItem::Type::Tag)
    return;

  m_pHandler->OnTagBreak(pItem->m_pNode);
}
// Calls the handler's OnTagClose() with the recorded node position when the
// current item is a tag or an instruction.
void CFX_SAXReader::NotifyClose() {
  if (!m_pHandler)
    return;

  CFX_SAXItem* pItem = GetCurrentItem();
  if (!pItem)
    return;

  const bool bReportable = pItem->m_eNode == CFX_SAXItem::Type::Tag ||
                           pItem->m_eNode == CFX_SAXItem::Type::Instruction;
  if (!bReportable)
    return;

  m_pHandler->OnTagClose(pItem->m_pNode, m_dwNodePos);
}
// Calls the handler's OnTagEnd() with the data buffer and the recorded node
// position when the current item is a tag.
void CFX_SAXReader::NotifyEnd() {
  if (!m_pHandler)
    return;

  CFX_SAXItem* pItem = GetCurrentItem();
  if (!pItem || pItem->m_eNode != CFX_SAXItem::Type::Tag)
    return;

  m_pHandler->OnTagEnd(pItem->m_pNode, ByteStringView(m_Data), m_dwNodePos);
}
// Reports target data to the handler: the name buffer for an instruction
// item, the data buffer for a comment item. Other item types are ignored.
void CFX_SAXReader::NotifyTargetData() {
  if (!m_pHandler)
    return;

  CFX_SAXItem* pItem = GetCurrentItem();
  if (!pItem)
    return;

  switch (pItem->m_eNode) {
    case CFX_SAXItem::Type::Instruction:
      m_pHandler->OnTargetData(pItem->m_pNode, pItem->m_eNode,
                               ByteStringView(m_Name), m_dwNodePos);
      break;
    case CFX_SAXItem::Type::Comment:
      m_pHandler->OnTargetData(pItem->m_pNode, pItem->m_eNode,
                               ByteStringView(m_Data), m_dwNodePos);
      break;
    default:
      break;
  }
}
// Sets the skip flag on the current item, if there is one. Items pushed
// afterwards copy the flag from the item on top of the stack (see Push()).
void CFX_SAXReader::SkipCurrentNode() {
  if (CFX_SAXItem* pItem = GetCurrentItem())
    pItem->m_bSkip = true;
}