// Copyright 2016 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #include "core/fpdftext/cpdf_textpagefind.h" #include <cwchar> #include <cwctype> #include <vector> #include "core/fpdftext/cpdf_textpage.h" #include "core/fxcrt/fx_string.h" #include "core/fxcrt/fx_system.h" #include "third_party/base/stl_util.h" namespace { bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) { if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) || (curChar >= 0xFE70 && curChar <= 0xFEFF) || (curChar >= 0xFB50 && curChar <= 0xFDFF) || (curChar >= 0x0400 && curChar <= 0x04FF) || (curChar >= 0x0500 && curChar <= 0x052F) || (curChar >= 0xA640 && curChar <= 0xA69F) || (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || (curChar >= 0x2000 && curChar <= 0x206F)) { return false; } return true; } } // namespace CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) : m_pTextPage(pTextPage), m_flags(0), m_findNextStart(-1), m_findPreStart(-1), m_bMatchCase(false), m_bMatchWholeWord(false), m_resStart(0), m_resEnd(-1), m_IsFind(false) { m_strText = m_pTextPage->GetPageText(); int nCount = pTextPage->CountChars(); if (nCount) m_CharIndex.push_back(0); for (int i = 0; i < nCount; i++) { FPDF_CHAR_INFO info; pTextPage->GetCharInfo(i, &info); int indexSize = pdfium::CollectionSize<int>(m_CharIndex); if (info.m_Flag == FPDFTEXT_CHAR_NORMAL || info.m_Flag == FPDFTEXT_CHAR_GENERATED) { if (indexSize % 2) { m_CharIndex.push_back(1); } else { if (indexSize <= 0) continue; m_CharIndex[indexSize - 1] += 1; } } else { if (indexSize % 2) { if (indexSize <= 0) continue; m_CharIndex[indexSize - 1] = i + 1; } else { m_CharIndex.push_back(i + 1); } } } int indexSize = pdfium::CollectionSize<int>(m_CharIndex); if (indexSize % 2) m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); } CPDF_TextPageFind::~CPDF_TextPageFind() {} int CPDF_TextPageFind::GetCharIndex(int index) const { return m_pTextPage->CharIndexFromTextIndex(index); } bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, int flags, int startPos) { if (!m_pTextPage) return false; if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) m_strText = m_pTextPage->GetPageText(); CFX_WideString findwhatStr = findwhat; m_findWhat = findwhatStr; m_flags = flags; m_bMatchCase = flags & FPDFTEXT_MATCHCASE; if (m_strText.IsEmpty()) { m_IsFind = false; return true; } FX_STRSIZE len = findwhatStr.GetLength(); if (!m_bMatchCase) { findwhatStr.MakeLower(); m_strText.MakeLower(); } m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD); m_findNextStart = startPos; if (startPos == -1) m_findPreStart = m_strText.GetLength() - 1; else m_findPreStart = startPos; m_csFindWhatArray.clear(); int i = 0; while (i < len) { if (findwhatStr.GetAt(i) != ' ') break; i++; } if (i < len) ExtractFindWhat(findwhatStr); else m_csFindWhatArray.push_back(findwhatStr); if (m_csFindWhatArray.empty()) return false; m_IsFind = true; m_resStart = 0; m_resEnd = -1; return true; } bool CPDF_TextPageFind::FindNext() { if (!m_pTextPage) return false; m_resArray.clear(); if (m_findNextStart == -1) return false; if (m_strText.IsEmpty()) { m_IsFind = false; return m_IsFind; } int strLen = m_strText.GetLength(); if (m_findNextStart > strLen - 1) { m_IsFind = false; return m_IsFind; } int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); int nResultPos = 0; int nStartPos = 0; nStartPos = m_findNextStart; bool bSpaceStart = false; for (int iWord = 0; iWord < nCount; iWord++) { CFX_WideString csWord = m_csFindWhatArray[iWord]; if (csWord.IsEmpty()) { if (iWord == nCount - 1) { FX_WCHAR strInsert = m_strText.GetAt(nStartPos); if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || strInsert == TEXT_RETURN_CHAR || strInsert == 160) { nResultPos = nStartPos + 1; break; } iWord = -1; } else if (iWord == 0) { bSpaceStart = true; } continue; } int endIndex; nResultPos = m_strText.Find(csWord.c_str(), nStartPos); if (nResultPos == -1) { m_IsFind = false; return m_IsFind; } endIndex = nResultPos + csWord.GetLength() - 1; if (iWord == 0) m_resStart = nResultPos; bool bMatch = true; if (iWord != 0 && !bSpaceStart) { int PreResEndPos = nStartPos; int curChar = csWord.GetAt(0); CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); if (nStartPos == nResultPos && !(IsIgnoreSpaceCharacter(lastChar) || IsIgnoreSpaceCharacter(curChar))) { bMatch = false; } for (int d = PreResEndPos; d < nResultPos; d++) { FX_WCHAR strInsert = m_strText.GetAt(d); if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) { bMatch = false; break; } } } else if (bSpaceStart) { if (nResultPos > 0) { FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) { bMatch = false; m_resStart = nResultPos; } else { m_resStart = nResultPos - 1; } } } if (m_bMatchWholeWord && bMatch) { bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); } nStartPos = endIndex + 1; if (!bMatch) { iWord = -1; if (bSpaceStart) nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); else nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); } } m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1; m_IsFind = true; int resStart = GetCharIndex(m_resStart); int resEnd = GetCharIndex(m_resEnd); m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); if (m_flags & FPDFTEXT_CONSECUTIVE) { m_findNextStart = m_resStart + 1; m_findPreStart = m_resEnd - 1; } else { m_findNextStart = m_resEnd + 1; m_findPreStart = m_resStart - 1; } return m_IsFind; } bool CPDF_TextPageFind::FindPrev() { if (!m_pTextPage) return false; m_resArray.clear(); if (m_strText.IsEmpty() || m_findPreStart < 0) { m_IsFind = false; return m_IsFind; } CPDF_TextPageFind findEngine(m_pTextPage); bool ret = findEngine.FindFirst(m_findWhat, m_flags); if (!ret) { m_IsFind = false; return m_IsFind; } int order = -1, MatchedCount = 0; while (ret) { ret = findEngine.FindNext(); if (ret) { int order1 = findEngine.GetCurOrder(); int MatchedCount1 = findEngine.GetMatchedCount(); if (((order1 + MatchedCount1) - 1) > m_findPreStart) break; order = order1; MatchedCount = MatchedCount1; } } if (order == -1) { m_IsFind = false; return m_IsFind; } m_resStart = m_pTextPage->TextIndexFromCharIndex(order); m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); m_IsFind = true; m_resArray = m_pTextPage->GetRectArray(order, MatchedCount); if (m_flags & FPDFTEXT_CONSECUTIVE) { m_findNextStart = m_resStart + 1; m_findPreStart = m_resEnd - 1; } else { m_findNextStart = m_resEnd + 1; m_findPreStart = m_resStart - 1; } return m_IsFind; } void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { if (findwhat.IsEmpty()) return; int index = 0; while (1) { CFX_WideString csWord = TEXT_EMPTY; int ret = ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR); if (csWord.IsEmpty()) { if (ret) { m_csFindWhatArray.push_back(L""); index++; continue; } else { break; } } int pos = 0; while (pos < csWord.GetLength()) { CFX_WideString curStr = csWord.Mid(pos, 1); FX_WCHAR curChar = csWord.GetAt(pos); if (IsIgnoreSpaceCharacter(curChar)) { if (pos > 0 && curChar == 0x2019) { pos++; continue; } if (pos > 0) m_csFindWhatArray.push_back(csWord.Mid(0, pos)); m_csFindWhatArray.push_back(curStr); if (pos == csWord.GetLength() - 1) { csWord.clear(); break; } csWord = csWord.Right(csWord.GetLength() - pos - 1); pos = 0; continue; } pos++; } if (!csWord.IsEmpty()) m_csFindWhatArray.push_back(csWord); index++; } } bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, int startPos, int endPos) { FX_WCHAR char_left = 0; FX_WCHAR char_right = 0; int char_count = endPos - startPos + 1; if (char_count < 1) return false; if (char_count == 1 && csPageText.GetAt(startPos) > 255) return true; if (startPos - 1 >= 0) char_left = csPageText.GetAt(startPos - 1); if (startPos + char_count < csPageText.GetLength()) char_right = csPageText.GetAt(startPos + char_count); if ((char_left > 'A' && char_left < 'a') || (char_left > 'a' && char_left < 'z') || (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || (char_right > 'A' && char_right < 'a') || (char_right > 'a' && char_right < 'z') || (char_right > 0xfb00 && char_right < 0xfb06) || std::iswdigit(char_right)) { return false; } if (!(('A' > char_left || char_left > 'Z') && ('a' > char_left || char_left > 'z') && ('A' > char_right || char_right > 'Z') && ('a' > char_right || char_right > 'z'))) { return false; } if (char_count > 0) { if (csPageText.GetAt(startPos) >= L'0' && csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && char_left <= L'9') { return false; } if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && char_right >= L'0' && char_right <= L'9') { return false; } } return true; } bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, const FX_WCHAR* lpszFullString, int iSubString, FX_WCHAR chSep) { if (!lpszFullString) return false; while (iSubString--) { lpszFullString = std::wcschr(lpszFullString, chSep); if (!lpszFullString) { rString.clear(); return false; } lpszFullString++; while (*lpszFullString == chSep) lpszFullString++; } const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep); int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString) : (int)FXSYS_wcslen(lpszFullString); ASSERT(nLen >= 0); FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, nLen * sizeof(FX_WCHAR)); rString.ReleaseBuffer(); return true; } CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { CFX_WideString str2; str2.clear(); int nlen = str.GetLength(); for (int i = nlen - 1; i >= 0; i--) str2 += str.GetAt(i); return str2; } int CPDF_TextPageFind::GetCurOrder() const { return GetCharIndex(m_resStart); } int CPDF_TextPageFind::GetMatchedCount() const { int resStart = GetCharIndex(m_resStart); int resEnd = GetCharIndex(m_resEnd); return resEnd - resStart + 1; }