// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

#include "core/fpdftext/cpdf_textpagefind.h"

#include <cwchar>
#include <cwctype>
#include <vector>

#include "core/fpdftext/cpdf_textpage.h"
#include "core/fxcrt/fx_string.h"
#include "core/fxcrt/fx_system.h"
#include "third_party/base/stl_util.h"

namespace {

bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
  if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) ||
      (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
      (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
      (curChar >= 0x0400 && curChar <= 0x04FF) ||
      (curChar >= 0x0500 && curChar <= 0x052F) ||
      (curChar >= 0xA640 && curChar <= 0xA69F) ||
      (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
      (curChar >= 0x2000 && curChar <= 0x206F)) {
    return false;
  }
  return true;
}

}  // namespace

CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
    : m_pTextPage(pTextPage),
      m_flags(0),
      m_findNextStart(-1),
      m_findPreStart(-1),
      m_bMatchCase(false),
      m_bMatchWholeWord(false),
      m_resStart(0),
      m_resEnd(-1),
      m_IsFind(false) {
  m_strText = m_pTextPage->GetPageText();
  int nCount = pTextPage->CountChars();
  if (nCount)
    m_CharIndex.push_back(0);
  for (int i = 0; i < nCount; i++) {
    FPDF_CHAR_INFO info;
    pTextPage->GetCharInfo(i, &info);
    int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
    if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
        info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
      if (indexSize % 2) {
        m_CharIndex.push_back(1);
      } else {
        if (indexSize <= 0)
          continue;
        m_CharIndex[indexSize - 1] += 1;
      }
    } else {
      if (indexSize % 2) {
        if (indexSize <= 0)
          continue;
        m_CharIndex[indexSize - 1] = i + 1;
      } else {
        m_CharIndex.push_back(i + 1);
      }
    }
  }
  int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
  if (indexSize % 2)
    m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
}

CPDF_TextPageFind::~CPDF_TextPageFind() {}

int CPDF_TextPageFind::GetCharIndex(int index) const {
  return m_pTextPage->CharIndexFromTextIndex(index);
}

bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
                                  int flags,
                                  int startPos) {
  if (!m_pTextPage)
    return false;
  if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
    m_strText = m_pTextPage->GetPageText();
  CFX_WideString findwhatStr = findwhat;
  m_findWhat = findwhatStr;
  m_flags = flags;
  m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
  if (m_strText.IsEmpty()) {
    m_IsFind = false;
    return true;
  }
  FX_STRSIZE len = findwhatStr.GetLength();
  if (!m_bMatchCase) {
    findwhatStr.MakeLower();
    m_strText.MakeLower();
  }
  m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
  m_findNextStart = startPos;
  if (startPos == -1)
    m_findPreStart = m_strText.GetLength() - 1;
  else
    m_findPreStart = startPos;
  m_csFindWhatArray.clear();
  int i = 0;
  while (i < len) {
    if (findwhatStr.GetAt(i) != ' ')
      break;
    i++;
  }
  if (i < len)
    ExtractFindWhat(findwhatStr);
  else
    m_csFindWhatArray.push_back(findwhatStr);
  if (m_csFindWhatArray.empty())
    return false;
  m_IsFind = true;
  m_resStart = 0;
  m_resEnd = -1;
  return true;
}

bool CPDF_TextPageFind::FindNext() {
  if (!m_pTextPage)
    return false;
  m_resArray.clear();
  if (m_findNextStart == -1)
    return false;
  if (m_strText.IsEmpty()) {
    m_IsFind = false;
    return m_IsFind;
  }
  int strLen = m_strText.GetLength();
  if (m_findNextStart > strLen - 1) {
    m_IsFind = false;
    return m_IsFind;
  }
  int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
  int nResultPos = 0;
  int nStartPos = 0;
  nStartPos = m_findNextStart;
  bool bSpaceStart = false;
  for (int iWord = 0; iWord < nCount; iWord++) {
    CFX_WideString csWord = m_csFindWhatArray[iWord];
    if (csWord.IsEmpty()) {
      if (iWord == nCount - 1) {
        FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
        if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
            strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
          nResultPos = nStartPos + 1;
          break;
        }
        iWord = -1;
      } else if (iWord == 0) {
        bSpaceStart = true;
      }
      continue;
    }
    int endIndex;
    nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
    if (nResultPos == -1) {
      m_IsFind = false;
      return m_IsFind;
    }
    endIndex = nResultPos + csWord.GetLength() - 1;
    if (iWord == 0)
      m_resStart = nResultPos;
    bool bMatch = true;
    if (iWord != 0 && !bSpaceStart) {
      int PreResEndPos = nStartPos;
      int curChar = csWord.GetAt(0);
      CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
      int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
      if (nStartPos == nResultPos &&
          !(IsIgnoreSpaceCharacter(lastChar) ||
            IsIgnoreSpaceCharacter(curChar))) {
        bMatch = false;
      }
      for (int d = PreResEndPos; d < nResultPos; d++) {
        FX_WCHAR strInsert = m_strText.GetAt(d);
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = false;
          break;
        }
      }
    } else if (bSpaceStart) {
      if (nResultPos > 0) {
        FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = false;
          m_resStart = nResultPos;
        } else {
          m_resStart = nResultPos - 1;
        }
      }
    }
    if (m_bMatchWholeWord && bMatch) {
      bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
    }
    nStartPos = endIndex + 1;
    if (!bMatch) {
      iWord = -1;
      if (bSpaceStart)
        nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
      else
        nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
    }
  }
  m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
  m_IsFind = true;
  int resStart = GetCharIndex(m_resStart);
  int resEnd = GetCharIndex(m_resEnd);
  m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
  if (m_flags & FPDFTEXT_CONSECUTIVE) {
    m_findNextStart = m_resStart + 1;
    m_findPreStart = m_resEnd - 1;
  } else {
    m_findNextStart = m_resEnd + 1;
    m_findPreStart = m_resStart - 1;
  }
  return m_IsFind;
}

bool CPDF_TextPageFind::FindPrev() {
  if (!m_pTextPage)
    return false;
  m_resArray.clear();
  if (m_strText.IsEmpty() || m_findPreStart < 0) {
    m_IsFind = false;
    return m_IsFind;
  }
  CPDF_TextPageFind findEngine(m_pTextPage);
  bool ret = findEngine.FindFirst(m_findWhat, m_flags);
  if (!ret) {
    m_IsFind = false;
    return m_IsFind;
  }
  int order = -1, MatchedCount = 0;
  while (ret) {
    ret = findEngine.FindNext();
    if (ret) {
      int order1 = findEngine.GetCurOrder();
      int MatchedCount1 = findEngine.GetMatchedCount();
      if (((order1 + MatchedCount1) - 1) > m_findPreStart)
        break;
      order = order1;
      MatchedCount = MatchedCount1;
    }
  }
  if (order == -1) {
    m_IsFind = false;
    return m_IsFind;
  }
  m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
  m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
  m_IsFind = true;
  m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
  if (m_flags & FPDFTEXT_CONSECUTIVE) {
    m_findNextStart = m_resStart + 1;
    m_findPreStart = m_resEnd - 1;
  } else {
    m_findNextStart = m_resEnd + 1;
    m_findPreStart = m_resStart - 1;
  }
  return m_IsFind;
}

void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
  if (findwhat.IsEmpty())
    return;
  int index = 0;
  while (1) {
    CFX_WideString csWord = TEXT_EMPTY;
    int ret =
        ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
    if (csWord.IsEmpty()) {
      if (ret) {
        m_csFindWhatArray.push_back(L"");
        index++;
        continue;
      } else {
        break;
      }
    }
    int pos = 0;
    while (pos < csWord.GetLength()) {
      CFX_WideString curStr = csWord.Mid(pos, 1);
      FX_WCHAR curChar = csWord.GetAt(pos);
      if (IsIgnoreSpaceCharacter(curChar)) {
        if (pos > 0 && curChar == 0x2019) {
          pos++;
          continue;
        }
        if (pos > 0)
          m_csFindWhatArray.push_back(csWord.Mid(0, pos));
        m_csFindWhatArray.push_back(curStr);
        if (pos == csWord.GetLength() - 1) {
          csWord.clear();
          break;
        }
        csWord = csWord.Right(csWord.GetLength() - pos - 1);
        pos = 0;
        continue;
      }
      pos++;
    }
    if (!csWord.IsEmpty())
      m_csFindWhatArray.push_back(csWord);
    index++;
  }
}

bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
                                         int startPos,
                                         int endPos) {
  FX_WCHAR char_left = 0;
  FX_WCHAR char_right = 0;
  int char_count = endPos - startPos + 1;
  if (char_count < 1)
    return false;
  if (char_count == 1 && csPageText.GetAt(startPos) > 255)
    return true;
  if (startPos - 1 >= 0)
    char_left = csPageText.GetAt(startPos - 1);
  if (startPos + char_count < csPageText.GetLength())
    char_right = csPageText.GetAt(startPos + char_count);
  if ((char_left > 'A' && char_left < 'a') ||
      (char_left > 'a' && char_left < 'z') ||
      (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
      (char_right > 'A' && char_right < 'a') ||
      (char_right > 'a' && char_right < 'z') ||
      (char_right > 0xfb00 && char_right < 0xfb06) ||
      std::iswdigit(char_right)) {
    return false;
  }
  if (!(('A' > char_left || char_left > 'Z') &&
        ('a' > char_left || char_left > 'z') &&
        ('A' > char_right || char_right > 'Z') &&
        ('a' > char_right || char_right > 'z'))) {
    return false;
  }
  if (char_count > 0) {
    if (csPageText.GetAt(startPos) >= L'0' &&
        csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
        char_left <= L'9') {
      return false;
    }
    if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
        char_right >= L'0' && char_right <= L'9') {
      return false;
    }
  }
  return true;
}

bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
                                         const FX_WCHAR* lpszFullString,
                                         int iSubString,
                                         FX_WCHAR chSep) {
  if (!lpszFullString)
    return false;
  while (iSubString--) {
    lpszFullString = std::wcschr(lpszFullString, chSep);
    if (!lpszFullString) {
      rString.clear();
      return false;
    }
    lpszFullString++;
    while (*lpszFullString == chSep)
      lpszFullString++;
  }
  const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep);
  int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
                     : (int)FXSYS_wcslen(lpszFullString);
  ASSERT(nLen >= 0);
  FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
               nLen * sizeof(FX_WCHAR));
  rString.ReleaseBuffer();
  return true;
}

CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
  CFX_WideString str2;
  str2.clear();
  int nlen = str.GetLength();
  for (int i = nlen - 1; i >= 0; i--)
    str2 += str.GetAt(i);
  return str2;
}

int CPDF_TextPageFind::GetCurOrder() const {
  return GetCharIndex(m_resStart);
}

int CPDF_TextPageFind::GetMatchedCount() const {
  int resStart = GetCharIndex(m_resStart);
  int resEnd = GetCharIndex(m_resEnd);
  return resEnd - resStart + 1;
}