C++程序  |  790行  |  24.36 KB

// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

#include <cctype>
#include <cwctype>
#include <memory>

#include "core/include/fpdfapi/fpdf_page.h"
#include "core/include/fpdfapi/fpdf_pageobj.h"
#include "core/include/fpdfapi/fpdf_resource.h"
#include "core/include/fpdftext/fpdf_text.h"
#include "core/include/fxcrt/fx_bidi.h"
#include "core/include/fxcrt/fx_ucd.h"
#include "text_int.h"
#include "txtproc.h"

// Converts a single Unicode character into a byte string.
//
// unicode: the character to convert.
// destcp:  destination code page. When it is 0 no code-page conversion is
//          attempted: characters below 0x80 are returned as one byte.
// defchar: string returned when neither the code-page conversion nor
//          FCS_GetAltStr() yields a result.
CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode,
                                  int destcp,
                                  const FX_CHAR* defchar) {
  if (destcp != 0) {
    char converted[10];
    int used_default = 0;
    int nbytes = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1,
                                           converted, 10, NULL, &used_default);
    // Accept the conversion only when bytes were produced and the converter
    // did not report use of its default character.
    if (nbytes && !used_default) {
      return CFX_ByteString(converted, nbytes);
    }
  } else if (unicode < 0x80) {
    return CFX_ByteString((char)unicode);
  }

  // Shared fallback: an alternative string if one exists, else |defchar|.
  const FX_CHAR* alternative = FCS_GetAltStr(unicode);
  if (alternative) {
    return CFX_ByteString(alternative);
  }
  return CFX_ByteString(defchar);
}
// Constructor with an empty body; it assigns no members itself.
CTextPage::CTextPage() {
}
// Deletes every baseline and every column object referenced by the page's
// m_BaseLines and m_TextColumns arrays.
CTextPage::~CTextPage() {
  for (int line = 0; line < m_BaseLines.GetSize(); line++) {
    delete m_BaseLines.GetAt(line);
  }
  for (int column = 0; column < m_TextColumns.GetSize(); column++) {
    delete m_TextColumns.GetAt(column);
  }
}
// Adds the characters of one page object to the page's baseline model.
//
// Objects whose type is not PDFPAGE_TEXT are ignored. For a text object with
// a non-zero baseline angle every character becomes its own text box, placed
// by transforming its box with the object's text matrix. Otherwise the
// characters are grouped into segments: a new segment starts when a character
// lies left of its predecessor or the gap to the predecessor exceeds half a
// space width, and each segment is inserted on the baseline at m_PosY.
void CTextPage::ProcessObject(CPDF_PageObject* pObject) {
  if (pObject->m_Type != PDFPAGE_TEXT) {
    return;
  }
  CPDF_TextObject* pTextObj = (CPDF_TextObject*)pObject;
  CPDF_Font* pFont = pTextObj->m_TextState.GetFont();
  int item_count = pTextObj->CountItems();
  // Two floats per item; read below as [2*k] = left and [2*k+1] = right.
  FX_FLOAT* pCharPos = FX_Alloc2D(FX_FLOAT, item_count, 2);
  pTextObj->CalcCharPos(pCharPos);

  FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
  FX_FLOAT fontsize_v = pTextObj->m_TextState.GetFontSizeV();
  FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');

  // Space width comes from the font when it has a char code for ' ';
  // a quarter of the horizontal font size is used when that yields zero.
  FX_FLOAT spacew = 0;
  if (space_charcode != -1) {
    spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
  }
  if (spacew == 0) {
    spacew = fontsize_h / 4;
  }

  if (pTextObj->m_TextState.GetBaselineAngle() != 0) {
    // Angled text: one text box per character.
    CFX_Matrix text_matrix;
    pTextObj->GetTextMatrix(&text_matrix);
    int pos_index = 0;
    for (int idx = 0; idx < pTextObj->m_nChars; idx++) {
      // With exactly one character the pointer field itself holds the code.
      FX_DWORD charcode;
      if (pTextObj->m_nChars == 1) {
        charcode = (FX_DWORD)(uintptr_t)pTextObj->m_pCharCodes;
      } else {
        charcode = pTextObj->m_pCharCodes[idx];
      }
      if (charcode == (FX_DWORD)-1) {
        continue;
      }
      FX_RECT glyph_box;
      pFont->GetCharBBox(charcode, glyph_box);
      FX_FLOAT char_left =
          pCharPos
              ? pCharPos[pos_index * 2]
              : glyph_box.left * pTextObj->m_TextState.GetFontSize() / 1000;
      FX_FLOAT char_right =
          pCharPos
              ? pCharPos[pos_index * 2 + 1]
              : glyph_box.right * pTextObj->m_TextState.GetFontSize() / 1000;
      FX_FLOAT char_top =
          glyph_box.top * pTextObj->m_TextState.GetFontSize() / 1000;
      FX_FLOAT char_bottom =
          glyph_box.bottom * pTextObj->m_TextState.GetFontSize() / 1000;
      pos_index++;

      // The origin must be transformed before TransformRect() overwrites
      // char_left in place.
      FX_FLOAT origin_x, origin_y;
      text_matrix.Transform(char_left, 0, origin_x, origin_y);
      text_matrix.TransformRect(char_left, char_right, char_top, char_bottom);

      CFX_ByteString single_char;
      pFont->AppendChar(single_char, charcode);
      InsertTextBox(NULL, origin_y, char_left, char_right, char_top,
                    char_bottom, spacew, fontsize_v, single_char, pFont);
    }
    FX_Free(pCharPos);
    return;
  }

  // Unrotated text: scale the positions by the horizontal font-size ratio.
  FX_FLOAT ratio_h = fontsize_h / pTextObj->m_TextState.GetFontSize();
  for (int k = 0; k < item_count * 2; k++) {
    pCharPos[k] *= ratio_h;
  }

  FX_FLOAT baseline = pTextObj->m_PosY;
  FX_FLOAT topy = pTextObj->m_Top;
  FX_FLOAT bottomy = pTextObj->m_Bottom;
  FX_FLOAT leftx = pTextObj->m_Left;
  CTextBaseLine* pBaseLine = NULL;
  CFX_ByteString segment;
  FX_FLOAT segment_left = 0, segment_right = 0;
  FX_FLOAT last_left = 0, last_right = 0;
  // NOTE(review): space_count is never incremented in this function, so the
  // two branches that test it below are never taken.
  int space_count = 0;
  int pos_index = 0;
  for (int idx = 0; idx < pTextObj->m_nChars; idx++) {
    FX_DWORD charcode;
    if (pTextObj->m_nChars == 1) {
      charcode = (FX_DWORD)(uintptr_t)pTextObj->m_pCharCodes;
    } else {
      charcode = pTextObj->m_pCharCodes[idx];
    }
    if (charcode == (FX_DWORD)-1) {
      continue;
    }
    FX_FLOAT char_left = pCharPos[pos_index * 2];
    FX_FLOAT char_right = pCharPos[pos_index * 2 + 1];
    pos_index++;

    // Flush the current segment when the character moves backwards or the
    // gap to the previous character exceeds half a space width.
    bool bBreakSegment =
        char_left < last_left || (char_left - last_right) > spacew / 2;
    if (bBreakSegment) {
      pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
                                leftx + segment_right, topy, bottomy, spacew,
                                fontsize_v, segment, pFont);
      segment_left = char_left;
      segment = "";
    }
    if (space_count > 1) {
      pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
                                leftx + segment_right, topy, bottomy, spacew,
                                fontsize_v, segment, pFont);
      segment = "";
    } else if (space_count == 1) {
      pFont->AppendChar(segment, ' ');
    }
    if (segment.GetLength() == 0) {
      segment_left = char_left;
    }
    segment_right = char_right;
    pFont->AppendChar(segment, charcode);
    space_count = 0;
    last_left = char_left;
    last_right = char_right;
  }
  // Flush whatever is left in the final segment.
  if (segment.GetLength()) {
    pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
                              leftx + segment_right, topy, bottomy, spacew,
                              fontsize_v, segment, pFont);
  }
  FX_Free(pCharPos);
}
// Decodes |str| with |pFont| and inserts the resulting text as a box on a
// baseline.
//
// pBaseLine: baseline to insert into, or NULL to look one up by |basey|.
//            When no existing baseline has exactly that value a new one is
//            created and inserted before the first baseline with a smaller
//            value.
// basey:     baseline position used for the lookup / the new baseline.
// leftx, rightx, topy, bottomy, spacew, fontsize_v: forwarded unchanged to
//            CTextBaseLine::InsertTextBox().
// str:       encoded character codes; an empty string inserts nothing.
// pFont:     font used to split |str| into char codes and map them to text.
//
// Returns the baseline that received the box, or NULL when |str| is empty.
CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine,
                                        FX_FLOAT basey,
                                        FX_FLOAT leftx,
                                        FX_FLOAT rightx,
                                        FX_FLOAT topy,
                                        FX_FLOAT bottomy,
                                        FX_FLOAT spacew,
                                        FX_FLOAT fontsize_v,
                                        CFX_ByteString& str,
                                        CPDF_Font* pFont) {
  if (str.GetLength() == 0) {
    return NULL;
  }

  if (!pBaseLine) {
    int slot = 0;
    while (slot < m_BaseLines.GetSize()) {
      CTextBaseLine* pCandidate = m_BaseLines.GetAt(slot);
      if (pCandidate->m_BaseLine == basey) {
        pBaseLine = pCandidate;
        break;
      }
      if (pCandidate->m_BaseLine < basey) {
        break;
      }
      slot++;
    }
    if (!pBaseLine) {
      pBaseLine = new CTextBaseLine;
      pBaseLine->m_BaseLine = basey;
      m_BaseLines.InsertAt(slot, pBaseLine);
    }
  }

  // Convert char codes to text; a code without a Unicode mapping is appended
  // as a raw character value.
  CFX_WideString text;
  const FX_CHAR* pStr = str;
  int len = str.GetLength();
  int offset = 0;
  while (offset < len) {
    FX_DWORD charcode = pFont->GetNextChar(pStr, len, offset);
    CFX_WideString mapped = pFont->UnicodeFromCharCode(charcode);
    if (!mapped.IsEmpty()) {
      text += mapped;
    } else {
      text += (FX_WCHAR)charcode;
    }
  }
  pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v,
                           text);
  return pBaseLine;
}
// Renders the collected baselines into |lines|, one string per baseline.
//
// lines:     receives the output; an empty string is added before a baseline
//            whose distance to the previous one is at least 1.5 times either
//            line's maximum vertical font size.
// iMinWidth: output width in characters. When m_bAutoWidth is set it is
//            raised to a width estimated from the average character width of
//            each baseline.
//
// Steps: compute the horizontal extent of all baselines, merge adjacent
// boxes, merge baselines that CanMerge() accepts, optionally estimate the
// width, optionally detect columns, then write each baseline.
void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) {
  FX_FLOAT lastheight = -1;
  FX_FLOAT lastbaseline = -1;
  FX_FLOAT MinLeftX = 1000000;
  FX_FLOAT MaxRightX = 0;
  int i;
  // Horizontal extent over every baseline that reports a width.
  for (i = 0; i < m_BaseLines.GetSize(); i++) {
    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    FX_FLOAT leftx, rightx;
    if (pBaseLine->GetWidth(leftx, rightx)) {
      if (leftx < MinLeftX) {
        MinLeftX = leftx;
      }
      if (rightx > MaxRightX) {
        MaxRightX = rightx;
      }
    }
  }
  for (i = 0; i < m_BaseLines.GetSize(); i++) {
    m_BaseLines.GetAt(i)->MergeBoxes();
  }
  // Fold each baseline into its predecessor when they can be merged. The
  // index is stepped back after a removal so the next entry is compared
  // against the same predecessor.
  for (i = 1; i < m_BaseLines.GetSize(); i++) {
    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1);
    if (pBaseLine->CanMerge(pPrevLine)) {
      pPrevLine->Merge(pBaseLine);
      delete pBaseLine;
      m_BaseLines.RemoveAt(i);
      i--;
    }
  }
  if (m_bAutoWidth) {
    int* widths = FX_Alloc(int, m_BaseLines.GetSize());
    for (i = 0; i < m_BaseLines.GetSize(); i++) {
      widths[i] = 0;
      CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
      int TotalChars = 0;
      FX_FLOAT TotalWidth = 0;
      int minchars;
      pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
      if (TotalChars) {
        FX_FLOAT charwidth = TotalWidth / TotalChars;
        widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
      }
      // Clamp to at most 1000 columns, then make sure the line's own text
      // still fits.
      if (widths[i] > 1000) {
        widths[i] = 1000;
      }
      if (widths[i] < minchars) {
        widths[i] = minchars;
      }
    }
    int AvgWidth = 0, widthcount = 0;
    for (i = 0; i < m_BaseLines.GetSize(); i++)
      if (widths[i]) {
        AvgWidth += widths[i];
        widthcount++;
      }
    // When no baseline has a non-zero width (for example an empty page) the
    // division would be 0/0; converting the resulting NaN to int is
    // undefined. AvgWidth is already 0 in that case, so just skip it.
    if (widthcount > 0) {
      AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
    }
    int MaxWidth = 0;
    for (i = 0; i < m_BaseLines.GetSize(); i++)
      if (MaxWidth < widths[i]) {
        MaxWidth = widths[i];
      }
    // Cap the widest line at 120% of the average.
    if (MaxWidth > AvgWidth * 6 / 5) {
      MaxWidth = AvgWidth * 6 / 5;
    }
    FX_Free(widths);
    if (iMinWidth < MaxWidth) {
      iMinWidth = MaxWidth;
    }
  }
  // Baseline merging above may have made further boxes adjacent.
  for (i = 0; i < m_BaseLines.GetSize(); i++) {
    m_BaseLines.GetAt(i)->MergeBoxes();
  }
  if (m_bKeepColumn) {
    FindColumns();
  }
  for (i = 0; i < m_BaseLines.GetSize(); i++) {
    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    if (lastheight >= 0) {
      FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
      if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
        lines.Add(L"");
      }
    }
    lastheight = pBaseLine->m_MaxFontSizeV;
    lastbaseline = pBaseLine->m_BaseLine;
    CFX_WideString str;
    pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
    lines.Add(str);
  }
}
// Appends the normalized form of |wChar| to |sDest|.
//
// The character is first passed through FX_GetMirrorChar(). If
// FX_Unicode_GetNormalization() reports one or more characters for the
// result, those are appended; otherwise the mirrored character itself is.
void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) {
  wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);

  // First call with a null buffer only asks for the length.
  FX_WCHAR* pNormalized = NULL;
  FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pNormalized);
  if (nCount >= 1) {
    pNormalized = new FX_WCHAR[nCount];
    FX_Unicode_GetNormalization(wChar, pNormalized);
    for (int k = 0; k < nCount; k++) {
      sDest += pNormalized[k];
    }
    delete[] pNormalized;
    return;
  }
  sDest += wChar;
}
// Rewrites |str| in place according to its bidirectional structure.
//
// The string is split into directional runs with CFX_BidiChar. Each run is
// recorded in |order| as a triple (start, count, direction). If there is at
// least one RIGHT run and RIGHT runs are at least as numerous as LEFT runs,
// the string is treated as right-to-left and the runs are emitted from last
// to first; otherwise they are emitted first to last. Characters of reversed
// runs go through NormalizeCompositeChar(); other runs are copied as-is.
//
// NOTE(review): the raw values 0 / 1 / 2 compared against below are
// presumably CFX_BidiChar's neutral / LEFT / RIGHT directions - confirm
// against fx_bidi.h.
void NormalizeString(CFX_WideString& str) {
  if (str.GetLength() <= 0) {
    return;
  }
  CFX_WideString sBuffer;
  std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
  CFX_WordArray order;
  FX_BOOL bR2L = FALSE;
  int32_t start = 0, count = 0, i = 0;
  int nR2L = 0, nL2R = 0;
  // Collect the directional runs and count RIGHT and LEFT runs.
  for (i = 0; i < str.GetLength(); i++) {
    if (pBidiChar->AppendChar(str.GetAt(i))) {
      CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
      order.Add(start);
      order.Add(count);
      order.Add(ret);
      if (!bR2L) {
        if (ret == CFX_BidiChar::RIGHT) {
          nR2L++;
        } else if (ret == CFX_BidiChar::LEFT) {
          nL2R++;
        }
      }
    }
  }
  // Flush the final, still-open run.
  if (pBidiChar->EndChar()) {
    CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
    order.Add(start);
    order.Add(count);
    order.Add(ret);
    if (!bR2L) {
      if (ret == CFX_BidiChar::RIGHT) {
        nR2L++;
      } else if (ret == CFX_BidiChar::LEFT) {
        nL2R++;
      }
    }
  }
  if (nR2L > 0 && nR2L >= nL2R) {
    bR2L = TRUE;
  }
  if (bR2L) {
    // Right-to-left string: walk the triples backwards; |j| indexes the
    // direction field of the current triple.
    int count = order.GetSize();
    for (int j = count - 1; j > 0; j -= 3) {
      int ret = order.GetAt(j);
      int start = order.GetAt(j - 2);
      int count1 = order.GetAt(j - 1);
      if (ret == 2 || ret == 0) {
        // Emit the run reversed, normalizing every character.
        for (int i = start + count1 - 1; i >= start; i--) {
          NormalizeCompositeChar(str[i], sBuffer);
        }
      } else {
        // Look backwards for the nearest earlier triple with direction 2 so
        // the runs in between can be emitted together in forward order.
        i = j;
        FX_BOOL bSymbol = FALSE;
        while (i > 0 && order.GetAt(i) != 2) {
          bSymbol = !order.GetAt(i);
          i -= 3;
        }
        int end = start + count1;
        int n = 0;
        if (bSymbol) {
          n = i + 6;
        } else {
          n = i + 3;
        }
        if (n >= j) {
          for (int m = start; m < end; m++) {
            sBuffer += str[m];
          }
        } else {
          i = j;
          j = n;
          for (; n <= i; n += 3) {
            int start = order.GetAt(n - 2);
            int count1 = order.GetAt(n - 1);
            int end = start + count1;
            for (int m = start; m < end; m++) {
              sBuffer += str[m];
            }
          }
        }
      }
    }
  } else {
    // Left-to-right string: walk the triples forwards; |j| indexes the
    // start field of the current triple.
    int count = order.GetSize();
    FX_BOOL bL2R = FALSE;
    for (int j = 0; j < count; j += 3) {
      int ret = order.GetAt(j + 2);
      int start = order.GetAt(j);
      int count1 = order.GetAt(j + 1);
      if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
        int i = j + 3;
        while (bR2L && i < count) {
          if (order.GetAt(i + 2) == 1) {
            break;
          } else {
            i += 3;
          }
        }
        // Restart the scan once with bL2R set. The restart must happen only
        // on the first pass: a leading run with direction 2 satisfies the
        // outer condition again after the restart, and restarting
        // unconditionally would loop forever.
        if (i == 3 && !bL2R) {
          j = -3;
          bL2R = TRUE;
          continue;
        }
        int end = str.GetLength() - 1;
        if (i < count) {
          end = order.GetAt(i) - 1;
        }
        j = i - 3;
        // Emit characters end..start in reverse. The index must be |n|;
        // |i| is a position in |order|, not in |str|.
        for (int n = end; n >= start; n--) {
          NormalizeCompositeChar(str[n], sBuffer);
        }
      } else {
        int end = start + count1;
        for (int i = start; i < end; i++) {
          sBuffer += str[i];
        }
      }
    }
  }
  str.Empty();
  str += sBuffer;
}
// Returns TRUE when every character of |str| is a digit, '-', '+', '.' or a
// space. An empty string therefore yields TRUE.
static FX_BOOL IsNumber(CFX_WideString& str) {
  int pos = 0;
  while (pos < str.GetLength()) {
    FX_WCHAR ch = str[pos];
    // TODO(dsinclair): --.+ +.-- should probably not be a number.
    bool bAllowed = std::iswdigit(ch) || ch == '-' || ch == '+' ||
                    ch == '.' || ch == ' ';
    if (!bAllowed) {
      return FALSE;
    }
    pos++;
  }
  return TRUE;
}
void CTextPage::FindColumns() {
  int i;
  for (i = 0; i < m_BaseLines.GetSize(); i++) {
    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
      CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j);
      CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
      if (pColumn) {
        pColumn->m_AvgPos =
            (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
            (pColumn->m_Count + 1);
        pColumn->m_Count++;
      } else {
        pColumn = new CTextColumn;
        pColumn->m_Count = 1;
        pColumn->m_AvgPos = pTextBox->m_Right;
        pColumn->m_TextPos = -1;
        m_TextColumns.Add(pColumn);
      }
    }
  }
  int mincount = m_BaseLines.GetSize() / 4;
  for (i = 0; i < m_TextColumns.GetSize(); i++) {
    CTextColumn* pTextColumn = m_TextColumns.GetAt(i);
    if (pTextColumn->m_Count >= mincount) {
      continue;
    }
    delete pTextColumn;
    m_TextColumns.RemoveAt(i);
    i--;
  }
  for (i = 0; i < m_BaseLines.GetSize(); i++) {
    CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
      CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j);
      if (IsNumber(pTextBox->m_Text)) {
        pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
      }
    }
  }
}
// Returns the first column whose average position lies strictly within one
// unit of |xpos|, or NULL when no column qualifies.
CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) {
  int index = 0;
  while (index < m_TextColumns.GetSize()) {
    CTextColumn* pCandidate = m_TextColumns.GetAt(index);
    bool bBelowUpper = pCandidate->m_AvgPos < xpos + 1;
    bool bAboveLower = pCandidate->m_AvgPos > xpos - 1;
    if (bBelowUpper && bAboveLower) {
      return pCandidate;
    }
    index++;
  }
  return NULL;
}
// Intentionally empty: the body does nothing with |pTextObj|.
void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {
}
// Starts with an inverted vertical extent (top far below bottom) so that the
// first inserted box replaces both bounds, and a zero maximum font size.
CTextBaseLine::CTextBaseLine() {
  m_MaxFontSizeV = 0;
  m_Bottom = 100000;
  m_Top = -100000;
}
// Deletes every text box referenced by m_TextList.
CTextBaseLine::~CTextBaseLine() {
  int box = 0;
  while (box < m_TextList.GetSize()) {
    delete m_TextList.GetAt(box);
    box++;
  }
}
// Creates a text box from the arguments and inserts it into m_TextList
// before the first existing box whose left edge is greater than |leftx|
// (or at the end when there is none). The baseline's top, bottom and maximum
// vertical font size are widened to cover the new box first.
void CTextBaseLine::InsertTextBox(FX_FLOAT leftx,
                                  FX_FLOAT rightx,
                                  FX_FLOAT topy,
                                  FX_FLOAT bottomy,
                                  FX_FLOAT spacew,
                                  FX_FLOAT fontsize_v,
                                  const CFX_WideString& text) {
  if (m_Top < topy) {
    m_Top = topy;
  }
  if (m_Bottom > bottomy) {
    m_Bottom = bottomy;
  }
  if (m_MaxFontSizeV < fontsize_v) {
    m_MaxFontSizeV = fontsize_v;
  }

  // Locate the insertion slot.
  int slot = 0;
  while (slot < m_TextList.GetSize()) {
    if (m_TextList.GetAt(slot)->m_Left > leftx) {
      break;
    }
    slot++;
  }

  CTextBox* pNewBox = new CTextBox;
  pNewBox->m_Text = text;
  pNewBox->m_Left = leftx;
  pNewBox->m_Right = rightx;
  pNewBox->m_Top = topy;
  pNewBox->m_Bottom = bottomy;
  pNewBox->m_SpaceWidth = spacew;
  pNewBox->m_FontSizeV = fontsize_v;
  pNewBox->m_pColumn = NULL;
  m_TextList.InsertAt(slot, pNewBox);
}
// Forward declaration; the definition is not in this file section.
// NOTE(review): presumably computes the overlap of the ranges [low1, high1]
// and [low2, high2] into |interlow| / |interhigh| and returns whether they
// intersect - confirm against the definition.
FX_BOOL GetIntersection(FX_FLOAT low1,
                        FX_FLOAT high1,
                        FX_FLOAT low2,
                        FX_FLOAT high2,
                        FX_FLOAT& interlow,
                        FX_FLOAT& interhigh);
// Decides whether this baseline and |pOther| may be merged into one line.
//
// Returns FALSE when GetIntersection() reports no vertical intersection, when
// the intersection height is below half the height of both lines, or when a
// pair of boxes (one from each line) overlaps horizontally by a
// non-negligible amount and fails the baseline-distance test below.
// Returns TRUE otherwise.
FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) {
  FX_FLOAT overlap_top, overlap_bottom;
  FX_BOOL bVertical =
      GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
                      overlap_bottom, overlap_top);
  if (!bVertical) {
    return FALSE;
  }
  FX_FLOAT overlap_h = overlap_top - overlap_bottom;
  bool bTooShallow = overlap_h < (m_Top - m_Bottom) / 2 &&
                     overlap_h < (pOther->m_Top - pOther->m_Bottom) / 2;
  if (bTooShallow) {
    return FALSE;
  }

  FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
  for (int mine = 0; mine < m_TextList.GetSize(); mine++) {
    CTextBox* pMine = m_TextList.GetAt(mine);
    for (int theirs = 0; theirs < pOther->m_TextList.GetSize(); theirs++) {
      CTextBox* pTheirs = pOther->m_TextList.GetAt(theirs);
      FX_FLOAT overlap_left, overlap_right;
      if (!GetIntersection(pMine->m_Left, pMine->m_Right, pTheirs->m_Left,
                           pTheirs->m_Right, overlap_left, overlap_right)) {
        continue;
      }
      // Overlaps narrower than half a space width of both boxes are ignored.
      FX_FLOAT overlap_w = overlap_right - overlap_left;
      bool bNegligible = overlap_w < pMine->m_SpaceWidth / 2 &&
                         overlap_w < pTheirs->m_SpaceWidth / 2;
      if (bNegligible) {
        continue;
      }
      // NOTE(review): the thresholds are (bottom - top) / 2, unlike the
      // (top - bottom) height used above; confirm the intended sign.
      if (dy >= (pMine->m_Bottom - pMine->m_Top) / 2 ||
          dy >= (pTheirs->m_Bottom - pTheirs->m_Top) / 2) {
        return FALSE;
      }
    }
  }
  return TRUE;
}
// Copies every text box of |pOther| into this baseline via InsertTextBox().
// |pOther| keeps its own boxes; new boxes are created here.
void CTextBaseLine::Merge(CTextBaseLine* pOther) {
  int index = 0;
  while (index < pOther->m_TextList.GetSize()) {
    CTextBox* pSource = pOther->m_TextList.GetAt(index);
    InsertTextBox(pSource->m_Left, pSource->m_Right, pSource->m_Top,
                  pSource->m_Bottom, pSource->m_SpaceWidth,
                  pSource->m_FontSizeV, pSource->m_Text);
    index++;
  }
}
// Reports the horizontal extent of the line, ignoring boxes whose text is a
// single space at either end.
//
// leftx:  receives the left edge of the first box that is not L" ".
// rightx: receives the right edge of the last box that is not L" ".
// Returns FALSE (leaving both outputs untouched) when every box is L" " or
// the line has no boxes.
FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) {
  int first = 0;
  while (first < m_TextList.GetSize()) {
    if (m_TextList.GetAt(first)->m_Text != L" ") {
      break;
    }
    first++;
  }
  if (first == m_TextList.GetSize()) {
    return FALSE;
  }
  leftx = m_TextList.GetAt(first)->m_Left;

  // A non-space box exists, so this backwards scan stops at index >= 0.
  int last = m_TextList.GetSize() - 1;
  while (last >= 0) {
    if (m_TextList.GetAt(last)->m_Text != L" ") {
      break;
    }
    last--;
  }
  rightx = m_TextList.GetAt(last)->m_Right;
  return TRUE;
}
void CTextBaseLine::MergeBoxes() {
  int i = 0;
  while (1) {
    if (i >= m_TextList.GetSize() - 1) {
      break;
    }
    CTextBox* pThisText = m_TextList.GetAt(i);
    CTextBox* pNextText = m_TextList.GetAt(i + 1);
    FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
    FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0)
                          ? pNextText->m_SpaceWidth
                          : pThisText->m_SpaceWidth;
    if (spacew > 0.0 && dx < spacew * 2) {
      pThisText->m_Right = pNextText->m_Right;
      if (dx > spacew * 1.5) {
        pThisText->m_Text += L"  ";
      } else if (dx > spacew / 3) {
        pThisText->m_Text += L' ';
      }
      pThisText->m_Text += pNextText->m_Text;
      pThisText->m_SpaceWidth =
          pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth;
      m_TextList.RemoveAt(i + 1);
      delete pNextText;
    } else {
      i++;
    }
  }
}
// Appends this line's boxes to |str|, padding with spaces so each box starts
// at a character column proportional to its horizontal position.
//
// str:        output string, appended to.
// leftx:      horizontal position that maps to column 0.
// pagewidth:  horizontal span that maps to |iTextWidth| columns.
// iTextWidth: number of character columns for the full span.
//
// A box attached to a column is placed so that its text ends at the column's
// average position. Every box is followed by one space, and a box never
// starts at or before the column recorded for the previous box.
void CTextBaseLine::WriteOutput(CFX_WideString& str,
                                FX_FLOAT leftx,
                                FX_FLOAT pagewidth,
                                int iTextWidth) {
  int lastpos = -1;
  for (int index = 0; index < m_TextList.GetSize(); index++) {
    CTextBox* pBox = m_TextList.GetAt(index);
    int xpos;
    if (!pBox->m_pColumn) {
      xpos = (int)((pBox->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
    } else {
      xpos =
          (int)((pBox->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth +
                0.5);
      xpos -= pBox->m_Text.GetLength();
    }
    if (xpos <= lastpos) {
      xpos = lastpos + 1;
    }
    // Pad from the column after |lastpos| up to, but excluding, |xpos|.
    for (int column = lastpos + 1; column < xpos; column++) {
      str += ' ';
    }
    CFX_WideString normalized(pBox->m_Text);
    NormalizeString(normalized);
    str += normalized;
    str += ' ';
    // The un-normalized text length is what advances the column position.
    lastpos = xpos + pBox->m_Text.GetLength();
  }
}
void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) {
  minchars = 0;
  for (int i = 0; i < m_TextList.GetSize(); i++) {
    CTextBox* pText = m_TextList.GetAt(i);
    if (pText->m_Right - pText->m_Left < 0.002) {
      continue;
    }
    count += pText->m_Text.GetLength();
    width += pText->m_Right - pText->m_Left;
    minchars += pText->m_Text.GetLength() + 1;
  }
}
#define PI 3.1415926535897932384626433832795
// Rotates |page| and |page_bbox| when most of the page's text is rotated by
// the same multiple of 90 degrees.
//
// Every text object's baseline angle is rounded to whole degrees. Objects at
// 90, 180 and 270 degrees are counted separately; objects at other angles
// only count toward the total. If one of the three counts exceeds two thirds
// of all text objects, the matching matrix is applied to the page and to
// |page_bbox|. Otherwise nothing is changed.
static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) {
  // rotated_count[0] / [1] / [2] count text at 90 / 180 / 270 degrees.
  int total_count = 0, rotated_count[3] = {0, 0, 0};
  FX_POSITION pos = page.GetFirstObjectPosition();
  while (pos) {
    CPDF_PageObject* pObj = page.GetNextObject(pos);
    if (pObj->m_Type != PDFPAGE_TEXT) {
      continue;
    }
    total_count++;
    CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
    FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
    if (angle == 0.0) {
      continue;
    }
    // Round to the nearest degree. The int conversion truncates toward zero,
    // so negative values need -0.5; with +0.5 an exact -90 would become -89
    // and be rejected by the multiple-of-90 test below.
    double degrees = angle * 180 / PI;
    int degree = (int)(degrees < 0 ? degrees - 0.5 : degrees + 0.5);
    if (degree % 90) {
      continue;
    }
    if (degree < 0) {
      degree += 360;
    }
    // Quarter turns modulo 4: 90 -> 0, 180 -> 1, 270 -> 2, 0/360 -> -1.
    // (Modulo 3 would map 270 to -1 and leave rotated_count[2] unused.)
    int index = degree / 90 % 4 - 1;
    if (index < 0) {
      continue;
    }
    rotated_count[index]++;
  }
  if (total_count == 0) {
    return;
  }
  CFX_Matrix matrix;
  if (rotated_count[0] > total_count * 2 / 3) {
    matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
  } else if (rotated_count[1] > total_count * 2 / 3) {
    matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
  } else if (rotated_count[2] > total_count * 2 / 3) {
    matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
  } else {
    return;
  }
  page.Transform(matrix);
  page_bbox.Transform(&matrix);
}
// Extracts the text of one page as an array of wide-string lines.
//
// lines:     cleared first, then filled with the output lines.
// pDoc:      document passed to CPDF_Page::Load().
// pPage:     page dictionary; when NULL the function returns with |lines|
//            empty.
// iMinWidth: output width forwarded to CTextPage::WriteOutput().
// flags:     PDF2TXT_AUTO_ROTATE runs CheckRotate() before extraction;
//            PDF2TXT_AUTO_WIDTH and PDF2TXT_KEEP_COLUMN set the matching
//            CTextPage options; without PDF2TXT_INCLUDE_INVISIBLE, objects
//            not contained in the page bounding box are skipped.
void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
                             CPDF_Document* pDoc,
                             CPDF_Dictionary* pPage,
                             int iMinWidth,
                             FX_DWORD flags) {
  lines.RemoveAll();
  if (!pPage) {
    return;
  }

  // Load the page and parse its content with text-only options.
  CPDF_Page page;
  page.Load(pDoc, pPage);
  CPDF_ParseOptions parse_options;
  parse_options.m_bTextOnly = TRUE;
  parse_options.m_bSeparateForm = FALSE;
  page.ParseContent(&parse_options);

  CFX_FloatRect page_bbox = page.GetPageBBox();
  if (flags & PDF2TXT_AUTO_ROTATE) {
    CheckRotate(page, page_bbox);
  }

  CTextPage text_page;
  text_page.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
  text_page.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
  text_page.m_bBreakSpace = TRUE;

  for (FX_POSITION pos = page.GetFirstObjectPosition(); pos;) {
    CPDF_PageObject* pObject = page.GetNextObject(pos);
    if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
      CFX_FloatRect object_rect(pObject->m_Left, pObject->m_Bottom,
                                pObject->m_Right, pObject->m_Top);
      if (!page_bbox.Contains(object_rect)) {
        continue;
      }
    }
    text_page.ProcessObject(pObject);
  }
  text_page.WriteOutput(lines, iMinWidth);
}
// Byte-string variant of PDF_GetPageText_Unicode().
//
// The page text is extracted as wide strings and every character is then
// converted with CharFromUnicodeAlt() using the code page returned by
// FXSYS_GetACP(), with "?" as the replacement for unconvertible characters.
// |lines| is cleared before being filled; the remaining parameters are
// forwarded unchanged.
void PDF_GetPageText(CFX_ByteStringArray& lines,
                     CPDF_Document* pDoc,
                     CPDF_Dictionary* pPage,
                     int iMinWidth,
                     FX_DWORD flags) {
  lines.RemoveAll();
  CFX_WideStringArray wide_lines;
  PDF_GetPageText_Unicode(wide_lines, pDoc, pPage, iMinWidth, flags);

  int line_index = 0;
  while (line_index < wide_lines.GetSize()) {
    CFX_WideString wide_line = wide_lines[line_index];
    CFX_ByteString byte_line;
    int char_index = 0;
    while (char_index < wide_line.GetLength()) {
      byte_line +=
          CharFromUnicodeAlt(wide_line[char_index], FXSYS_GetACP(), "?");
      char_index++;
    }
    lines.Add(byte_line);
    line_index++;
  }
}
// Writes the text of one page into |buffer| via GetTextStream_Unicode().
//
// buffer: destination; EstimateSize(0, 10240) is called on it first.
// pDoc:   document passed to CPDF_Page::Load().
// pPage:  page dictionary passed to CPDF_Page::Load().
// flags:  not used by this function.
void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
                               CPDF_Document* pDoc,
                               CPDF_Dictionary* pPage,
                               FX_DWORD flags) {
  buffer.EstimateSize(0, 10240);

  // Load the page and parse its content with text-only options.
  CPDF_ParseOptions parse_options;
  parse_options.m_bTextOnly = TRUE;
  parse_options.m_bSeparateForm = FALSE;
  CPDF_Page page;
  page.Load(pDoc, pPage);
  page.ParseContent(&parse_options);

  GetTextStream_Unicode(buffer, &page, TRUE, NULL);
}