// Copyright 2014 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #include <cctype> #include <cwctype> #include <memory> #include "core/include/fpdfapi/fpdf_page.h" #include "core/include/fpdfapi/fpdf_pageobj.h" #include "core/include/fpdfapi/fpdf_resource.h" #include "core/include/fpdftext/fpdf_text.h" #include "core/include/fxcrt/fx_bidi.h" #include "core/include/fxcrt/fx_ucd.h" #include "text_int.h" #include "txtproc.h" CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, const FX_CHAR* defchar) { if (destcp == 0) { if (unicode < 0x80) { return CFX_ByteString((char)unicode); } const FX_CHAR* altstr = FCS_GetAltStr(unicode); return CFX_ByteString(altstr ? altstr : defchar); } char buf[10]; int iDef = 0; int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &iDef); if (ret && !iDef) { return CFX_ByteString(buf, ret); } const FX_CHAR* altstr = FCS_GetAltStr(unicode); return CFX_ByteString(altstr ? altstr : defchar); } CTextPage::CTextPage() {} CTextPage::~CTextPage() { int i; for (i = 0; i < m_BaseLines.GetSize(); i++) { delete m_BaseLines.GetAt(i); } for (i = 0; i < m_TextColumns.GetSize(); i++) { delete m_TextColumns.GetAt(i); } } void CTextPage::ProcessObject(CPDF_PageObject* pObject) { if (pObject->m_Type != PDFPAGE_TEXT) { return; } CPDF_TextObject* pText = (CPDF_TextObject*)pObject; CPDF_Font* pFont = pText->m_TextState.GetFont(); int count = pText->CountItems(); FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2); pText->CalcCharPos(pPosArray); FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); FX_FLOAT spacew = 0; if (space_charcode != -1) { spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; } if (spacew == 0) { spacew = fontsize_h / 4; } if (pText->m_TextState.GetBaselineAngle() != 0) { int cc = 0; CFX_Matrix matrix; pText->GetTextMatrix(&matrix); for (int i = 0; i < pText->m_nChars; i++) { FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes : pText->m_pCharCodes[i]; if (charcode == (FX_DWORD)-1) { continue; } FX_RECT char_box; pFont->GetCharBBox(charcode, char_box); FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000; FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000; FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000; FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000; cc++; FX_FLOAT char_origx, char_origy; matrix.Transform(char_left, 0, char_origx, char_origy); matrix.TransformRect(char_left, char_right, char_top, char_bottom); CFX_ByteString str; pFont->AppendChar(str, charcode); InsertTextBox(NULL, char_origy, char_left, char_right, char_top, char_bottom, spacew, fontsize_v, str, pFont); } FX_Free(pPosArray); return; } FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); for (int ii = 0; ii < count * 2; ii++) { pPosArray[ii] *= ratio_h; } FX_FLOAT baseline = pText->m_PosY; CTextBaseLine* pBaseLine = NULL; FX_FLOAT topy = pText->m_Top; FX_FLOAT bottomy = pText->m_Bottom; FX_FLOAT leftx = pText->m_Left; int cc = 0; CFX_ByteString segment; int space_count = 0; FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; for (int i = 0; i < pText->m_nChars; i++) { FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes : pText->m_pCharCodes[i]; if (charcode == (FX_DWORD)-1) { continue; } FX_FLOAT char_left = pPosArray[cc * 2]; FX_FLOAT char_right = pPosArray[cc * 2 + 1]; cc++; if (char_left < last_left || (char_left - last_right) > spacew / 2) { pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, topy, bottomy, spacew, fontsize_v, segment, pFont); segment_left = char_left; segment = ""; } if (space_count > 1) { pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, topy, bottomy, spacew, fontsize_v, segment, pFont); segment = ""; } else if (space_count == 1) { pFont->AppendChar(segment, ' '); } if (segment.GetLength() == 0) { segment_left = char_left; } segment_right = char_right; pFont->AppendChar(segment, charcode); space_count = 0; last_left = char_left; last_right = char_right; } if (segment.GetLength()) pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, topy, bottomy, spacew, fontsize_v, segment, pFont); FX_Free(pPosArray); } CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, CFX_ByteString& str, CPDF_Font* pFont) { if (str.GetLength() == 0) { return NULL; } if (!pBaseLine) { int i; for (i = 0; i < m_BaseLines.GetSize(); i++) { CTextBaseLine* pExistLine = m_BaseLines.GetAt(i); if (pExistLine->m_BaseLine == basey) { pBaseLine = pExistLine; break; } if (pExistLine->m_BaseLine < basey) { break; } } if (!pBaseLine) { pBaseLine = new CTextBaseLine; pBaseLine->m_BaseLine = basey; m_BaseLines.InsertAt(i, pBaseLine); } } CFX_WideString text; const FX_CHAR* pStr = str; int len = str.GetLength(), offset = 0; while (offset < len) { FX_DWORD ch = pFont->GetNextChar(pStr, len, offset); CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); if (unicode_str.IsEmpty()) { text += (FX_WCHAR)ch; } else { text += unicode_str; } } pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text); return pBaseLine; } void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) { FX_FLOAT lastheight = -1; FX_FLOAT lastbaseline = -1; FX_FLOAT MinLeftX = 1000000; FX_FLOAT MaxRightX = 0; int i; for (i = 0; i < m_BaseLines.GetSize(); i++) { CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); FX_FLOAT leftx, rightx; if (pBaseLine->GetWidth(leftx, rightx)) { if (leftx < MinLeftX) { MinLeftX = leftx; } if (rightx > MaxRightX) { MaxRightX = rightx; } } } for (i = 0; i < m_BaseLines.GetSize(); i++) { m_BaseLines.GetAt(i)->MergeBoxes(); } for (i = 1; i < m_BaseLines.GetSize(); i++) { CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1); if (pBaseLine->CanMerge(pPrevLine)) { pPrevLine->Merge(pBaseLine); delete pBaseLine; m_BaseLines.RemoveAt(i); i--; } } if (m_bAutoWidth) { int* widths = FX_Alloc(int, m_BaseLines.GetSize()); for (i = 0; i < m_BaseLines.GetSize(); i++) { widths[i] = 0; CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); int TotalChars = 0; FX_FLOAT TotalWidth = 0; int minchars; pBaseLine->CountChars(TotalChars, TotalWidth, minchars); if (TotalChars) { FX_FLOAT charwidth = TotalWidth / TotalChars; widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); } if (widths[i] > 1000) { widths[i] = 1000; } if (widths[i] < minchars) { widths[i] = minchars; } } int AvgWidth = 0, widthcount = 0; for (i = 0; i < m_BaseLines.GetSize(); i++) if (widths[i]) { AvgWidth += widths[i]; widthcount++; } AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); int MaxWidth = 0; for (i = 0; i < m_BaseLines.GetSize(); i++) if (MaxWidth < widths[i]) { MaxWidth = widths[i]; } if (MaxWidth > AvgWidth * 6 / 5) { MaxWidth = AvgWidth * 6 / 5; } FX_Free(widths); if (iMinWidth < MaxWidth) { iMinWidth = MaxWidth; } } for (i = 0; i < m_BaseLines.GetSize(); i++) { m_BaseLines.GetAt(i)->MergeBoxes(); } if (m_bKeepColumn) { FindColumns(); } for (i = 0; i < m_BaseLines.GetSize(); i++) { CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); if (lastheight >= 0) { FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { lines.Add(L""); } } lastheight = pBaseLine->m_MaxFontSizeV; lastbaseline = pBaseLine->m_BaseLine; CFX_WideString str; pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); lines.Add(str); } } void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) { wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); FX_WCHAR* pDst = NULL; FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); if (nCount < 1) { sDest += wChar; return; } pDst = new FX_WCHAR[nCount]; FX_Unicode_GetNormalization(wChar, pDst); for (int nIndex = 0; nIndex < nCount; nIndex++) { sDest += pDst[nIndex]; } delete[] pDst; } void NormalizeString(CFX_WideString& str) { if (str.GetLength() <= 0) { return; } CFX_WideString sBuffer; std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar); CFX_WordArray order; FX_BOOL bR2L = FALSE; int32_t start = 0, count = 0, i = 0; int nR2L = 0, nL2R = 0; for (i = 0; i < str.GetLength(); i++) { if (pBidiChar->AppendChar(str.GetAt(i))) { CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); order.Add(start); order.Add(count); order.Add(ret); if (!bR2L) { if (ret == CFX_BidiChar::RIGHT) { nR2L++; } else if (ret == CFX_BidiChar::LEFT) { nL2R++; } } } } if (pBidiChar->EndChar()) { CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); order.Add(start); order.Add(count); order.Add(ret); if (!bR2L) { if (ret == CFX_BidiChar::RIGHT) { nR2L++; } else if (ret == CFX_BidiChar::LEFT) { nL2R++; } } } if (nR2L > 0 && nR2L >= nL2R) { bR2L = TRUE; } if (bR2L) { int count = order.GetSize(); for (int j = count - 1; j > 0; j -= 3) { int ret = order.GetAt(j); int start = order.GetAt(j - 2); int count1 = order.GetAt(j - 1); if (ret == 2 || ret == 0) { for (int i = start + count1 - 1; i >= start; i--) { NormalizeCompositeChar(str[i], sBuffer); } } else { i = j; FX_BOOL bSymbol = FALSE; while (i > 0 && order.GetAt(i) != 2) { bSymbol = !order.GetAt(i); i -= 3; } int end = start + count1; int n = 0; if (bSymbol) { n = i + 6; } else { n = i + 3; } if (n >= j) { for (int m = start; m < end; m++) { sBuffer += str[m]; } } else { i = j; j = n; for (; n <= i; n += 3) { int start = order.GetAt(n - 2); int count1 = order.GetAt(n - 1); int end = start + count1; for (int m = start; m < end; m++) { sBuffer += str[m]; } } } } } } else { int count = order.GetSize(); FX_BOOL bL2R = FALSE; for (int j = 0; j < count; j += 3) { int ret = order.GetAt(j + 2); int start = order.GetAt(j); int count1 = order.GetAt(j + 1); if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) { int i = j + 3; while (bR2L && i < count) { if (order.GetAt(i + 2) == 1) { break; } else { i += 3; } } if (i == 3) { j = -3; bL2R = TRUE; continue; } int end = str.GetLength() - 1; if (i < count) { end = order.GetAt(i) - 1; } j = i - 3; for (int n = end; n >= start; n--) { NormalizeCompositeChar(str[i], sBuffer); } } else { int end = start + count1; for (int i = start; i < end; i++) { sBuffer += str[i]; } } } } str.Empty(); str += sBuffer; } static FX_BOOL IsNumber(CFX_WideString& str) { for (int i = 0; i < str.GetLength(); i++) { FX_WCHAR ch = str[i]; // TODO(dsinclair): --.+ +.-- should probably not be a number. if (!std::iswdigit(ch) && ch != '-' && ch != '+' && ch != '.' && ch != ' ') return FALSE; } return TRUE; } void CTextPage::FindColumns() { int i; for (i = 0; i < m_BaseLines.GetSize(); i++) { CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); CTextColumn* pColumn = FindColumn(pTextBox->m_Right); if (pColumn) { pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / (pColumn->m_Count + 1); pColumn->m_Count++; } else { pColumn = new CTextColumn; pColumn->m_Count = 1; pColumn->m_AvgPos = pTextBox->m_Right; pColumn->m_TextPos = -1; m_TextColumns.Add(pColumn); } } } int mincount = m_BaseLines.GetSize() / 4; for (i = 0; i < m_TextColumns.GetSize(); i++) { CTextColumn* pTextColumn = m_TextColumns.GetAt(i); if (pTextColumn->m_Count >= mincount) { continue; } delete pTextColumn; m_TextColumns.RemoveAt(i); i--; } for (i = 0; i < m_BaseLines.GetSize(); i++) { CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); if (IsNumber(pTextBox->m_Text)) { pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); } } } } CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) { for (int i = 0; i < m_TextColumns.GetSize(); i++) { CTextColumn* pColumn = m_TextColumns.GetAt(i); if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { return pColumn; } } return NULL; } void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {} CTextBaseLine::CTextBaseLine() { m_Top = -100000; m_Bottom = 100000; m_MaxFontSizeV = 0; } CTextBaseLine::~CTextBaseLine() { for (int i = 0; i < m_TextList.GetSize(); i++) { delete m_TextList.GetAt(i); } } void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text) { if (m_Top < topy) { m_Top = topy; } if (m_Bottom > bottomy) { m_Bottom = bottomy; } if (m_MaxFontSizeV < fontsize_v) { m_MaxFontSizeV = fontsize_v; } int i; for (i = 0; i < m_TextList.GetSize(); i++) { CTextBox* pText = m_TextList.GetAt(i); if (pText->m_Left > leftx) { break; } } CTextBox* pText = new CTextBox; pText->m_Text = text; pText->m_Left = leftx; pText->m_Right = rightx; pText->m_Top = topy; pText->m_Bottom = bottomy; pText->m_SpaceWidth = spacew; pText->m_FontSizeV = fontsize_v; pText->m_pColumn = NULL; m_TextList.InsertAt(i, pText); } FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2, FX_FLOAT& interlow, FX_FLOAT& interhigh); FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) { FX_FLOAT inter_top, inter_bottom; if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, inter_bottom, inter_top)) { return FALSE; } FX_FLOAT inter_h = inter_top - inter_bottom; if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { return FALSE; } FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); for (int i = 0; i < m_TextList.GetSize(); i++) { CTextBox* pText = m_TextList.GetAt(i); for (int j = 0; j < pOther->m_TextList.GetSize(); j++) { CTextBox* pOtherText = pOther->m_TextList.GetAt(j); FX_FLOAT inter_left, inter_right; if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) { continue; } FX_FLOAT inter_w = inter_right - inter_left; if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) { continue; } if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { return FALSE; } } } return TRUE; } void CTextBaseLine::Merge(CTextBaseLine* pOther) { for (int i = 0; i < pOther->m_TextList.GetSize(); i++) { CTextBox* pText = pOther->m_TextList.GetAt(i); InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom, pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); } } FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) { int i; for (i = 0; i < m_TextList.GetSize(); i++) { CTextBox* pText = m_TextList.GetAt(i); if (pText->m_Text != L" ") { break; } } if (i == m_TextList.GetSize()) { return FALSE; } CTextBox* pText = m_TextList.GetAt(i); leftx = pText->m_Left; for (i = m_TextList.GetSize() - 1; i >= 0; i--) { CTextBox* pText = m_TextList.GetAt(i); if (pText->m_Text != L" ") { break; } } pText = m_TextList.GetAt(i); rightx = pText->m_Right; return TRUE; } void CTextBaseLine::MergeBoxes() { int i = 0; while (1) { if (i >= m_TextList.GetSize() - 1) { break; } CTextBox* pThisText = m_TextList.GetAt(i); CTextBox* pNextText = m_TextList.GetAt(i + 1); FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ? pNextText->m_SpaceWidth : pThisText->m_SpaceWidth; if (spacew > 0.0 && dx < spacew * 2) { pThisText->m_Right = pNextText->m_Right; if (dx > spacew * 1.5) { pThisText->m_Text += L" "; } else if (dx > spacew / 3) { pThisText->m_Text += L' '; } pThisText->m_Text += pNextText->m_Text; pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth; m_TextList.RemoveAt(i + 1); delete pNextText; } else { i++; } } } void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth, int iTextWidth) { int lastpos = -1; for (int i = 0; i < m_TextList.GetSize(); i++) { CTextBox* pText = m_TextList.GetAt(i); int xpos; if (pText->m_pColumn) { xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5); xpos -= pText->m_Text.GetLength(); } else { xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); } if (xpos <= lastpos) { xpos = lastpos + 1; } for (int j = lastpos + 1; j < xpos; j++) { str += ' '; } CFX_WideString sSrc(pText->m_Text); NormalizeString(sSrc); str += sSrc; str += ' '; lastpos = xpos + pText->m_Text.GetLength(); } } void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) { minchars = 0; for (int i = 0; i < m_TextList.GetSize(); i++) { CTextBox* pText = m_TextList.GetAt(i); if (pText->m_Right - pText->m_Left < 0.002) { continue; } count += pText->m_Text.GetLength(); width += pText->m_Right - pText->m_Left; minchars += pText->m_Text.GetLength() + 1; } } #define PI 3.1415926535897932384626433832795 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) { int total_count = 0, rotated_count[3] = {0, 0, 0}; FX_POSITION pos = page.GetFirstObjectPosition(); while (pos) { CPDF_PageObject* pObj = page.GetNextObject(pos); if (pObj->m_Type != PDFPAGE_TEXT) { continue; } total_count++; CPDF_TextObject* pText = (CPDF_TextObject*)pObj; FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); if (angle == 0.0) { continue; } int degree = (int)(angle * 180 / PI + 0.5); if (degree % 90) { continue; } if (degree < 0) { degree += 360; } int index = degree / 90 % 3 - 1; if (index < 0) { continue; } rotated_count[index]++; } if (total_count == 0) { return; } CFX_Matrix matrix; if (rotated_count[0] > total_count * 2 / 3) { matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); } else if (rotated_count[1] > total_count * 2 / 3) { matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); } else if (rotated_count[2] > total_count * 2 / 3) { matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); } else { return; } page.Transform(matrix); page_bbox.Transform(&matrix); } void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, int iMinWidth, FX_DWORD flags) { lines.RemoveAll(); if (!pPage) { return; } CPDF_Page page; page.Load(pDoc, pPage); CPDF_ParseOptions options; options.m_bTextOnly = TRUE; options.m_bSeparateForm = FALSE; page.ParseContent(&options); CFX_FloatRect page_bbox = page.GetPageBBox(); if (flags & PDF2TXT_AUTO_ROTATE) { CheckRotate(page, page_bbox); } CTextPage texts; texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; texts.m_bBreakSpace = TRUE; FX_POSITION pos = page.GetFirstObjectPosition(); while (pos) { CPDF_PageObject* pObject = page.GetNextObject(pos); if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top); if (!page_bbox.Contains(rect)) { continue; } } texts.ProcessObject(pObject); } texts.WriteOutput(lines, iMinWidth); } void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, int iMinWidth, FX_DWORD flags) { lines.RemoveAll(); CFX_WideStringArray wlines; PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); for (int i = 0; i < wlines.GetSize(); i++) { CFX_WideString wstr = wlines[i]; CFX_ByteString str; for (int c = 0; c < wstr.GetLength(); c++) { str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); } lines.Add(str); } } void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags) { buffer.EstimateSize(0, 10240); CPDF_Page page; page.Load(pDoc, pPage); CPDF_ParseOptions options; options.m_bTextOnly = TRUE; options.m_bSeparateForm = FALSE; page.ParseContent(&options); GetTextStream_Unicode(buffer, &page, TRUE, NULL); }