PDF-XChange Co Ltd :: Knowledge Base :: PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

KB103
Sep 01, 2022 10:44 AM

PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

Question:

PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

Answer:

Yes, this functionality is available in the PDF-Tools SDK.

Several functions are required to perform this operation. Firstly, a set of PXCp_ET_... functions is required to extract the text content and its positions. Secondly, a PXC_pAddLink function is required to add external links to the text, once the positions are identified. (If the links are located in the same document then the PXC_AddGotoAction function must be used for this purpose).

The most complex part of the operation is the recognition and extraction of the text. This is because the text in PDF documents can take several different forms. Text-based elements may appear identical on screen but have different representations at the file level. (For example, a single word on the screen can be represented in several parts at the file level, spaces may be omitted and the word ordering may be nonsensical until it is presented on the screen.)

Follow the steps below, for each document page, in order to enable this functionality:

1. Call PXCp_ET_AnalyzePageContent to detect text on the specified page.

2. Call PXCp_ET_GetElementCount to determine how many text elements are on the page.

3. Use the PXCp_ET_GetElement function to retrieve each element.

4. Analyze the page text in order to detect the required words. N.b. it may be necessary to recompose pages at this point.

5. Add links to the page as desired and then release the acquired data.

Sample code is detailed below that extracts all text from a document and places it into a new PDF file. (Please ignore the calls to PXC_... functions. They are related to new PDF file creation.)

// Example shows how to extract all text from the document
// and save it to another document retaining formatting.
//
// Parameters:
//   hDoc             - handle of the source document the text is read from.
//   OtherPDFFileName - file name passed to PXC_WriteDocumentExW for the new PDF.
//
// The function returns nothing. On a fatal error it releases everything it
// acquired and returns without writing the new document. Text elements that
// cannot be retrieved are skipped and do not abort the operation.
void ExtractTextToOtherPDFDocument(PDFDocument hDoc, LPCWSTR OtherPDFFileName)
{
    HRESULT hr = DS_OK;

    // Preparsing document
    hr = PXCp_ET_Prepare(hDoc);
    if (IS_DS_FAILED(hr))
    {
        // report error
        // NOTE(review): returns without calling PXCp_ET_Finish because the
        // preparation did not succeed - confirm this against the SDK docs.
        return;
    }

    DWORD fontCount = 0;
    DWORD* fontIDs = NULL;
    PXCDocument hDstDoc = NULL;

    // Single-pass loop: 'break' jumps straight to the common clean-up code.
    do
    {
        // 1. Get all fonts from the doc
        hr = PXCp_ET_GetFontCount(hDoc, &fontCount);
        if (IS_DS_FAILED(hr))
            break;

        // 2. Create new doc
        hr = PXC_NewDocument(&hDstDoc, NULL, NULL);
        if (IS_DS_FAILED(hr))
            break;

        // fontIDs[i] receives the ID, in the new document, of source font i
        fontIDs = new DWORD[fontCount];
        ::ZeroMemory(fontIDs, fontCount * sizeof(DWORD));

        // buffer for font name + font style; grown on demand and reused
        LPWSTR fontName = NULL;
        DWORD curFontNameLen = 0;

        for (DWORD i = 0; i < fontCount; i++)
        {
            DWORD bufLen = 0;

            // get font name length
            // if it is equal to 1 then there is no font name
            // "1" is null-terminator in this case
            hr = PXCp_ET_GetFontName(hDoc, i, NULL, &bufLen);
            if (IS_DS_FAILED(hr))
                break;

            // Check if the font has any name set
            BOOL bNoFontNameSet = bufLen <= 1;

            // if there is no font name default 'Arial' will be used
            static LPCWSTR DefaultFontName = L"Arial";
            if (bNoFontNameSet)
                bufLen = 6; // ::lstrlenW(DefaultFontName) + 1 for the null-terminator

            // Get the length of font style
            DWORD StyleLen = 0;
            hr = PXCp_ET_GetFontStyle(hDoc, i, NULL, &StyleLen);

            // Check if there is font style set. A failed query is treated the
            // same as "no style", so the style is never fetched into a buffer
            // that was not enlarged for it.
            if (IS_DS_FAILED(hr) || StyleLen <= 1)
                StyleLen = 0;

            // if there is font style set - then adjust the buffer length
            bufLen += StyleLen;

            // Check for necessary buffer
            if (bufLen > curFontNameLen)
            {
                delete[] fontName;
                fontName = new WCHAR[bufLen];
                curFontNameLen = bufLen;
            }

            if (bNoFontNameSet)
            {
                // if there is no font name - copy default name
                ::lstrcpy(fontName, DefaultFontName);
            }
            else
            {
                // else acquire font name from the library
                DWORD tempBufLen = bufLen;
                hr = PXCp_ET_GetFontName(hDoc, i, fontName, &tempBufLen);
                if (IS_DS_FAILED(hr))
                    break;
            }

            if (StyleLen)
            {
                // if there is font style set - acquire it; it is written
                // starting at the position of the name's null-terminator
                hr = PXCp_ET_GetFontStyle(hDoc, i, fontName + (bufLen - StyleLen) - 1, &StyleLen);
            }

            // add the font into library
            hr = PXC_AddFontW(hDstDoc, FW_NORMAL, FALSE, fontName, fontIDs + i);
            if (IS_DS_FAILED(hr))
                break;
        }

        // clean unnecessary buffer
        delete[] fontName;
        fontName = NULL;

        // a failure inside the font loop aborts the whole operation
        if (IS_DS_FAILED(hr))
            break;

        DWORD PageCnt = 0;
        hr = PXCp_GetPagesCount(hDoc, &PageCnt);
        if (IS_DS_FAILED(hr) || !PageCnt)
            break;

        // 3. for each page
        for (DWORD CurPage = 0; CurPage < PageCnt; CurPage++)
        {
            // create new page in the new document
            PXC_RectF rcMediaBox;
            PXC_RectF rcCropBox;
            LONG nAngle = 0;

            hr = PXCp_PageGetBox(hDoc, CurPage, PB_MediaBox, &rcMediaBox);
            if (IS_DS_FAILED(hr))
                break;

            // add to the new page
            PXCPage hDstPage = NULL;
            hr = PXC_AddPage(hDstDoc, rcMediaBox.right - rcMediaBox.left, rcMediaBox.top - rcMediaBox.bottom, &hDstPage);
            if (IS_DS_FAILED(hr))
                break;

            // crop box and rotation are copied only when they can be read;
            // failures here are not treated as fatal
            hr = PXCp_PageGetBox(hDoc, CurPage, PB_CropBox, &rcCropBox);
            if (IS_DS_SUCCESSFUL(hr))
            {
                hr = PXC_SetPageBox(hDstPage, PB_CropBox, &rcCropBox);
            }

            hr = PXCp_PageGetRotate(hDoc, CurPage, &nAngle);
            if (IS_DS_SUCCESSFUL(hr) && nAngle)
            {
                hr = PXC_SetPageRotation(hDstPage, nAngle);
            }

            PXC_TextOptions pto = { sizeof(PXC_TextOptions) };
            PXC_GetTextOptions(hDstPage, &pto);
            pto.nTextPosition = TextPosition_Baseline;
            PXC_SetTextOptions(hDstPage, &pto);

            // for each element
            hr = PXCp_ET_AnalyzePageContent(hDoc, CurPage);
            if (IS_DS_FAILED(hr))
                break;

            DWORD TextElCount = 0;
            hr = PXCp_ET_GetElementCount(hDoc, &TextElCount);
            if (IS_DS_FAILED(hr) || TextElCount == 0)
            {
                // nothing to copy on this page; do not let this code reach
                // the failure check that follows the page loop
                hr = DS_OK;
                continue;
            }

            PXP_TextElement TextElement = {0};
            TextElement.cbSize = sizeof(PXP_TextElement);

            // number of entries currently allocated in Characters / Offsets
            DWORD CurCount = 0;

            PXC_PointF ptTextOrg = {0};

            // one-character, null-terminated string used for the output
            WCHAR buf[2];
            buf[0] = buf[1] = 0;

            for (DWORD t = 0; t < TextElCount; t++)
            {
                // first call with an empty mask: only retrieves the element size
                TextElement.Count = 0;
                TextElement.mask = 0;
                hr = PXCp_ET_GetElement(hDoc, t, &TextElement, 0);
                if (IS_DS_FAILED(hr) || (LONG)TextElement.Count <= 0)
                    continue;

                TextElement.mask = PTEM_Text | PTEM_Offsets | PTEM_Matrix | PTEM_FontInfo | PTEM_TextParams;

                // grow the buffers only when this element needs more room;
                // they are reused by the following elements of the page
                if (CurCount < TextElement.Count)
                {
                    delete[] TextElement.Characters;
                    delete[] TextElement.Offsets;
                    TextElement.Characters = new WCHAR[TextElement.Count];
                    TextElement.Offsets = new double[TextElement.Count];
                    CurCount = TextElement.Count;
                }

                // second call: retrieves the element data itself
                hr = PXCp_ET_GetElement(hDoc, t, &TextElement, GTEF_IgnorePageRotation);
                if (IS_DS_FAILED(hr))
                    continue;

                // Now add this text element into new PDF document
                hr = PXC_TCS_Transform(hDstPage, &TextElement.Matrix);

                // skip elements that refer to a font that was not registered
                if (fontCount <= TextElement.FontIndex)
                    continue;

                hr = PXC_SetCurrentFont(hDstPage, fontIDs[TextElement.FontIndex], TextElement.FontSize);
                hr = PXC_SetFillColor(hDstPage, TextElement.FillColor);
                hr = PXC_SetStrokeColor(hDstPage, TextElement.StrokeColor);
                hr = PXC_SetTextRMode(hDstPage, TextElement.RenderingMode, NULL);
                hr = PXC_SetTextScaling(hDstPage, TextElement.Th, NULL);
                hr = PXC_SetTextLeading(hDstPage, TextElement.Leading, NULL);
                hr = PXC_SetCharSpacing(hDstPage, TextElement.CharSpace, NULL);
                hr = PXC_SetWordSpacing(hDstPage, TextElement.WordSpace, NULL);

                // output the element one character at a time at its own offset
                // NOTE(review): the loop stops at Count - 1, presumably because
                // the last entry is a terminator - confirm against the SDK docs.
                for (DWORD j = 0; j < TextElement.Count - 1; j++)
                {
                    ptTextOrg.x = TextElement.Offsets[j];
                    buf[0] = TextElement.Characters[j];
                    hr = PXC_TextOutW(hDstPage, &ptTextOrg, buf, 1);
                }
            }

            // release the element buffers once per page, after the last use
            delete[] TextElement.Characters;
            delete[] TextElement.Offsets;
            TextElement.Characters = NULL;
            TextElement.Offsets = NULL;

            // element-level failures are skipped on purpose and must not
            // prevent the document from being written
            hr = DS_OK;
        }

        // a fatal page-level failure left the page loop through 'break'
        if (IS_DS_FAILED(hr))
            break;

        hr = PXC_WriteDocumentExW(hDstDoc, OtherPDFFileName, -1, WEF_ShowSaveDialog | WEF_RunApp, NULL);
    } while (FALSE);

    // clear up
    if (hDstDoc)
    {
        PXC_ReleaseDocument(hDstDoc);
        hDstDoc = NULL;
    }

    if (fontIDs)
    {
        delete[] fontIDs;
        fontIDs = NULL;
    }

    PXCp_ET_Finish(hDoc);
}

Was this article helpful?

Yes No Somewhat

Knowledgebase

PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

Question:

Answer:

Error

More Like This

Need more information? Get in touch.

Knowledgebase

PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

Question:

Answer:

Error

More Like This

We serve cookies.

Need more information? Get in touch.