PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?
Yes, this functionality is available in the PDF-Tools SDK.
Several functions are required to perform this operation. Firstly, a set of PXCp_ET_... functions is required to extract the text content and its positions. Secondly, a PXC_pAddLink function is required to add external links to the text once the positions are identified. (If the links are located in the same document, then the PXC_AddGotoAction function must be used for this purpose.)
The most complex part of the operation is the recognition and extraction of the text. This is because the text in PDF documents can take several different forms. Text-based elements may appear identical on screen but have different representations at the file level. For example, a single word on the screen can be represented in several parts at the file level, spaces may be omitted, and the word ordering may be nonsensical until it is presented on the screen.
Follow the steps below, for each document page, in order to enable this functionality:
1. Call PXCp_ET_AnalyzePageContent to detect text on the specified page.
2. Call PXCp_ET_GetElementCount to determine how many text elements are on the page.
3. Use the PXCp_ET_GetElement function to retrieve each element.
4. Analyze the page text in order to detect the required words. N.B. It may be necessary to recompose pages at this point.
5. Add links to the page as desired and then release the acquired data.
Sample code is detailed below that extracts specified text and places it into a new PDF file. (Please ignore the calls to the PXC_... functions. They are related to new PDF file creation.)
// Example shows how to extract all text from the document
// and save it to another document retaining formatting.
//
// Parameters:
//   hDoc             - handle of the source document the text is read from.
//   OtherPDFFileName - file name passed to PXC_WriteDocumentExW for the new document.
//
// Error handling:
//   `hr` carries only the results of calls whose failure aborts the whole
//   operation (font enumeration, page creation, page analysis). Best-effort
//   steps (crop box, rotation, individual text elements) use separate result
//   variables so that a non-fatal failure on the last page or element does
//   not prevent the new document from being written.
void ExtractTextToOtherPDFDocument(PDFDocument hDoc, LPCWSTR OtherPDFFileName)
{
    HRESULT hr = DS_OK;
    // Preparsing document
    hr = PXCp_ET_Prepare(hDoc);
    if (IS_DS_FAILED(hr))
    {
        // Report the error here (application-specific), then stop:
        // none of the PXCp_ET_... calls below are attempted after a failed prepare.
        // NOTE(review): assumes PXCp_ET_Finish is not required when
        // PXCp_ET_Prepare fails - confirm against the SDK manual.
        return;
    }
    DWORD fontCount = 0;
    DWORD* fontIDs = NULL;
    PXCDocument hDstDoc = NULL;
    do
    {
        // 1. Get all fonts from the doc
        hr = PXCp_ET_GetFontCount(hDoc, &fontCount);
        if (IS_DS_FAILED(hr))
            break;
        // 2. Create new doc
        hr = PXC_NewDocument(&hDstDoc, NULL, NULL);
        if (IS_DS_FAILED(hr))
            break;
        // fontIDs[i] receives the destination-document font ID for source font i
        fontIDs = new DWORD[fontCount];
        ::ZeroMemory(fontIDs, fontCount * sizeof(DWORD));
        // Reusable buffer for "font name + font style"; grown on demand
        LPWSTR fontName = NULL;
        DWORD curFontNameLen = 0;
        // if there is no font name, default 'Arial' will be used
        static LPCWSTR DefaultFontName = L"Arial";
        for (DWORD i = 0; i < fontCount; i++)
        {
            // Get font name length (including the null-terminator);
            // if it is equal to 1 then there is no font name
            DWORD nameLen = 0;
            hr = PXCp_ET_GetFontName(hDoc, i, NULL, &nameLen);
            if (IS_DS_FAILED(hr))
                break;
            // Check if the font has any name set
            BOOL bNoFontNameSet = nameLen <= 1;
            if (bNoFontNameSet)
                nameLen = static_cast<DWORD>(::lstrlenW(DefaultFontName)) + 1;
            // Get the length of the font style. The style is optional, so a
            // failure here is treated the same as "no style set".
            DWORD StyleLen = 0;
            HRESULT hrStyle = PXCp_ET_GetFontStyle(hDoc, i, NULL, &StyleLen);
            if (IS_DS_FAILED(hrStyle) || StyleLen <= 1)
                StyleLen = 0;
            // The style (if any) is appended right after the name
            DWORD bufLen = nameLen + StyleLen;
            // Grow the shared buffer if it is too small
            if (bufLen > curFontNameLen)
            {
                delete[] fontName;
                fontName = new WCHAR[bufLen];
                curFontNameLen = bufLen;
            }
            if (bNoFontNameSet)
            {
                // if there is no font name - copy default name
                // (wide-character copy: both buffers are WCHAR regardless of UNICODE)
                ::lstrcpyW(fontName, DefaultFontName);
            }
            else
            {
                // else acquire font name from the library
                DWORD tempBufLen = bufLen;
                hr = PXCp_ET_GetFontName(hDoc, i, fontName, &tempBufLen);
                if (IS_DS_FAILED(hr))
                    break;
            }
            if (StyleLen)
            {
                // if there is font style set - acquire it, writing it over
                // the name's null-terminator so both form one string
                LPWSTR styleDst = fontName + nameLen - 1;
                hrStyle = PXCp_ET_GetFontStyle(hDoc, i, styleDst, &StyleLen);
                if (IS_DS_FAILED(hrStyle))
                {
                    // keep the plain font name as a valid terminated string
                    *styleDst = 0;
                }
            }
            // add the font into library
            hr = PXC_AddFontW(hDstDoc, FW_NORMAL, FALSE, fontName, fontIDs + i);
            if (IS_DS_FAILED(hr))
                break;
        }
        // clean unnecessary buffer
        delete[] fontName;
        fontName = NULL;
        if (IS_DS_FAILED(hr))
        {
            break;
        }
        DWORD PageCnt = 0;
        hr = PXCp_GetPagesCount(hDoc, &PageCnt);
        if (IS_DS_FAILED(hr) || !PageCnt)
            break;
        // 3. for each page
        for (DWORD CurPage = 0; CurPage < PageCnt; CurPage++)
        {
            // create new page in the new document
            PXC_RectF rcMediaBox;
            PXC_RectF rcCropBox;
            LONG nAngle = 0;
            hr = PXCp_PageGetBox(hDoc, CurPage, PB_MediaBox, &rcMediaBox);
            if (IS_DS_FAILED(hr))
                break;
            // add the new page
            PXCPage hDstPage = NULL;
            hr = PXC_AddPage(hDstDoc, rcMediaBox.right - rcMediaBox.left, rcMediaBox.top - rcMediaBox.bottom, &hDstPage);
            if (IS_DS_FAILED(hr))
                break;
            // Crop box and rotation are copied on a best-effort basis
            HRESULT hrOpt = PXCp_PageGetBox(hDoc, CurPage, PB_CropBox, &rcCropBox);
            if (IS_DS_SUCCESSFUL(hrOpt))
            {
                hrOpt = PXC_SetPageBox(hDstPage, PB_CropBox, &rcCropBox);
            }
            hrOpt = PXCp_PageGetRotate(hDoc, CurPage, &nAngle);
            if (IS_DS_SUCCESSFUL(hrOpt) && nAngle)
            {
                hrOpt = PXC_SetPageRotation(hDstPage, nAngle);
            }
            // Text is positioned relative to its baseline
            PXC_TextOptions pto = { sizeof(PXC_TextOptions) };
            PXC_GetTextOptions(hDstPage, &pto);
            pto.nTextPosition = TextPosition_Baseline;
            PXC_SetTextOptions(hDstPage, &pto);
            // for each element
            hr = PXCp_ET_AnalyzePageContent(hDoc, CurPage);
            if (IS_DS_FAILED(hr))
                break;
            DWORD TextElCount = 0;
            hrOpt = PXCp_ET_GetElementCount(hDoc, &TextElCount);
            if (IS_DS_FAILED(hrOpt) || TextElCount == 0)
                continue;
            PXP_TextElement TextElement = {0};
            TextElement.cbSize = sizeof(PXP_TextElement);
            // Capacity (in characters) of the Characters/Offsets buffers
            DWORD CurCount = 0;
            PXC_PointF ptTextOrg = {0};
            // One-character, null-terminated output string
            WCHAR buf[2];
            buf[0] = buf[1] = 0;
            for (DWORD t = 0; t < TextElCount; t++)
            {
                // First call with an empty mask only queries the element size
                TextElement.Count = 0;
                TextElement.mask = 0;
                HRESULT hrElem = PXCp_ET_GetElement(hDoc, t, &TextElement, 0);
                if (IS_DS_FAILED(hrElem) || (LONG)TextElement.Count <= 0)
                    continue;
                TextElement.mask = PTEM_Text | PTEM_Offsets | PTEM_Matrix | PTEM_FontInfo | PTEM_TextParams;
                if (CurCount < TextElement.Count)
                {
                    // Buffers were allocated with new[], so release with delete[]
                    delete[] TextElement.Characters;
                    delete[] TextElement.Offsets;
                    TextElement.Characters = new WCHAR[TextElement.Count];
                    TextElement.Offsets = new double[TextElement.Count];
                    CurCount = TextElement.Count;
                }
                hrElem = PXCp_ET_GetElement(hDoc, t, &TextElement, GTEF_IgnorePageRotation);
                if (IS_DS_FAILED(hrElem))
                    continue;
                // Now add this text element into new PDF document
                hrElem = PXC_TCS_Transform(hDstPage, &TextElement.Matrix);
                // Skip elements that refer to a font outside the collected table
                if (fontCount <= TextElement.FontIndex)
                    continue;
                hrElem = PXC_SetCurrentFont(hDstPage, fontIDs[TextElement.FontIndex], TextElement.FontSize);
                hrElem = PXC_SetFillColor(hDstPage, TextElement.FillColor);
                hrElem = PXC_SetStrokeColor(hDstPage, TextElement.StrokeColor);
                hrElem = PXC_SetTextRMode(hDstPage, TextElement.RenderingMode, NULL);
                hrElem = PXC_SetTextScaling(hDstPage, TextElement.Th, NULL);
                hrElem = PXC_SetTextLeading(hDstPage, TextElement.Leading, NULL);
                hrElem = PXC_SetCharSpacing(hDstPage, TextElement.CharSpace, NULL);
                hrElem = PXC_SetWordSpacing(hDstPage, TextElement.WordSpace, NULL);
                // Output every character of the element (all Count of them)
                // at its own horizontal offset
                for (DWORD j = 0; j < TextElement.Count; j++)
                {
                    ptTextOrg.x = TextElement.Offsets[j];
                    buf[0] = TextElement.Characters[j];
                    hrElem = PXC_TextOutW(hDstPage, &ptTextOrg, buf, 1);
                }
            }
            // Release the per-page element buffers
            delete[] TextElement.Characters;
            delete[] TextElement.Offsets;
        }
        // Only fatal page-level failures reach this check
        if (IS_DS_FAILED(hr))
            break;
        hr = PXC_WriteDocumentExW(hDstDoc, OtherPDFFileName, -1, WEF_ShowSaveDialog | WEF_RunApp, NULL);
    } while (FALSE);
    // clear up
    if (hDstDoc)
    {
        PXC_ReleaseDocument(hDstDoc);
        hDstDoc = NULL;
    }
    delete[] fontIDs;
    fontIDs = NULL;
    PXCp_ET_Finish(hDoc);
}
You can contact us by phone, email or our social media accounts — we are here to assist you.