[api-minor] Add a parameter to PDFPageProxy_getTextContent that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)

This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces.

When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`).

Fixes 6612.
This commit is contained in:
Jonas Jenwald 2015-11-23 16:57:43 +01:00
parent c2dfe9e9a9
commit 6dfe53b976
12 changed files with 75 additions and 24 deletions

View file

@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
handler.on('GetTextContent', function wphExtractText(data) {
var pageIndex = data.pageIndex;
var normalizeWhitespace = data.normalizeWhitespace;
return pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task);
var pageNum = pageIndex + 1;
var start = Date.now();
return page.extractTextContent(task).then(function(textContent) {
return page.extractTextContent(task, normalizeWhitespace).then(
function(textContent) {
finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' +
(Date.now() - start) + 'ms');