[api-minor] Add a parameter to PDFPageProxy_getTextContent
that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes 6612.
This commit is contained in:
parent
c2dfe9e9a9
commit
6dfe53b976
12 changed files with 75 additions and 24 deletions
|
@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
|
|||
|
||||
handler.on('GetTextContent', function wphExtractText(data) {
|
||||
var pageIndex = data.pageIndex;
|
||||
var normalizeWhitespace = data.normalizeWhitespace;
|
||||
return pdfManager.getPage(pageIndex).then(function(page) {
|
||||
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
||||
startWorkerTask(task);
|
||||
var pageNum = pageIndex + 1;
|
||||
var start = Date.now();
|
||||
return page.extractTextContent(task).then(function(textContent) {
|
||||
return page.extractTextContent(task, normalizeWhitespace).then(
|
||||
function(textContent) {
|
||||
finishWorkerTask(task);
|
||||
info('text indexing: page=' + pageNum + ' - time=' +
|
||||
(Date.now() - start) + 'ms');
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue