From 40b9be137f276ab54598aad8726c982ed6d58075 Mon Sep 17 00:00:00 2001 From: Nils Maier Date: Sun, 27 May 2012 22:49:28 +0200 Subject: [PATCH 1/4] Decode XML metadata as UTF-8 XML uses UTF-8 by default, which needs to be decoded to a JavaScript string prior to feeding it to the DOMParser. In an ideal world, the XML would actually be analyzed and the specified charset would be used; however, that does not seem feasible unless JS engines get iconv bindings. Fixes GH-1692 --- src/obj.js | 7 ++++++- src/util.js | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/obj.js b/src/obj.js index 3432ac68..acc9e128 100644 --- a/src/obj.js +++ b/src/obj.js @@ -140,7 +140,12 @@ var Catalog = (function CatalogClosure() { if (isName(type) && isName(subtype) && type.name === 'Metadata' && subtype.name === 'XML') { - metadata = stringToPDFString(bytesToString(stream.getBytes())); + // XXX: This should examine the charset the XML document defines, + // however since there are currently no real means to decode + // arbitrary charsets, let's just hope that the author of the PDF + // was reasonable enough to stick with the XML default charset, + // which is UTF-8.
+ metadata = stringToUTF8String(bytesToString(stream.getBytes())); } } diff --git a/src/util.js b/src/util.js index 90e6cee5..fe5d895e 100644 --- a/src/util.js +++ b/src/util.js @@ -302,6 +302,10 @@ function stringToPDFString(str) { return str2; } +function stringToUTF8String(str) { + return decodeURIComponent(escape(str)); +} + function isBool(v) { return typeof v == 'boolean'; } From 413e5357b9c948a9a90a69150b49afaf2b258c04 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Sun, 27 May 2012 18:03:04 -0500 Subject: [PATCH 2/4] Suppress metadata decryption --- src/crypto.js | 4 +++- src/obj.js | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/crypto.js b/src/crypto.js index c86551f3..cd0cf74e 100644 --- a/src/crypto.js +++ b/src/crypto.js @@ -546,8 +546,10 @@ var CipherTransformFactory = (function CipherTransformFactoryClosure() { var userPassword = stringToBytes(dict.get('U')); var flags = dict.get('P'); var revision = dict.get('R'); - var encryptMetadata = + var encryptMetadata = algorithm == 4 && // meaningful when V is 4 dict.get('EncryptMetadata') !== false; // makes true as default value + this.encryptMetadata = encryptMetadata; + var fileIdBytes = stringToBytes(fileId); var passwordBytes; if (password) diff --git a/src/obj.js b/src/obj.js index acc9e128..3b7eb563 100644 --- a/src/obj.js +++ b/src/obj.js @@ -132,7 +132,14 @@ var Catalog = (function CatalogClosure() { Catalog.prototype = { get metadata() { - var stream = this.catDict.get('Metadata'); + var streamRef = this.catDict.getRaw('Metadata'); + if (!isRef(streamRef)) + return shadow(this, 'metadata', null); + + var encryptMetadata = !this.xref.encrypt ? 
false : + this.xref.encrypt.encryptMetadata; + + var stream = this.xref.fetch(streamRef, !encryptMetadata); var metadata; if (stream && isDict(stream.dict)) { var type = stream.dict.get('Type'); From 48811f362b1fe3035808720483507c896e39c477 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Sun, 27 May 2012 19:00:13 -0500 Subject: [PATCH 3/4] Skipping incorrectly encoded metadata --- src/obj.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/obj.js b/src/obj.js index 3b7eb563..43ec8472 100644 --- a/src/obj.js +++ b/src/obj.js @@ -152,7 +152,11 @@ var Catalog = (function CatalogClosure() { // arbitrary charsets, let's just hope that the author of the PDF // was reasonable enough to stick with the XML default charset, // which is UTF-8. - metadata = stringToUTF8String(bytesToString(stream.getBytes())); + try { + metadata = stringToUTF8String(bytesToString(stream.getBytes())); + } catch (e) { + log('Skipping invalid metadata.'); + } } } From 1fb02300a424d79802a2b3d479bda005c6210b31 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Tue, 29 May 2012 11:01:46 -0500 Subject: [PATCH 4/4] Removing log --- src/obj.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/obj.js b/src/obj.js index 43ec8472..c01ffab5 100644 --- a/src/obj.js +++ b/src/obj.js @@ -155,7 +155,7 @@ var Catalog = (function CatalogClosure() { try { metadata = stringToUTF8String(bytesToString(stream.getBytes())); } catch (e) { - log('Skipping invalid metadata.'); + info('Skipping invalid metadata.'); } } }