On Monday, August 20, 2012 10:32:16 PM Fabio D'Urso wrote: > --- Patches 0015-0016 -- Special handling for XRef streams and ObjStm objects > Another category of objects in unencrypted form are XRef streams > (each with its own xref entry). Patch 0015 marks such entries as > Unencrypted, so that fetch can read them correctly. Note that XRef parsing > bypasses fetch, therefore it's unaffected by this patch. > Actually, storing XRef stream objects makes no sense in case of full > rewrite, because we always create a new XRef table. Therefore, copied XRef > stream objects from the original document are just a waste of space (and > also result in corrupt objects, see commit message in 0015). Therefore > the patch also sets a "DontRewrite" flag on those objects and skips them > in full-rewrite mode. > > Another category of space-wasting leaked objects are compressed object > streams, which are currently copied in fully-rewritten documents, even > though the objects they contain are individually written too. Patch 0016 > sets DontRewrite on ObjStm objects too. The patches I sent in the first message take the "reparse all XRef streams at save-time" approach proposed by Albert.
As we had planned on IRC, these are the alternative patches that make use of an auxiliary XRef std::vector<int> member. Now that they're done, I prefer the other ones :D but I'm posting these ones too so you can compare them. P.S.: Oops, I've sent a 19th patch that was not meant to be sent. It's a tool to print info about the XRef table and to dump the objects it contains in various formats. I didn't mean to publish it as it needs some refinements. If you find it useful, let me know. Thank you, Fabio
From 72b0c0facd654a5d3757d3935b1757b8ea203312 Mon Sep 17 00:00:00 2001 From: Fabio D'Urso <[email protected]> Date: Thu, 9 Aug 2012 20:08:44 +0200 Subject: [PATCH 15/19] Mark XRef streams as Unencrypted and DontRewrite - Unencrypted because they are stored in unencrypted form - DontRewrite because they must not be copied in full rewrite mode, because we always build a new XRef table, and existing XRef streams are not referenced any more (ie they become "leaked" objects). Furthermore, since readers know that XRef streams' objects are unencrypted from the fact that they are XRef streams, but these leaked objects are no longer referred as XRef streams, readers would think that they are regularly encrypted objects, resulting in currupt objects. --- poppler/PDFDoc.cc | 5 +++++ poppler/XRef.cc | 16 ++++++++++++++++ poppler/XRef.h | 4 +++- 3 files changed, 24 insertions(+), 1 deletions(-) diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc index 096d758..470083b 100644 --- a/poppler/PDFDoc.cc +++ b/poppler/PDFDoc.cc @@ -913,6 +913,11 @@ void PDFDoc::saveCompleteRewrite (OutStream* outStr) and we don't want the one with num=0 because it has already been added (gen = 65535)*/ if (ref.gen > 0 && ref.num > 0) uxref->add(ref.num, ref.gen, 0, gFalse); + } else if (xref->getEntry(i)->getFlag(XRefEntry::DontRewrite)) { + // This entry must not be written, put a free entry instead (with incremented gen) + ref.num = i; + ref.gen = xref->getEntry(i)->gen + 1; + uxref->add(ref.num, ref.gen, 0, gFalse); } else if (type == xrefEntryUncompressed){ ref.num = i; ref.gen = xref->getEntry(i)->gen; diff --git a/poppler/XRef.cc b/poppler/XRef.cc index 58278c6..0de395a 100644 --- a/poppler/XRef.cc +++ b/poppler/XRef.cc @@ -448,6 +448,7 @@ GBool XRef::readXRef(Guint *pos, std::vector<Guint> *followedXRefStm) { // parse an xref stream } else if (obj.isInt()) { + const int objNum = obj.getInt(); obj.free(); if (!parser->getObj(&obj, gTrue)->isInt()) { goto err1; @@ -463,6 +464,7 @@ GBool 
XRef::readXRef(Guint *pos, std::vector<Guint> *followedXRefStm) { if (trailerDict.isNone()) { xRefStream = gTrue; } + xrefStreamObjNums.push_back(objNum); more = readXRefStream(obj.getStream(), pos); obj.free(); @@ -1506,6 +1508,20 @@ void XRef::scanSpecialFlags() { } scannedSpecialFlags = gTrue; + // Forces all XRef entries to be parsed. As a side effect, xrefStreamObjNums + // is filled with all the XRef streams in the document that had not been + // parsed yet + for (int i = 0; i < size; ++i) { + getEntry(i, gFalse /* complainIfMissing */); + } + + // Mark XRef streams objects as Unencrypted and DontRewrite + for (size_t i = 0; i < xrefStreamObjNums.size(); ++i) { + const int objNum = xrefStreamObjNums.at(i); + getEntry(objNum)->setFlag(XRefEntry::Unencrypted, gTrue); + getEntry(objNum)->setFlag(XRefEntry::DontRewrite, gTrue); + } + // Mark objects referred from the Encrypt dict as Unencrypted Object obj; markUnencrypted(trailerDict.dictLookupNF("Encrypt", &obj)); diff --git a/poppler/XRef.h b/poppler/XRef.h index 3782f38..2f39601 100644 --- a/poppler/XRef.h +++ b/poppler/XRef.h @@ -67,7 +67,8 @@ struct XRefEntry { Updated, // Entry was modified // Special flags -- available only after xref->scanSpecialFlags is run - Unencrypted // Entry is stored in unencrypted form (meaningless in unencrypted documents) + Unencrypted, // Entry is stored in unencrypted form (meaningless in unencrypted documents) + DontRewrite // Entry must not be written back in case of full rewrite }; inline GBool getFlag(Flag flag) { @@ -203,6 +204,7 @@ private: Guint prevXRefOffset; // position of prev XRef section (= next to read) Guint mainXRefEntriesOffset; // offset of entries in main XRef table GBool xRefStream; // true if last XRef section is a stream + std::vector<int> xrefStreamObjNums; // num of all XRef stream objects that have been read GBool scannedSpecialFlags; // true if scanSpecialFlags has been called void init(); -- 1.7.6.5
From 375f629c7f559c44ff959faa1d6bd0b0acd3c52a Mon Sep 17 00:00:00 2001 From: Fabio D'Urso <[email protected]> Date: Wed, 15 Aug 2012 18:09:02 +0200 Subject: [PATCH 16/19] Mark object streams as DontRewrite So that they don't get copied in full rewrite mode, because they're not referenced from the XRef table we build, and we already individually write each object they contain. --- poppler/XRef.cc | 13 +++++++++++-- 1 files changed, 11 insertions(+), 2 deletions(-) diff --git a/poppler/XRef.cc b/poppler/XRef.cc index 0de395a..f22abe3 100644 --- a/poppler/XRef.cc +++ b/poppler/XRef.cc @@ -1510,9 +1510,18 @@ void XRef::scanSpecialFlags() { // Forces all XRef entries to be parsed. As a side effect, xrefStreamObjNums // is filled with all the XRef streams in the document that had not been - // parsed yet + // parsed yet. It also marks object streams as DontRewrite, because we write + // each object individually in full rewrite mode. for (int i = 0; i < size; ++i) { - getEntry(i, gFalse /* complainIfMissing */); + XRefEntry *e = getEntry(i, gFalse /* complainIfMissing */); + if (e->type == xrefEntryCompressed) { + const int objStmNum = e->offset; + if (unlikely(objStmNum < 0 || objStmNum >= size)) { + error(errSyntaxError, -1, "Compressed object offset out of xref bounds"); + } else { + getEntry(objStmNum)->setFlag(XRefEntry::DontRewrite, gTrue); + } + } } // Mark XRef streams objects as Unencrypted and DontRewrite -- 1.7.6.5
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
