Dear popplers,
maybe it is of no great relevance, but I prepared a patch (attached) to save
space in files written by 'saveCompleteRewrite' in case the input uses
'Cross-Reference Streams'
and 'Object Streams'.
In the current state every object from the input is read into the xref-table
and independently saved into the output file. This applies to object streams,
too. When each object is saved conventionally and a standard xref section is
written to the new file, there is no need to write the object streams and
cross-reference streams to it.
The same applies to the linearization dictionary, which is invalidated by a
rewrite.
The attached patch introduces two helpers in Linearization.cc/Linearization.h
and XRef.h to get the object number of the linearization dict and the
xref-stream flag, respectively. It then changes 'saveCompleteRewrite' to not
write the content of object streams, cross-reference streams and the
linearization dictionary into the output, but to mark the corresponding
objects as free.
The patched poppler works fine for my limited test cases.
I want to ask if someone more knowledgeable than me
could check for possible implications.
Feel free to change anything. Any thoughts are welcome.
regards
Axel
>From bdc19cb3c6d4309b30fa4a9ed8eef5bcd2ce137d Mon Sep 17 00:00:00 2001
From: Axel Struebing <[email protected]>
Date: Wed, 19 Oct 2011 15:12:44 +0200
Subject: [PATCH] save space in written files (saveCompleteRewrite)
- introduce two helpers in Linearization.cc/Linearization.h and XRef.h
to get object number of linearization dict and xrefstream flag, respectively
- adapt saveCompleteRewrite to not write content of object streams, xref
streams and linearization dict
- write free entry to new xref for these objects
---
poppler/Linearization.cc | 4 ++++
poppler/Linearization.h | 3 ++-
poppler/PDFDoc.cc | 39 ++++++++++++++++++++++++++++++++++++---
poppler/XRef.h | 2 ++
4 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/poppler/Linearization.cc b/poppler/Linearization.cc
index 73dc5ad..51c534e 100644
--- a/poppler/Linearization.cc
+++ b/poppler/Linearization.cc
@@ -34,7 +34,11 @@ Linearization::Linearization (BaseStream *str)
parser->getObj(&linDict);
if (obj1.isInt() && obj2.isInt() && obj3.isCmd("obj") && linDict.isDict()) {
linDict.dictLookup("Linearized", &obj5);
+ linRef.num = obj1.getInt();
+ linRef.gen = obj2.getInt();
if (!(obj5.isNum() && obj5.getNum() > 0)) {
+ linRef.num = -1;
+ linRef.gen = -1;
linDict.free();
linDict.initNull();
}
diff --git a/poppler/Linearization.h b/poppler/Linearization.h
index 6728a75..55cd57b 100644
--- a/poppler/Linearization.h
+++ b/poppler/Linearization.h
@@ -35,11 +35,12 @@ public:
int getNumPages();
Guint getMainXRefEntriesOffset();
int getPageFirst();
+ Ref getLinRef() { return linRef; }
private:
Object linDict;
-
+ Ref linRef;
};
#endif
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 01d2759..68c0c1e 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -814,15 +814,50 @@ void PDFDoc::saveIncrementalUpdate (OutStream* outStr)
delete uxref;
}
+/*
+ we can save space in saveCompleteRewrite
+ - it writes every nonfree object to the new file including its
+ object number in a standard xref
+ - objects originating from object streams are expanded to normal objects
+ - input object/xref streams were still copied to the output file
+ - so we can save space not doing so
+ - further saving is not writing linearization dict
+*/
void PDFDoc::saveCompleteRewrite (OutStream* outStr)
{
- outStr->printf("%%PDF-%d.%d\r\n",pdfMajorVersion,pdfMinorVersion);
+ Ref linRef = getLinearization()->getLinRef();
+ GBool noObjstrm = xref->getxRefStream();//file used xref stream
+
+ writeHeader(outStr, pdfMajorVersion, pdfMinorVersion);
XRef *uxref = new XRef();
uxref->add(0, 65535, 0, gFalse);
for(int i=0; i<xref->getNumObjects(); i++) {
Object obj1;
Ref ref;
XRefEntryType type = xref->getEntry(i)->type;
+ // set entry to free if ObjStm or XRef
+ if ( noObjstrm && ( type == xrefEntryUncompressed || type == xrefEntryCompressed ) ) {
+ ref.num = i;
+ if ( type == xrefEntryUncompressed ) {
+ ref.gen = xref->getEntry(i)->gen;
+ } else {
+ ref.gen = 0; //compressed entries have gen == 0
+ }
+ xref->fetch(ref.num, ref.gen, &obj1);
+ if ( obj1.isStream("XRef") || obj1.isStream("ObjStm") ) {
+ // do not write content and add as free
+ uxref->add(ref.num, ref.gen, 0, gFalse);
+ obj1.free();
+ continue;
+ }
+ obj1.free();
+ }
+ // do not write linearization dict, add as free
+ if ( linRef.num == i ) {
+ uxref->add(ref.num, ref.gen, 0, gFalse);
+ continue;
+ }
+
if (type == xrefEntryFree) {
ref.num = i;
ref.gen = xref->getEntry(i)->gen;
@@ -851,9 +886,7 @@ void PDFDoc::saveCompleteRewrite (OutStream* outStr)
writeTrailer(uxrefOffset, uxref->getSize(), outStr, gFalse);
-
delete uxref;
-
}
void PDFDoc::writeDictionnary (Dict* dict, OutStream* outStr, XRef *xRef, Guint numOffset)
diff --git a/poppler/XRef.h b/poppler/XRef.h
index 8b77b6c..697d2c0 100644
--- a/poppler/XRef.h
+++ b/poppler/XRef.h
@@ -131,6 +131,8 @@ public:
Ref addIndirectObject (Object* o);
void add(int num, int gen, Guint offs, GBool used);
void writeToFile(OutStream* outStr, GBool writeAllEntries);
+ // is this a cross reference streams only file
+ GBool getxRefStream() { return xRefStream; }
private:
--
1.7.3.4
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler