src/docrecord.py | 7 +++---- src/docstream.py | 26 ++++++++++++++++++++++++++ test/doc/header.doc |binary test/doc/header.rtf | 4 ++++ test/doc/headerlo.doc |binary test/doc/test.py | 12 ++++++++++++ 6 files changed, 45 insertions(+), 4 deletions(-)
New commits: commit 0ccbaa56a706ec398e4cea1a656ced09829f1f8f Author: Miklos Vajna <[email protected]> Date: Sat May 4 16:02:55 2013 +0200 PlcfSed: use retrieveCPs diff --git a/src/docrecord.py b/src/docrecord.py index bcd156b..20c63b4 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -300,7 +300,6 @@ class PlcfSed(DOCDirStream, PLC): def dump(self): print '<plcfSed type="PlcfSed" offset="%d" size="%d bytes">' % (self.pos, self.size) - offset = self.mainStream.fcMin pos = self.pos for i in range(self.getElements()): # aCp @@ -313,7 +312,7 @@ class PlcfSed(DOCDirStream, PLC): aSed = Sed(self, self.getOffset(self.pos, i)) aSed.dump() - print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveText(offset + start, offset + end, logicalLength = True)) + print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveCPs(start, end)) print '</aCP>' print '</plcfSed>' commit d8165133205a3c3533211b8e4f0da14de450bdaa Author: Miklos Vajna <[email protected]> Date: Sat May 4 15:57:43 2013 +0200 doc: test header dumping of both MSO and LO-generated doc diff --git a/test/doc/header.doc b/test/doc/header.doc new file mode 100755 index 0000000..a351d72 Binary files /dev/null and b/test/doc/header.doc differ diff --git a/test/doc/header.rtf b/test/doc/header.rtf new file mode 100644 index 0000000..b3fd8b4 --- /dev/null +++ b/test/doc/header.rtf @@ -0,0 +1,4 @@ +{\rtf1 +{\header This is a header.} +Hello world!\par +} diff --git a/test/doc/headerlo.doc b/test/doc/headerlo.doc new file mode 100644 index 0000000..73ec689 Binary files /dev/null and b/test/doc/headerlo.doc differ diff --git a/test/doc/test.py b/test/doc/test.py index d02f680..e50e447 100755 --- a/test/doc/test.py +++ b/test/doc/test.py @@ -170,6 +170,18 @@ class Test(unittest.TestCase): levels = self.root.findall('stream[@name="WordDocument"]/fib/fibRgFcLcbBlob/lcbPlfLst/plfLst/lvl') self.assertEqual("•", levels[0].findall('xst/rgtchar')[0].attrib['value']) + def test_header(self): + 
self.dump('header') + + firstHeader = self.root.findall('stream[@name="WordDocument"]/fib/fibRgFcLcbBlob/lcbPlcfHdd/plcfHdd/aCP[@index="7"]') + self.assertEqual("This is a header.\\x0D\\x0D", firstHeader[0].findall('transformed')[0].attrib['value']) + + def test_headerlo(self): + self.dump('headerlo') + + firstHeader = self.root.findall('stream[@name="WordDocument"]/fib/fibRgFcLcbBlob/lcbPlcfHdd/plcfHdd/aCP[@index="7"]') + self.assertEqual("This is a header.\\x0D\\x0D", firstHeader[0].findall('transformed')[0].attrib['value']) + if __name__ == '__main__': unittest.main() commit 89cb847a68c1db6a1177d9a0673a68702de48da2 Author: Miklos Vajna <[email protected]> Date: Sat May 4 15:50:25 2013 +0200 doc: fix dumping header/footer text in LO-produced files diff --git a/src/docrecord.py b/src/docrecord.py index 62ca85f..bcd156b 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -816,7 +816,7 @@ class PlcfHdd(DOCDirStream, PLC): def dump(self): print '<plcfHdd type="PlcfHdd" offset="%d" size="%d bytes">' % (self.pos, self.size) - offset = self.mainStream.fcMin + self.mainStream.ccpText + offset = self.mainStream.ccpText + self.mainStream.ccpFtn pos = self.pos for i in range(self.getElements() - 1): start = self.getuInt32(pos = pos) @@ -837,7 +837,7 @@ class PlcfHdd(DOCDirStream, PLC): 11: "First page footer", } print '<aCP index="%d" contents="%s" start="%d" end="%d">' % (i, contentsMap[i], start, end) - print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveText(offset + start, offset + end)) + print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveCPs(offset + start, offset + end)) pos += 4 print '</aCP>' print '</plcfHdd>' diff --git a/src/docstream.py b/src/docstream.py index 7f3cc51..dc36baf 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -9,6 +9,7 @@ import ole import struct from docdirstream import DOCDirStream import docrecord +import globals class DOCFile: """Represents the whole word file - feed will all 
bytes.""" @@ -674,8 +675,33 @@ class WordDocumentStream(DOCDirStream): return index def retrieveText(self, start, end, logicalLength = False): + """Deprecated, use retrieveCPs instead.""" plcPcd = self.clx.pcdt.plcPcd idx = self.__findText(plcPcd, start) return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, logicalPositions = False, logicalLength = logicalLength) + def retrieveCP(self, cp): + """Implements 2.4.1 Retrieving Text.""" + plcPcd = self.clx.pcdt.plcPcd + for i in range(len(plcPcd.aCp)): + if plcPcd.aCp[i] <= cp: + index = i + break + aPcd = plcPcd.aPcd[index] + fcCompressed = aPcd.fc + if fcCompressed.fCompressed == 1: + return globals.encodeName(self.bytes[(fcCompressed.fc/2) + (cp - plcPcd.aCp[i])]) + else: + pos = fcCompressed.fc + 2 * (cp - plcPcd.aCp[i]) + return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True) + + def retrieveCPs(self, start, end): + """Retrieves a range of characters.""" + ret = [] + i = start + while i < end: + ret.append(self.retrieveCP(i)) + i += 1 + return "".join(ret) + # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
_______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
