src/docdirstream.py | 9 ++++++--- src/docrecord.py | 40 ++++++++++++++++++++++++++++++++-------- src/docstream.py | 9 +++++++-- src/globals.py | 7 +++++-- 4 files changed, 50 insertions(+), 15 deletions(-)
New commits: commit 806678b9a755eb7f304a855c1efe23e86901cea4 Author: Miklos Vajna <[email protected]> Date: Sun Aug 18 16:07:54 2013 +0200 dump grfhic diff --git a/src/docrecord.py b/src/docrecord.py index a0d6cd4..61be2b4 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -3405,6 +3405,28 @@ class PlcfGram(DOCDirStream, PLC): print '</aCP>' print '</plcfGram>' +class Grfhic(DOCDirStream): + """The grfhic structure is a set of HTML incompatibility flags that specify + the HTML incompatibilities of a list structure.""" + def __init__(self, parent): + DOCDirStream.__init__(self, parent.bytes) + self.pos = parent.pos + self.parent = parent + + def dump(self): + print '<grfhic type="grfhic">' + buf = self.readuInt8() + self.printAndSet("fhicChecked", self.getBit(buf, 0)) + self.printAndSet("fhicFormat", self.getBit(buf, 1)) + self.printAndSet("fhicListText", self.getBit(buf, 2)) + self.printAndSet("fhicPeriod", self.getBit(buf, 3)) + self.printAndSet("fhicLeft1", self.getBit(buf, 4)) + self.printAndSet("fhicListTab", self.getBit(buf, 5)) + self.printAndSet("unused", self.getBit(buf, 6)) + self.printAndSet("fhicBullet", self.getBit(buf, 7)) + self.parent.pos = self.pos + print '</grfhic>' + class LSTF(DOCDirStream): """The LSTF structure contains formatting properties that apply to an entire list.""" def __init__(self, plfLst, index): @@ -3426,7 +3448,7 @@ class LSTF(DOCDirStream): self.printAndSet("unused2", self.getBit(buf, 3)) self.printAndSet("fHybrid", self.getBit(buf, 4)) self.printAndSet("reserved1", (buf & 0xe0) >> 5) # 6..8th bits - self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic + Grfhic(self).dump() print '</lstf>' class LVLF(DOCDirStream): @@ -3455,7 +3477,7 @@ class LVLF(DOCDirStream): self.printAndSet("cbGrpprlChpx", self.readuInt8()) self.printAndSet("cbGrpprlPapx", self.readuInt8()) self.printAndSet("ilvlRestartLim", self.readuInt8()) - self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic + Grfhic(self).dump() print 
'</lvlf>' class LVL(DOCDirStream): @@ -3531,7 +3553,7 @@ class LFO(DOCDirStream): self.printAndSet("unused2", self.readuInt32()) self.printAndSet("clfolvl", self.readuInt8()) self.printAndSet("ibstFltAutoNum", self.readuInt8()) - self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic + Grfhic(self).dump() self.printAndSet("unused3", self.readuInt8()) print '</lfo>' commit 7ebcbd32d92043bcc998e6dca2067ea86d5d9934 Author: Miklos Vajna <[email protected]> Date: Sun Aug 18 15:58:25 2013 +0200 the spec says lcbPlcfLvcPre10 should be ignored as well diff --git a/src/docstream.py b/src/docstream.py index 22695d4..89cd0d7 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -520,7 +520,7 @@ class WordDocumentStream(DOCDirStream): value = self.readInt32() hasHandler = len(i) > 1 # the spec says these must be ignored - needsIgnoring = ["lcbStshfOrig", "lcbPlcfBteLvc"] + needsIgnoring = ["lcbStshfOrig", "lcbPlcfBteLvc", "lcbPlcfLvcPre10"] # a member needs handling if it defines the size of a struct and it's non-zero needsHandling = i[0].startswith("lcb") and value != 0 and (not i[0] in needsIgnoring) self.printAndSet(i[0], value, end = ((not hasHandler) and (not needsHandling)), offset = True) commit dea1e63ac3e62ca67192d39177c1de26fb15fd77 Author: Miklos Vajna <[email protected]> Date: Sun Aug 18 15:38:58 2013 +0200 WordDocumentStream: don't throw on invalid utf16 Just print a warning instead. ooo101417-1.doc triggers this. 
diff --git a/src/docdirstream.py b/src/docdirstream.py index 9888638..5cf1493 100644 --- a/src/docdirstream.py +++ b/src/docdirstream.py @@ -134,7 +134,7 @@ class DOCDirStream: else: break count += 1 - return globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes))) + return globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes)), xml = True) def getBit(self, byte, bitNumber): return (byte & (1 << bitNumber)) >> bitNumber diff --git a/src/docstream.py b/src/docstream.py index 398207c..22695d4 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -948,7 +948,12 @@ class WordDocumentStream(DOCDirStream): if compressed: return globals.encodeName(self.bytes[pos]) else: - return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True) + try: + return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True) + except UnicodeDecodeError: + reason = 'could not decode bytes in position %d-%d (%s-%s)' % (pos, pos+1, hex(ord(self.bytes[pos])), hex(ord(self.bytes[pos+1]))) + print '<todo what="WordDocumentStream::retrieveCP(): %s"/>' % reason + return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16', errors="replace"), lowOnly = True) def retrieveCPs(self, start, end): """Retrieves a range of characters.""" diff --git a/src/globals.py b/src/globals.py index 68aae93..31e995a 100644 --- a/src/globals.py +++ b/src/globals.py @@ -412,7 +412,7 @@ def getDouble (bytes): return struct.unpack('<d', text)[0] -def getUTF8FromUTF16 (bytes): +def getUTF8FromUTF16 (bytes, xml = False): # little endian utf-16 strings byteCount = len(bytes) loopCount = int(byteCount/2) @@ -431,7 +431,10 @@ def getUTF8FromUTF16 (bytes): try: text += unicode(code, 'utf-8') except UnicodeDecodeError: - text += "<%d invalid chars>"%len(code) + close = "" + if xml: + close="/" + text += "<%d invalid chars%s>"%(len(code), close) return text class StreamWrap(object): commit 4031eb1e626a9c66c0c306708b6ea96bbfb468f6 Author: Miklos Vajna <[email protected]> 
Date: Sun Aug 18 15:02:58 2013 +0200 TCGRF: tolerate undocumented TextFlow As seen in ooo100632-2.doc. diff --git a/src/docdirstream.py b/src/docdirstream.py index c3e4207..9888638 100644 --- a/src/docdirstream.py +++ b/src/docdirstream.py @@ -21,13 +21,16 @@ class DOCDirStream: self.mainStream = mainStream self.doc = doc - def printAndSet(self, key, value, hexdump = True, end = True, offset = False, silent = False, dict = None): + def printAndSet(self, key, value, hexdump = True, end = True, offset = False, silent = False, dict = None, default = None): setattr(self, key, value) if silent: return attrs = "" if dict: - attrs += ' name="%s"' % dict[value] + if value in dict or not default: + attrs += ' name="%s"' % dict[value] + else: + attrs += ' name="%s"' % default if hexdump: value = hex(value) if offset: diff --git a/src/docrecord.py b/src/docrecord.py index a9d99db..a0d6cd4 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -1072,7 +1072,7 @@ class TCGRF(DOCDirStream): print '<tcgrf type="TCGRF" offset="%d">' % self.pos buf = self.readuInt16() self.printAndSet("horzMerge", buf & 0x0003) # 1..2nd bits - self.printAndSet("textFlow", (buf & 0x001c) >> 2, dict = TextFlow) # 3..6th bits + self.printAndSet("textFlow", (buf & 0x001c) >> 2, dict = TextFlow, default = "todo") # 3..6th bits self.printAndSet("vertMerge", (buf & 0x0060) >> 6, dict = VerticalMergeFlag) # 7..8th bits self.printAndSet("vertAlign", (buf & 0x0180) >> 8, dict = VerticalAlign) # 9..10th bits self.printAndSet("ftsWidth", (buf & 0x0e00) >> 10, dict = Fts) # 11..12th bits commit e8fd0762c67fab15929288719015c9af5b57fb4c Author: Miklos Vajna <[email protected]> Date: Sun Aug 18 13:01:28 2013 +0200 PICFAndOfficeArtData: don't throw on MM_SHAPEFILE fdo54551-1.doc triggered this; with the change it only properly prints a TODO. 
diff --git a/src/docrecord.py b/src/docrecord.py index 66ff07f..a9d99db 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -1019,8 +1019,9 @@ class PICFAndOfficeArtData(DOCDirStream): assert self.pos == pos + 68 if picf.mfpf.mm == 0x0066: print '<todo what="PICFAndOfficeArtData::dump(): picf.mfpf.mm == MM_SHAPEFILE is unhandled"/>' - remaining = picf.lcb - (self.pos - pos) - msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream)) + else: + remaining = picf.lcb - (self.pos - pos) + msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream)) else: print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData or sprmCFOle2"/>' print '</PICFAndOfficeArtData>' commit 8f43c92520fb01066e7c287eaa2ba69b9ebe74fe Author: Miklos Vajna <[email protected]> Date: Sun Aug 18 11:51:35 2013 +0200 PICFAndOfficeArtData: blacklist sprmCFOle2 as well According to the spec, this should not occur with a 0x01 placeholder char, but fdo48097-1.doc has it. 
diff --git a/src/docrecord.py b/src/docrecord.py index e424e65..66ff07f 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -1009,8 +1009,9 @@ class PICFAndOfficeArtData(DOCDirStream): print '<PICFAndOfficeArtData>' found = False for prl in self.parent.parent.parent.prls: - if prl.sprm.sprm == 0x0806: # sprmCFData + if prl.sprm.sprm in (0x0806, 0x080a): # sprmCFData, sprmCFOle2 found = True + break if not found: pos = self.pos picf = PICF(self) @@ -1021,7 +1022,7 @@ class PICFAndOfficeArtData(DOCDirStream): remaining = picf.lcb - (self.pos - pos) msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream)) else: - print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData"/>' + print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData or sprmCFOle2"/>' print '</PICFAndOfficeArtData>' # The TextFlow enumeration specifies the rotation settings for a block of text and for the individual _______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
