src/lib/VSDMetaData.cpp | 152 +++++++++++++++++++++++++++++++++++++---------- src/lib/VSDMetaData.h | 4 - src/lib/VSDParser.cpp | 26 +++++--- src/lib/VSDXMetaData.cpp | 21 ++++++ src/lib/VSDXParser.cpp | 38 ++++++++--- src/lib/VSDXParser.h | 2 src/lib/tokens.txt | 5 + src/test/Makefile.am | 2 src/test/data/dwg.vsd |binary src/test/data/dwg.vsdx |binary src/test/importtest.cpp | 21 ++++++ 11 files changed, 219 insertions(+), 52 deletions(-)
New commits: commit 3a8a08caebd5c8fbbc28ff34b0d7d0979662a56a Author: Mihai Varga <[email protected]> Date: Fri Feb 27 14:32:11 2015 +0200 More metadata extracted from VSD/VSDX files Template, language, company and category metadata are extracted from VSD/VSDX files. Company and category are set as custom properties. I've also added unit tests for those 4 new document properties. Change-Id: Ic14bfa11a2a8253c79dd4c4466afc7f6b2ce4ea9 Signed-off-by: Miklos Vajna <[email protected]> diff --git a/src/lib/VSDMetaData.cpp b/src/lib/VSDMetaData.cpp index 4235b56..6449384 100644 --- a/src/lib/VSDMetaData.cpp +++ b/src/lib/VSDMetaData.cpp @@ -9,6 +9,8 @@ #include "VSDMetaData.h" #include <cmath> +#include <cstring> +#include <string> #include <unicode/ucnv.h> #include <ctime> @@ -21,6 +23,60 @@ libvisio::VSDMetaData::~VSDMetaData() { } +enum PIDDSI +{ + PIDDSI_CODEPAGE = 0x00000001, + PIDDSI_CATEGORY = 0x00000002, + PIDDSI_PRESFORMAT = 0x00000003, + PIDDSI_BYTECOUNT = 0x00000004, + PIDDSI_LINECOUNT = 0x00000005, + PIDDSI_PARACOUNT = 0x00000006, + PIDDSI_SLIDECOUNT = 0x00000007, + PIDDSI_NOTECOUNT = 0x00000008, + PIDDSI_HIDDENCOUNT = 0x00000009, + PIDDSI_MMCLIPCOUNT = 0x0000000A, + PIDDSI_SCALE = 0x0000000B, + PIDDSI_HEADINGPAIR = 0x0000000C, + PIDDSI_DOCPARTS = 0x0000000D, + PIDDSI_MANAGER = 0x0000000E, + PIDDSI_COMPANY = 0x0000000F, + PIDDSI_LINKSDIRTY = 0x00000010, + PIDDSI_CCHWITHSPACES = 0x00000011, + PIDDSI_SHAREDDOC = 0x00000013, + PIDDSI_LINKBASE = 0x00000014, + PIDDSI_HLINKS = 0x00000015, + PIDDSI_HYPERLINKSCHANGED = 0x00000016, + PIDDSI_VERSION = 0x00000017, + PIDDSI_DIGSIG = 0x00000018, + PIDDSI_CONTENTTYPE = 0x0000001A, + PIDDSI_CONTENTSTATUS = 0x0000001B, + PIDDSI_LANGUAGE = 0x0000001C, + PIDDSI_DOCVERSION = 0x0000001D +}; + +enum PIDSI +{ + CODEPAGE_PROPERTY_IDENTIFIER = 0x00000001, + PIDSI_TITLE = 0x00000002, + PIDSI_SUBJECT = 0x00000003, + PIDSI_AUTHOR = 0x00000004, + PIDSI_KEYWORDS = 0x00000005, + PIDSI_COMMENTS = 0x00000006, + PIDSI_TEMPLATE = 0x00000007, + PIDSI_LASTAUTHOR = 0x00000008, + PIDSI_REVNUMBER = 0x00000009, + PIDSI_EDITTIME = 0x0000000A, + PIDSI_LASTPRINTED = 0x0000000B, + PIDSI_CREATE_DTM = 0x0000000C, + PIDSI_LASTSAVE_DTM = 0x0000000D, + PIDSI_PAGECOUNT = 0x0000000E, + PIDSI_WORDCOUNT = 0x0000000F, + PIDSI_CHARCOUNT = 0x00000010, + PIDSI_THUMBNAIL = 0x00000011, + PIDSI_APPNAME = 0x00000012, + PIDSI_DOC_SECURITY = 0x00000013 +}; + bool libvisio::VSDMetaData::parse(librevenge::RVNGInputStream *input) { if (!input) @@ -44,12 +100,24 @@ void libvisio::VSDMetaData::readPropertySetStream(librevenge::RVNGInputStream *i // NumPropertySets input->seek(4, librevenge::RVNG_SEEK_CUR); // FMTID0 - input->seek(16, librevenge::RVNG_SEEK_CUR); + //input->seek(16, librevenge::RVNG_SEEK_CUR); + uint32_t data1 = readU32(input); + uint16_t data2 = readU16(input); + uint16_t data3 = readU16(input); + uint8_t data4[8]; + for (int i = 0; i < 8; i++) + { + data4[i] = readU8(input); + } + char FMTID0[36]; + sprintf(FMTID0, "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", data1, data2, data3, + data4[0], data4[1], data4[2], data4[3], data4[4], data4[5], data4[6], data4[7]); + uint32_t offset0 = readU32(input); - readPropertySet(input, offset0); + readPropertySet(input, offset0, FMTID0); } -void libvisio::VSDMetaData::readPropertySet(librevenge::RVNGInputStream *input, uint32_t offset) +void libvisio::VSDMetaData::readPropertySet(librevenge::RVNGInputStream *input, uint32_t offset, char *FMTID) { input->seek(offset, librevenge::RVNG_SEEK_SET); @@ -62,17 +130,15 @@ void libvisio::VSDMetaData::readPropertySet(librevenge::RVNGInputStream *input, { if (i >= m_idsAndOffsets.size()) break; - readTypedPropertyValue(input, i, offset + m_idsAndOffsets[i].second); + readTypedPropertyValue(input, i, offset + m_idsAndOffsets[i].second, FMTID); } } -#define CODEPAGE_PROPERTY_IDENTIFIER 0x00000001 - uint32_t libvisio::VSDMetaData::getCodePage() { for (size_t i = 0; i < m_idsAndOffsets.size(); ++i) { - if (m_idsAndOffsets[i].first == CODEPAGE_PROPERTY_IDENTIFIER) + if (m_idsAndOffsets[i].first == PIDSI::CODEPAGE_PROPERTY_IDENTIFIER) { if (i >= m_typedPropertyValues.size()) break; @@ -93,13 +159,10 @@ void libvisio::VSDMetaData::readPropertyIdentifierAndOffset(librevenge::RVNGInpu #define VT_I2 0x0002 #define VT_LPSTR 0x001E -#define PIDSI_TITLE 0x00000002 -#define PIDSI_SUBJECT 0x00000003 -#define PIDSI_AUTHOR 0x00000004 -#define PIDSI_KEYWORDS 0x00000005 -#define PIDSI_COMMENTS 0x00000006 - -void libvisio::VSDMetaData::readTypedPropertyValue(librevenge::RVNGInputStream *input, uint32_t index, uint32_t offset) +void libvisio::VSDMetaData::readTypedPropertyValue(librevenge::RVNGInputStream *input, + uint32_t index, + uint32_t offset, + char *FMTID) { input->seek(offset, librevenge::RVNG_SEEK_SET); uint16_t type = readU16(input); @@ -119,24 +182,51 @@ void libvisio::VSDMetaData::readTypedPropertyValue(librevenge::RVNGInputStream * if (index >= m_idsAndOffsets.size()) return; - switch (m_idsAndOffsets[index].first) + if (!strcmp(FMTID, "f29f85e0-4ff9-1068-ab91-08002b27b3d9")) { - case PIDSI_TITLE: - m_metaData.insert("dc:title", string); - break; - case PIDSI_SUBJECT: - m_metaData.insert("dc:subject", string); - break; - case PIDSI_AUTHOR: - m_metaData.insert("meta:initial-creator", string); - m_metaData.insert("dc:creator", string); - break; - case PIDSI_KEYWORDS: - m_metaData.insert("meta:keyword", string); - break; - case PIDSI_COMMENTS: - m_metaData.insert("dc:description", string); - break; + switch (m_idsAndOffsets[index].first) + { + case PIDSI::PIDSI_TITLE: + m_metaData.insert("dc:title", string); + break; + case PIDSI::PIDSI_SUBJECT: + m_metaData.insert("dc:subject", string); + break; + case PIDSI::PIDSI_AUTHOR: + m_metaData.insert("meta:initial-creator", string); + m_metaData.insert("dc:creator", string); + break; + case PIDSI::PIDSI_KEYWORDS: + m_metaData.insert("meta:keyword", string); + break; + case PIDSI::PIDSI_COMMENTS: + m_metaData.insert("dc:description", string); + break; + case PIDSI::PIDSI_TEMPLATE: + std::string templateHref(string.cstr()); + size_t found = templateHref.find_last_of("/\\"); + if (found != std::string::npos) + string = librevenge::RVNGString(templateHref.substr(found+1).c_str()); + m_metaData.insert("librevenge:template", string); + break; + } + } + else if (!strcmp(FMTID,"d5cdd502-2e9c-101b-9397-08002b2cf9ae")) + { + switch (m_idsAndOffsets[index].first) + { + case PIDDSI::PIDDSI_CATEGORY: + m_metaData.insert("librevenge:category", string); + break; + case PIDDSI::PIDDSI_LINECOUNT: + // this should actually be PIDDSI::PIDDSI_COMPANY but this + // is what company is mapped to + m_metaData.insert("librevenge:company", string); + break; + case PIDDSI::PIDDSI_LANGUAGE: + m_metaData.insert("dc:language", string); + break; + } } } } diff --git a/src/lib/VSDMetaData.h b/src/lib/VSDMetaData.h index 581b0a2..dcb06ee 100644 --- a/src/lib/VSDMetaData.h +++ b/src/lib/VSDMetaData.h @@ -34,9 +34,9 @@ private: VSDMetaData &operator=(const VSDMetaData &); void readPropertySetStream(librevenge::RVNGInputStream *input); - void readPropertySet(librevenge::RVNGInputStream *input, uint32_t offset); + void readPropertySet(librevenge::RVNGInputStream *input, uint32_t offset, char *FMTID); void readPropertyIdentifierAndOffset(librevenge::RVNGInputStream *input); - void readTypedPropertyValue(librevenge::RVNGInputStream *input, uint32_t index, uint32_t offset); + void readTypedPropertyValue(librevenge::RVNGInputStream *input, uint32_t index, uint32_t offset, char *FMTID); librevenge::RVNGString readCodePageString(librevenge::RVNGInputStream *input); uint32_t getCodePage(); diff --git a/src/lib/VSDParser.cpp b/src/lib/VSDParser.cpp index 388e84d..5da88c7 100644 --- a/src/lib/VSDParser.cpp +++ b/src/lib/VSDParser.cpp @@ -154,18 +154,30 @@ bool libvisio::VSDParser::parseMetaData() m_container->seek(0, librevenge::RVNG_SEEK_SET); if (!m_container->isStructured()) return false; - librevenge::RVNGInputStream *stream = m_container->getSubStreamByName("\x05SummaryInformation"); - if (!stream) - return false; - + bool result = false; VSDMetaData metaData; - metaData.parse(stream); + + librevenge::RVNGInputStream *sumaryInfo = m_container->getSubStreamByName("\x05SummaryInformation"); + if (sumaryInfo) + { + result = true; + metaData.parse(sumaryInfo); + delete sumaryInfo; + } + + librevenge::RVNGInputStream *docSumaryInfo = m_container->getSubStreamByName("\005DocumentSummaryInformation"); + if (docSumaryInfo) + { + result = true; + metaData.parse(docSumaryInfo); + delete docSumaryInfo; + } + m_container->seek(0, librevenge::RVNG_SEEK_SET); metaData.parseTimes(m_container); m_collector->collectMetaData(metaData.getMetaData()); - delete stream; - return true; + return result; } bool libvisio::VSDParser::parseDocument(librevenge::RVNGInputStream *input, unsigned shift) diff --git a/src/lib/VSDXMetaData.cpp b/src/lib/VSDXMetaData.cpp index 4987cb2..94b1f99 100644 --- a/src/lib/VSDXMetaData.cpp +++ b/src/lib/VSDXMetaData.cpp @@ -10,6 +10,7 @@ #include "VSDXMetaData.h" #include "VSDXMLTokenMap.h" #include "libvisio_utils.h" +#include <string> libvisio::VSDXMetaData::VSDXMetaData() : m_metaData() @@ -82,14 +83,31 @@ void libvisio::VSDXMetaData::readCoreProperties(xmlTextReaderPtr reader) case XML_CP_LASTMODIFIEDBY: m_metaData.insert("dc:creator", readString(reader, XML_CP_LASTMODIFIEDBY)); break; + case XML_DC_LANGUAGE: + m_metaData.insert("dc:language", readString(reader, XML_DC_LANGUAGE)); + break; case XML_CP_CATEGORY: m_metaData.insert("librevenge:category", readString(reader, XML_CP_CATEGORY)); break; + case XML_COMPANY: + m_metaData.insert("librevenge:company", readString(reader, XML_COMPANY)); + break; + case XML_TEMPLATE: + { + librevenge::RVNGString templateHrefRVNG = readString(reader, XML_TEMPLATE); + std::string templateHref(templateHrefRVNG.cstr()); + size_t found = templateHref.find_last_of("/\\"); + if (found != std::string::npos) + templateHrefRVNG = librevenge::RVNGString(templateHref.substr(found+1).c_str()); + m_metaData.insert("librevenge:template", templateHrefRVNG); + break; + } default: break; } } - while ((XML_CP_COREPROPERTIES != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); + while ((XML_CP_COREPROPERTIES != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType || + XML_PROPERTIES != tokenId) && 1 == ret); } bool libvisio::VSDXMetaData::parse(librevenge::RVNGInputStream *input) @@ -110,6 +128,7 @@ bool libvisio::VSDXMetaData::parse(librevenge::RVNGInputStream *input) switch (tokenId) { case XML_CP_COREPROPERTIES: + case XML_PROPERTIES: readCoreProperties(reader); break; default: diff --git a/src/lib/VSDXParser.cpp b/src/lib/VSDXParser.cpp index d694650..f11d778 100644 --- a/src/lib/VSDXParser.cpp +++ b/src/lib/VSDXParser.cpp @@ -93,9 +93,7 @@ bool libvisio::VSDXParser::parseMain() VSDContentCollector contentCollector(m_painter, groupXFormsSequence, groupMembershipsSequence, documentPageShapeOrders, styles, m_stencils); m_collector = &contentCollector; - const libvisio::VSDXRelationship *metaDataRel = rootRels.getRelationshipByType("http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"); - if (metaDataRel) - parseMetaData(m_input, metaDataRel->getTarget().c_str()); + parseMetaData(m_input, rootRels); if (!parseDocument(m_input, rel->getTarget().c_str())) return false; @@ -280,23 +278,43 @@ bool libvisio::VSDXParser::parseTheme(librevenge::RVNGInputStream *input, const return true; } -bool libvisio::VSDXParser::parseMetaData(librevenge::RVNGInputStream *input, const char *name) +bool libvisio::VSDXParser::parseMetaData(librevenge::RVNGInputStream *input, libvisio::VSDXRelationships &rels) { if (!input) return false; input->seek(0, librevenge::RVNG_SEEK_SET); if (!input->isStructured()) return false; - librevenge::RVNGInputStream *stream = input->getSubStreamByName(name); - if (!stream) - return false; + + bool result = false; VSDXMetaData metaData; - metaData.parse(stream); + const libvisio::VSDXRelationship *coreProp = rels.getRelationshipByType("http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"); + if (coreProp) + { + librevenge::RVNGInputStream *stream = input->getSubStreamByName(coreProp->getTarget().c_str()); + if (stream) + { + result = true; + metaData.parse(stream); + delete stream; + } + } + + const libvisio::VSDXRelationship *extendedProp = rels.getRelationshipByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties"); + if (extendedProp) + { + librevenge::RVNGInputStream *stream = input->getSubStreamByName(extendedProp->getTarget().c_str()); + if (stream) + { + result = true; + metaData.parse(stream); + delete stream; + } + } m_collector->collectMetaData(metaData.getMetaData()); - delete stream; - return true; + return result; } void libvisio::VSDXParser::processXmlDocument(librevenge::RVNGInputStream *input, VSDXRelationships &rels) diff --git a/src/lib/VSDXParser.h b/src/lib/VSDXParser.h index 8566403..069204c 100644 --- a/src/lib/VSDXParser.h +++ b/src/lib/VSDXParser.h @@ -54,7 +54,7 @@ private: bool parsePages(librevenge::RVNGInputStream *input, const char *name); bool parsePage(librevenge::RVNGInputStream *input, const char *name); bool parseTheme(librevenge::RVNGInputStream *input, const char *name); - bool parseMetaData(librevenge::RVNGInputStream *input, const char *name); + bool parseMetaData(librevenge::RVNGInputStream *input, VSDXRelationships &rels); void processXmlDocument(librevenge::RVNGInputStream *input, VSDXRelationships &rels); void processXmlNode(xmlTextReaderPtr reader); diff --git a/src/lib/tokens.txt b/src/lib/tokens.txt index 6b01690..c165c0f 100644 --- a/src/lib/tokens.txt +++ b/src/lib/tokens.txt @@ -234,5 +234,10 @@ dc:title dcterms:created dcterms:modified dc:description +dc:template cp:lastModifiedBy cp:category +Company +Properties +Template +dc:language diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 9285cbf..619302a 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -23,6 +23,8 @@ EXTRA_DIST = \ data/fdo86664.vsdx \ data/fdo86729-ms1252.vsd \ data/fdo86729-utf8.vsd \ + data/dwg.vsd \ + data/dwg.vsdx \ $(test_SOURCES) TESTS = test diff --git a/src/test/data/dwg.vsd b/src/test/data/dwg.vsd new file mode 100644 index 0000000..bea1075 Binary files /dev/null and b/src/test/data/dwg.vsd differ diff --git a/src/test/data/dwg.vsdx b/src/test/data/dwg.vsdx new file mode 100644 index 0000000..6642f8c Binary files /dev/null and b/src/test/data/dwg.vsdx differ diff --git a/src/test/importtest.cpp b/src/test/importtest.cpp index 29b99fd..7aba7bc 100644 --- a/src/test/importtest.cpp +++ b/src/test/importtest.cpp @@ -124,11 +124,15 @@ class ImportTest : public CPPUNIT_NS::TestFixture CPPUNIT_TEST(testVsdxMetadataTitle); CPPUNIT_TEST(testVsdMetadataTitleMs1252); CPPUNIT_TEST(testVsdMetadataTitleUtf8); + CPPUNIT_TEST(testVsdUserDefinedMetadata); + CPPUNIT_TEST(testVsdxUserDefinedMetadata); CPPUNIT_TEST_SUITE_END(); void testVsdxMetadataTitle(); void testVsdMetadataTitleMs1252(); void testVsdMetadataTitleUtf8(); + void testVsdUserDefinedMetadata(); + void testVsdxUserDefinedMetadata(); xmlBufferPtr m_buffer; xmlDocPtr m_doc; @@ -203,6 +207,23 @@ void ImportTest::testVsdMetadataTitleUtf8() assertXPath(m_doc, "/document/setDocumentMetaData", "date", "2014-11-26T09:24:56Z"); } +void ImportTest::testVsdUserDefinedMetadata() +{ + m_doc = parse("dwg.vsd", m_buffer); + assertXPath(m_doc, "/document/setDocumentMetaData", "category", "Category test"); + assertXPath(m_doc, "/document/setDocumentMetaData", "company", "Company test"); + assertXPath(m_doc, "/document/setDocumentMetaData", "template", "BASICD_M.VSTX"); +} + +void ImportTest::testVsdxUserDefinedMetadata() +{ + m_doc = parse("dwg.vsdx", m_buffer); + assertXPath(m_doc, "/document/setDocumentMetaData", "category", "Category test"); + assertXPath(m_doc, "/document/setDocumentMetaData", "company", "Company test"); + assertXPath(m_doc, "/document/setDocumentMetaData", "language", "en-US"); + assertXPath(m_doc, "/document/setDocumentMetaData", "template", "BASICD_M.VSTX"); +} + CPPUNIT_TEST_SUITE_REGISTRATION(ImportTest); /* vim:set shiftwidth=2 softtabstop=2 expandtab: */ _______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
