This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5eb677b1e16e228df6e17ec139d5b6f3422e8a1f Author: tallison <[email protected]> AuthorDate: Wed Mar 18 08:30:36 2026 -0400 improve sax ooxml - comments and other components - WIP --- .../microsoft/ooxml/OOXMLInlineBodyPartMap.java | 4 ++ .../microsoft/ooxml/OOXMLPartContentCollector.java | 13 ++++- .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 35 ++++++++++++ .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 4 ++ .../ooxml/SXWPFWordExtractorDecorator.java | 64 +++++++++++++++++++--- .../microsoft/ooxml/XWPFBodyContentsHandler.java | 7 +++ .../xslf/XSLFEventBasedPowerPointExtractor.java | 5 ++ .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 5 ++ 8 files changed, 129 insertions(+), 8 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java index 3b13d910ea..0738e9939b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java @@ -75,4 +75,8 @@ class OOXMLInlineBodyPartMap { boolean hasComments() { return !comments.isEmpty(); } + + Iterable<Map.Entry<String, byte[]>> getCommentEntries() { + return comments.entrySet(); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java index e5dca8665e..48dcc692d5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java @@ -47,6 +47,7 @@ class OOXMLPartContentCollector extends DefaultHandler { "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; private final Set<String> wrapperElementNames; + private final Set<String> skipIds; private final Map<String, byte[]> contentMap = new HashMap<>(); private final Map<String, String> namespaceMappings = new HashMap<>(); @@ -59,7 +60,17 @@ class OOXMLPartContentCollector extends DefaultHandler { * (e.g., "footnote", "endnote", "comment") */ OOXMLPartContentCollector(Set<String> wrapperElementNames) { + this(wrapperElementNames, Set.of("0", "-1")); + } + + /** + * @param wrapperElementNames local names of wrapper elements to collect + * @param skipIds IDs to skip (e.g., "0", "-1" for footnote + * separator/continuation elements) + */ + OOXMLPartContentCollector(Set<String> wrapperElementNames, Set<String> skipIds) { this.wrapperElementNames = wrapperElementNames; + this.skipIds = skipIds; } @Override @@ -82,7 +93,7 @@ class OOXMLPartContentCollector extends DefaultHandler { if (wrapperElementNames.contains(localName)) { String id = atts.getValue(W_NS, "id"); - if (id != null && !"0".equals(id) && !"-1".equals(id)) { + if (id != null && !skipIds.contains(id)) { currentId = id; buffer = new ByteArrayOutputStream(); writeString(buildWrapperOpenTag()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index dab03ac30f..95082f8840 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -75,6 +75,8 @@ public class OOXMLTikaBodyPartHandler private OOXMLInlineBodyPartMap inlinePartMap = OOXMLInlineBodyPartMap.EMPTY; private ParseContext parseContext = null; + private final java.util.List<String> pendingCommentIds = new java.util.ArrayList<>(); + private final java.util.Set<String> emittedCommentIds = new java.util.HashSet<>(); public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) { this(xhtml, null); @@ -251,12 +253,38 @@ public class OOXMLTikaBodyPartHandler xhtml.characters(NEWLINE, 0, 1); } + // Emit any pending comment content after the paragraph closes + // (matching the DOM parser's behavior of appending comments after paragraphs) + emitPendingComments(); + if (tableCellDepth > 0) { pWithinCell++; } pDepth--; } + private void emitPendingComments() throws SAXException { + if (pendingCommentIds.isEmpty()) { + return; + } + for (String id : pendingCommentIds) { + byte[] xml = inlinePartMap.getComment(id); + if (xml != null) { + inlineNoteContent(xml, "comment"); + emittedCommentIds.add(id); + } + } + pendingCommentIds.clear(); + } + + /** + * Returns the set of comment IDs that were inlined during parsing. + * Used by the decorator to skip these when dumping remaining comments. + */ + public java.util.Set<String> getEmittedCommentIds() { + return emittedCommentIds; + } + @Override public void startTable() throws SAXException { @@ -353,6 +381,13 @@ public class OOXMLTikaBodyPartHandler } } + @Override + public void commentReference(String id) throws SAXException { + if (id != null) { + pendingCommentIds.add(id); + } + } + private void inlineNoteContent(byte[] xml, String cssClass) throws SAXException { // Use the inline part map's relationship map which includes relationships // from the footnote/endnote parts (needed for picture resolution) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index 8e952b9a55..4eb507c4fb 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -108,6 +108,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private final static String MOVE_FROM = "moveFrom"; private final static String MOVE_TO = "moveTo"; private final static String ENDNOTE_REFERENCE = "endnoteReference"; + private final static String COMMENT_REFERENCE = "commentReference"; private static final String TEXTBOX = "textbox"; private static final String TXBX = "txbx"; // DrawingML text box (wps:txbx in mc:Choice) private final static String FLD_CHAR = "fldChar"; @@ -372,6 +373,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (ENDNOTE_REFERENCE.equals(localName)) { String id = atts.getValue(W_NS, "id"); bodyContentsHandler.endnoteReference(id); + } else if (COMMENT_REFERENCE.equals(localName)) { + String id = atts.getValue(W_NS, "id"); + bodyContentsHandler.commentReference(id); } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart inV = true; } else if (RT.equals(localName)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index f8ed0c0f95..3dc076457b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -271,17 +271,20 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { // inlined at the point of reference in the main document OOXMLInlineBodyPartMap inlinePartMap = collectInlineParts(documentPart); - //main document + //main document — keep reference to body handler for emitted comment tracking + java.util.Set<String> emittedCommentIds = java.util.Collections.emptySet(); try { - handlePart(documentPart, styles, listManager, xhtml, inlinePartMap); + OOXMLTikaBodyPartHandler mainBodyHandler = + handlePart(documentPart, styles, listManager, xhtml, inlinePartMap); + emittedCommentIds = mainBodyHandler.getEmittedCommentIds(); } catch (ZipException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } - //dump remaining components at end (diagrams, charts, footers, comments) + //dump remaining components at end (diagrams, charts, footers) for (String rel : new String[]{AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA, XSSFRelation.CHART.getRelation(), - XWPFRelation.COMMENT.getRelation(), XWPFRelation.FOOTER.getRelation()}) { + XWPFRelation.FOOTER.getRelation()}) { //skip footers if we shouldn't extract them if (!config.isIncludeHeadersAndFooters() && rel.equals(XWPFRelation.FOOTER.getRelation())) { @@ -302,9 +305,43 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { ExceptionUtils.getStackTrace(e)); } } + //dump any comments that were NOT inlined via commentReference + handleUnreferencedComments(documentPart, styles, listManager, xhtml, + inlinePartMap, emittedCommentIds); } - private void handlePart(PackagePart packagePart, XWPFStylesShim styles, + private void handleUnreferencedComments(PackagePart documentPart, + XWPFStylesShim styles, XWPFListManager listManager, + XHTMLContentHandler xhtml, OOXMLInlineBodyPartMap inlinePartMap, + java.util.Set<String> emittedCommentIds) { + if (!inlinePartMap.hasComments()) { + return; + } + Map<String, String> linkedRelationships = inlinePartMap.getLinkedRelationships(); + for (Map.Entry<String, byte[]> entry : + inlinePartMap.getCommentEntries()) { + if (emittedCommentIds.contains(entry.getKey())) { + continue; + } + try { + xhtml.startElement("div", "class", "comment"); + XMLReaderUtils.parseSAX( + new java.io.ByteArrayInputStream(entry.getValue()), + new EmbeddedContentHandler( + new OOXMLWordAndPowerPointTextHandler( + new OOXMLTikaBodyPartHandler(xhtml), + linkedRelationships)), + context); + xhtml.endElement("div"); + } catch (TikaException | IOException | SAXException e) { + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + } + } + } + + private OOXMLTikaBodyPartHandler handlePart(PackagePart packagePart, + XWPFStylesShim styles, XWPFListManager listManager, XHTMLContentHandler xhtml, OOXMLInlineBodyPartMap inlinePartMap) throws IOException, SAXException { @@ -325,6 +362,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } + return bodyHandler; } private OOXMLInlineBodyPartMap collectInlineParts(PackagePart documentPart) { @@ -336,13 +374,25 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes"; Map<String, byte[]> endnoteMap = collectPartContent(documentPart, endnoteRel, Set.of("endnote"), allRelationships); + String commentsRel = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"; + Map<String, byte[]> commentMap = collectPartContent(documentPart, + commentsRel, Set.of("comment"), + allRelationships, Collections.emptySet()); return new OOXMLInlineBodyPartMap(footnoteMap, endnoteMap, - Collections.emptyMap(), allRelationships); + commentMap, allRelationships); } private Map<String, byte[]> collectPartContent(PackagePart documentPart, String relationshipType, Set<String> wrapperElements, Map<String, String> allRelationships) { + return collectPartContent(documentPart, relationshipType, wrapperElements, + allRelationships, Set.of("0", "-1")); + } + + private Map<String, byte[]> collectPartContent(PackagePart documentPart, + String relationshipType, Set<String> wrapperElements, + Map<String, String> allRelationships, Set<String> skipIds) { try { PackageRelationshipCollection prc = documentPart.getRelationshipsByType(relationshipType); @@ -350,7 +400,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { return Collections.emptyMap(); } OOXMLPartContentCollector collector = - new OOXMLPartContentCollector(wrapperElements); + new OOXMLPartContentCollector(wrapperElements, skipIds); for (int i = 0; i < prc.size(); i++) { PackagePart part = documentPart.getRelatedPart(prc.getRelationship(i)); // collect the part's linked relationships (for picture resolution) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java index a45a7d63f5..2bd479d0a0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java @@ -76,6 +76,13 @@ public interface XWPFBodyContentsHandler { void endnoteReference(String id) throws SAXException; + /** + * Called when a comment reference is encountered in the document body. + * + * @param id the comment ID + */ + void commentReference(String id) throws SAXException; + boolean isIncludeMoveFromText() throws SAXException; void embeddedOLERef(String refId) throws SAXException; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java index 07860f13e8..8d8db557b9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java @@ -191,6 +191,11 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { } + @Override + public void commentReference(String id) { + + } + @Override public boolean isIncludeMoveFromText() { return false; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 7c5dc990fc..9b30ce15b1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -354,6 +354,11 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { } + @Override + public void commentReference(String id) { + + } + @Override public boolean isIncludeMoveFromText() { return false;
