This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4718 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 35a09870cce2883695545266631c41b740f64890 Author: tallison <[email protected]> AuthorDate: Thu Apr 9 13:16:38 2026 -0400 TIKA-4718 - check for empty comment string in .doc --- .../apache/tika/parser/microsoft/WordExtractor.java | 19 ++++++++++++++----- .../apache/tika/parser/microsoft/WordParserTest.java | 9 +++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java index a08eaa0fba..09a2b82767 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java @@ -67,6 +67,7 @@ import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; public class WordExtractor extends AbstractPOIFSExtractor { @@ -194,21 +195,29 @@ public class WordExtractor extends AbstractPOIFSExtractor { if (officeParserConfig.isIncludeShapeBasedContent()) { // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { - xhtml.element("p", paragraph); + if (!StringUtils.isBlank(paragraph)) { + xhtml.element("p", paragraph); + } } } for (String paragraph : wordExtractor.getFootnoteText()) { - xhtml.element("p", paragraph); + if (!StringUtils.isBlank(paragraph)) { + xhtml.element("p", paragraph); + } } for (String paragraph : wordExtractor.getCommentsText()) { - xhtml.element("p", paragraph); - hasComments = true; + if (!StringUtils.isBlank(paragraph)) { + xhtml.element("p", paragraph); + hasComments = true; + } } for (String paragraph : wordExtractor.getEndnoteText()) { - xhtml.element("p", paragraph); + if (!StringUtils.isBlank(paragraph)) { + xhtml.element("p", paragraph); + } } if (officeParserConfig.isIncludeHeadersAndFooters()) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java index 9710036fde..257ca9da05 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java @@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -682,4 +683,12 @@ public class WordParserTest extends TikaTest { assertEquals("true", m.get(Office.HAS_TRACK_CHANGES)); assertEquals("true", m.get(Office.HAS_COMMENTS)); } + + @Test + public void testNoFalsePositiveHasComments() throws Exception { + // TIKA-4718: POI returns empty strings from getCommentsText() for .doc files + // without real comments. Verify we don't falsely report HAS_COMMENTS. + Metadata m = getRecursiveMetadata("testWORD.doc").get(0); + assertNull(m.get(Office.HAS_COMMENTS), "testWORD.doc should not have comments"); + } }
