This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 59c84e733c TIKA-4718 - check for empty comment string in .doc (#2757)
59c84e733c is described below
commit 59c84e733cea0a87385a9ec447d60305df4cbec8
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 9 21:15:06 2026 -0400
TIKA-4718 - check for empty comment string in .doc (#2757)
---
.../apache/tika/parser/microsoft/WordExtractor.java | 19 ++++++++++++++-----
.../apache/tika/parser/microsoft/WordParserTest.java | 9 +++++++++
2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index a08eaa0fba..09a2b82767 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -67,6 +67,7 @@ import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
public class WordExtractor extends AbstractPOIFSExtractor {
@@ -194,21 +195,29 @@ public class WordExtractor extends AbstractPOIFSExtractor {
if (officeParserConfig.isIncludeShapeBasedContent()) {
// Do everything else
for (String paragraph : wordExtractor.getMainTextboxText()) {
- xhtml.element("p", paragraph);
+ if (!StringUtils.isBlank(paragraph)) {
+ xhtml.element("p", paragraph);
+ }
}
}
for (String paragraph : wordExtractor.getFootnoteText()) {
- xhtml.element("p", paragraph);
+ if (!StringUtils.isBlank(paragraph)) {
+ xhtml.element("p", paragraph);
+ }
}
for (String paragraph : wordExtractor.getCommentsText()) {
- xhtml.element("p", paragraph);
- hasComments = true;
+ if (!StringUtils.isBlank(paragraph)) {
+ xhtml.element("p", paragraph);
+ hasComments = true;
+ }
}
for (String paragraph : wordExtractor.getEndnoteText()) {
- xhtml.element("p", paragraph);
+ if (!StringUtils.isBlank(paragraph)) {
+ xhtml.element("p", paragraph);
+ }
}
if (officeParserConfig.isIncludeHeadersAndFooters()) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 9710036fde..257ca9da05 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -682,4 +683,12 @@ public class WordParserTest extends TikaTest {
assertEquals("true", m.get(Office.HAS_TRACK_CHANGES));
assertEquals("true", m.get(Office.HAS_COMMENTS));
}
+
+ @Test
+ public void testNoFalsePositiveHasComments() throws Exception {
+ // TIKA-4718: POI returns empty strings from getCommentsText() for .doc files
+ // without real comments. Verify we don't falsely report HAS_COMMENTS.
+ Metadata m = getRecursiveMetadata("testWORD.doc").get(0);
+ assertNull(m.get(Office.HAS_COMMENTS), "testWORD.doc should not have comments");
+ }
}