This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3ddfba054 TIKA-4446 -- use correct embedded path name field (#2267)
3ddfba054 is described below
commit 3ddfba054cb603179466f0be5ddb6f455cb79103
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jul 3 16:58:00 2025 -0400
TIKA-4446 -- use correct embedded path name field (#2267)
---
.../org/apache/tika/eval/app/ExtractComparer.java | 5 ++-
.../apache/tika/eval/app/SimpleComparerTest.java | 41 ++++++++++++++++++++++
.../src/test/resources/extracts/TIKA-4446-a.json | 1 +
.../src/test/resources/extracts/TIKA-4446-b.json | 1 +
4 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index 1a57ac9e8..6f2865bf0 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -404,7 +404,6 @@ public class ExtractComparer extends AbstractProfiler {
if (sharedDigestKey != null) {
//first try to find matching digests
- //this does not elegantly handle multiple matching digests
return findMatchingDigests(sharedDigestKey, handledB,
metadataListA.get(aIndex), metadataListB);
}
@@ -435,7 +434,7 @@ public class ExtractComparer extends AbstractProfiler {
if (digestA == null) {
return -1;
}
- String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ String resourceName = metadata.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
int cand = -1;
for (int i = 0; i < metadataListB.size(); i++) {
@@ -446,7 +445,7 @@ public class ExtractComparer extends AbstractProfiler {
String digestB = mB.get(sharedDigestKey);
if (digestA.equalsIgnoreCase(digestB)) {
cand = i;
- if (resourceName != null && resourceName.equals(mB.get(TikaCoreProperties.RESOURCE_NAME_KEY))) {
+ if (resourceName != null && resourceName.equals(mB.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH))) {
return i;
}
}
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
index 6b900bab3..721d106a2 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
@@ -344,6 +344,47 @@ public class SimpleComparerTest extends TikaTest {
}
}
+
+ @Test
+ public void testDigestMatching() throws Exception {
+ Path p1 = Paths.get(getResourceAsFile("/extracts/TIKA-4446-a.json").toURI());
+ Path p2 = Paths.get(getResourceAsFile("/extracts/TIKA-4446-b.json").toURI());
+
+ EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), p1);
+ EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file2.pdf.json"), p2);
+ comparer.compareFiles(fpsA, fpsB);
+ Map<Integer, String> mA = new HashMap<>();
+ Map<Integer, String> mB = new HashMap<>();
+ loadEmbeddedNames(ExtractComparer.PROFILES_A, mA);
+ loadEmbeddedNames(ExtractComparer.PROFILES_B, mB);
+ for (int i : mA.keySet()) {
+ String nA = mA.get(i);
+ String nB = mB.get(i);
+ assertEquals(nA, nB);
+ }
+ for (int i : mB.keySet()) {
+ String nA = mA.get(i);
+ String nB = mB.get(i);
+ assertEquals(nA, nB);
+ }
+ }
+
+ private void loadEmbeddedNames(TableInfo t, Map<Integer, String> m) {
+ for (Map<Cols, String> row : WRITER.getTable(t)) {
+ SortedSet<Cols> keys = new TreeSet<>(row.keySet());
+ int id = -1;
+ String embeddedName = "";
+ for (Cols key : keys) {
+ if (key.name().equals("ID")) {
+ id = Integer.parseInt(row.get(key));
+ } else if (key.name().equals("EMBEDDED_FILE_PATH")) {
+ embeddedName = row.get(key);
+ }
+ }
+ m.put(id, embeddedName);
+ }
+ }
+
@Test
@Disabled("useful for testing 2 files not in test set")
public void oneOff() throws Exception {
diff --git a/tika-eval/tika-eval-app/src/test/resources/extracts/TIKA-4446-a.json b/tika-eval/tika-eval-app/src/test/resources/extracts/TIKA-4446-a.json
new file mode 100644
index 000000000..69b6e0762
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/extracts/TIKA-4446-a.json
@@ -0,0 +1 @@
+[{"msoffice:excel:has-hidden-rows":"true","X-TIKA:Parsed-By-Full-Set":["org.apache.tika.parser.CompositeParser","org.apache.tika.parser.microsoft.OfficeParser","org.apache.tika.parser.DefaultParser","org.apache.tika.parser.image.ImageParser","org.apache.tika.parser.microsoft.EMFParser","org.apache.tika.parser.image.JpegParser","org.apache.tika.parser.microsoft.WMFParser","org.apache.tika.parser.EmptyParser","org.apache.tika.parser.csv.TextAndCSVParser"],"X-TIKA:content_handler":"ToTextCo
[...]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/extracts/TIKA-4446-b.json b/tika-eval/tika-eval-app/src/test/resources/extracts/TIKA-4446-b.json
new file mode 100644
index 000000000..1c6771f7b
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/test/resources/extracts/TIKA-4446-b.json
@@ -0,0 +1 @@
+[{"msoffice:excel:has-hidden-rows":"true","X-TIKA:Parsed-By-Full-Set":["org.apache.tika.parser.CompositeParser","org.apache.tika.parser.microsoft.OfficeParser","org.apache.tika.parser.DefaultParser","org.apache.tika.parser.image.ImageParser","org.apache.tika.parser.microsoft.EMFParser","org.apache.tika.parser.image.JpegParser","org.apache.tika.parser.microsoft.WMFParser","org.apache.tika.parser.EmptyParser","org.apache.tika.parser.csv.TextAndCSVParser"],"X-TIKA:content_handler":"ToTextCo
[...]
\ No newline at end of file