This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4466
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f7e4fcf86477e230ec90bff00914ef94a616d024
Author: tallison <[email protected]>
AuthorDate: Tue Aug 19 09:57:59 2025 -0400

    TIKA-4466 -- allow multiple values for many DublinCore values
---
 CHANGES.txt                                        |  3 ++
 .../java/org/apache/tika/metadata/DublinCore.java  | 20 +++++------
 .../apache/tika/parser/epub/EpubParserTest.java    | 42 ++++++++++++++++++++++
 .../org/apache/tika/parser/xml/DcXMLParser.java    |  1 +
 .../apache/tika/xmp/convert/AbstractConverter.java |  2 +-
 .../java/org/apache/tika/xmp/TikaToXMPTest.java    |  4 +--
 6 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 28aede848..d76a3e068 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,9 @@ Release 4.0.0-BETA1 - ???
 
    * Fix concurrency bug in TikaToXMP (TIKA-4393)
 
+Release 3.3.0 - ???
+
+  * Allow multiple values for many Dublin Core values (TIKA-4466).
 
 Release 3.2.1 - ???
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java 
b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
index a4e32cb8b..283080a0d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
@@ -57,7 +57,7 @@ public interface DublinCore {
      * the Digital Object Identifier (DOI) and the International Standard
      * Book Number (ISBN).
      */
-    Property IDENTIFIER = Property.internalText(
+    Property IDENTIFIER = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"identifier");
 
     /**
@@ -85,7 +85,7 @@ public interface DublinCore {
      * appropriate, named places or time periods be used in preference to
      * numeric identifiers such as sets of coordinates or date ranges.
      */
-    Property COVERAGE = Property.internalText(
+    Property COVERAGE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"coverage");
 
     /**
@@ -118,7 +118,7 @@ public interface DublinCore {
      * a graphical representation of content or a free-text account of
      * the content.
      */
-    Property DESCRIPTION = Property.internalText(
+    Property DESCRIPTION = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"description");
 
     /**
@@ -128,7 +128,7 @@ public interface DublinCore {
      * tags with optional subtags. Examples include "en" or "eng" for English,
      * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom.
      */
-    Property LANGUAGE = Property.internalText(
+    Property LANGUAGE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"language");
 
     /**
@@ -136,7 +136,7 @@ public interface DublinCore {
      * a Publisher include a person, an organisation, or a service. Typically,
      * the name of a Publisher should be used to indicate the entity.
      */
-    Property PUBLISHER = Property.internalText(
+    Property PUBLISHER = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"publisher");
 
     /**
@@ -144,7 +144,7 @@ public interface DublinCore {
      * reference the resource by means of a string or number conforming to
      * a formal identification system.
      */
-    Property RELATION = Property.internalText(
+    Property RELATION = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"relation");
 
     /**
@@ -156,7 +156,7 @@ public interface DublinCore {
      * is absent, no assumptions can be made about the status of these and
      * other rights with respect to the resource.
      */
-    Property RIGHTS = Property.internalText(
+    Property RIGHTS = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"rights");
 
     /**
@@ -166,7 +166,7 @@ public interface DublinCore {
      * means of a string or number conforming to a formal identification
      * system.
      */
-    Property SOURCE = Property.internalText(
+    Property SOURCE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"source");
 
     /**
@@ -183,7 +183,7 @@ public interface DublinCore {
      * A name given to the resource. Typically, a Title will be a name by
      * which the resource is formally known.
      */
-    Property TITLE = Property.internalText(
+    Property TITLE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"title");
 
     /**
@@ -194,7 +194,7 @@ public interface DublinCore {
      * [DCMITYPE]). To describe the physical or digital manifestation of
      * the resource, use the Format element.
      */
-    Property TYPE = Property.internalText(
+    Property TYPE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"type");
 
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index 57e6a9a3b..ae9305a49 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -18,9 +18,13 @@ package org.apache.tika.parser.epub;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
@@ -29,6 +33,7 @@ import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Epub;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -131,4 +136,41 @@ public class EpubParserTest extends TikaTest {
         List<Metadata> metadataList = 
getRecursiveMetadata("cole-voyage-of-life.epub");
         assertEquals("pre-paginated", 
metadataList.get(0).get(Epub.RENDITION_LAYOUT));
     }
+
+    @Test
+    public void testMultipleMetadataValues() throws Exception {
+        //TIKA_4466
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEPUB_multi-metadata-vals.epub");
+        Set<String> publishers = Set.of("Standard Ebooks", "Guternberg");
+        Set<String> titles = Set.of("The Inheritors", "An Extravagant Story", 
"The Inheritors: An Extravagant Story");
+        Set<String> contributors = Set.of("The League of Moveable Type", 
"zikasak", "William Holyoake", "Clare Boothby",
+                "Graeme Mackreth", "Distributed Proofreaders", "Szymon Szott", 
"David Reimer");
+        Set<String> creators = Set.of("Joseph Conrad", "Ford Madox Ford");
+        Set<String> languages = Set.of("en-GB", "en-US");
+        Set<String> descriptions = Set.of("A young writer dabbling in 
journalism meets a strange, otherworldly woman with long-term political goals.",
+                "additional description");
+        Set<String> sources = Set.of("https://www.gutenberg.org/ebooks/14888", 
"https://archive.org/details/inheritorsanext01fordgoog/");
+        Set<String> identifiers = 
Set.of("https://standardebooks.org/ebooks/joseph-conrad_ford-madox-ford/the-inheritors",
+                "isbn:0571225470");
+        Set<String> subjects = Set.of("Science fiction");
+
+        Metadata m = metadataList.get(0);
+        assertEquals(publishers, set(m, TikaCoreProperties.PUBLISHER));
+        assertEquals(titles, set(m, TikaCoreProperties.TITLE));
+        assertEquals(contributors, set(m, TikaCoreProperties.CONTRIBUTOR));
+        assertEquals(creators, set(m, TikaCoreProperties.CREATOR));
+        assertEquals(languages, set(m, TikaCoreProperties.LANGUAGE));
+        assertEquals(descriptions, set(m, TikaCoreProperties.DESCRIPTION));
+        assertEquals(sources, set(m, TikaCoreProperties.SOURCE));
+        assertEquals(identifiers, set(m, TikaCoreProperties.IDENTIFIER));
+        assertEquals(subjects, set(m, TikaCoreProperties.SUBJECT));
+
+        assertEquals(2, m.getValues(TikaCoreProperties.RIGHTS).length);
+        assertTrue(m.get(TikaCoreProperties.RIGHTS).startsWith("The source 
text and artwork"));
+        assertEquals("test rights", m.getValues(TikaCoreProperties.RIGHTS)[1]);
+    }
+
+    private Set<String> set(Metadata m, Property property) {
+        return new HashSet<>(Arrays.asList(m.getValues(property)));
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
index 172bf3b8e..a3c50c6cb 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -54,6 +54,7 @@ public class DcXMLParser extends XMLParser {
                 getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, 
"format"),
                 getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, 
"identifier"),
                 getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, 
"language"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.SOURCE, 
"source"),
                 getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, 
"rights"));
     }
 
diff --git 
a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java 
b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
index ed03e2a0e..f7072665d 100644
--- a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
+++ b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
@@ -79,7 +79,7 @@ public abstract class AbstractConverter implements 
ITikaToXMPConverter {
                 registry.registerNamespace(namespace.uri, namespace.prefix);
             } catch (XMPException e) {
                 throw new TikaException(
-                        "Namespace needed by converter could not be 
registiered with XMPCore", e);
+                        "Namespace needed by converter could not be registered 
with XMPCore", e);
             }
         }
     }
diff --git a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java 
b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
index da25605f7..427ed1989 100644
--- a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
+++ b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
@@ -130,7 +130,7 @@ public class TikaToXMPTest {
 
         // general metadata is converted
         // check simple property
-        XMPProperty prop = xmp.getProperty(XMPConst.NS_DC, "language");
+        XMPProperty prop = xmp.getArrayItem(XMPConst.NS_DC, "language", 1);
         assertNotNull(prop);
         assertEquals("language", prop.getValue());
 
@@ -139,7 +139,7 @@ public class TikaToXMPTest {
         assertNotNull(prop);
         assertEquals("title", prop.getValue());
 
-        // OOXML one is not, the namespace has also not been registiered as 
the converter has not
+        // OOXML one is not, the namespace has also not been registered as the 
converter has not
         // been used
         XMPMetaFactory.getSchemaRegistry()
                 .registerNamespace(OfficeOpenXMLCore.NAMESPACE_URI, 
OfficeOpenXMLCore.PREFIX);

Reply via email to