This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4466 in repository https://gitbox.apache.org/repos/asf/tika.git
commit f7e4fcf86477e230ec90bff00914ef94a616d024 Author: tallison <[email protected]> AuthorDate: Tue Aug 19 09:57:59 2025 -0400 TIKA-4466 -- allow multiple values for many DublinCore values --- CHANGES.txt | 3 ++ .../java/org/apache/tika/metadata/DublinCore.java | 20 +++++------ .../apache/tika/parser/epub/EpubParserTest.java | 42 ++++++++++++++++++++++ .../org/apache/tika/parser/xml/DcXMLParser.java | 1 + .../apache/tika/xmp/convert/AbstractConverter.java | 2 +- .../java/org/apache/tika/xmp/TikaToXMPTest.java | 4 +-- 6 files changed, 59 insertions(+), 13 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 28aede848..d76a3e068 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,9 @@ Release 4.0.0-BETA1 - ??? * Fix concurrency bug in TikaToXMP (TIKA-4393) +Release 3.3.0 - ??? + + * Allow multiple values for many Dublin Core values (TIKA-4466). Release 3.2.1 - ??? diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java index a4e32cb8b..283080a0d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java @@ -57,7 +57,7 @@ public interface DublinCore { * the Digital Object Identifier (DOI) and the International Standard * Book Number (ISBN). */ - Property IDENTIFIER = Property.internalText( + Property IDENTIFIER = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); /** @@ -85,7 +85,7 @@ public interface DublinCore { * appropriate, named places or time periods be used in preference to * numeric identifiers such as sets of coordinates or date ranges. */ - Property COVERAGE = Property.internalText( + Property COVERAGE = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); /** @@ -118,7 +118,7 @@ public interface DublinCore { * a graphical representation of content or a free-text account of * the content. */ - Property DESCRIPTION = Property.internalText( + Property DESCRIPTION = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); /** @@ -128,7 +128,7 @@ public interface DublinCore { * tags with optional subtags. Examples include "en" or "eng" for English, * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. */ - Property LANGUAGE = Property.internalText( + Property LANGUAGE = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); /** @@ -136,7 +136,7 @@ public interface DublinCore { * a Publisher include a person, an organisation, or a service. Typically, * the name of a Publisher should be used to indicate the entity. */ - Property PUBLISHER = Property.internalText( + Property PUBLISHER = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); /** @@ -144,7 +144,7 @@ public interface DublinCore { * reference the resource by means of a string or number conforming to * a formal identification system. */ - Property RELATION = Property.internalText( + Property RELATION = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); /** @@ -156,7 +156,7 @@ public interface DublinCore { * is absent, no assumptions can be made about the status of these and * other rights with respect to the resource. */ - Property RIGHTS = Property.internalText( + Property RIGHTS = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); /** @@ -166,7 +166,7 @@ public interface DublinCore { * means of a string or number conforming to a formal identification * system. */ - Property SOURCE = Property.internalText( + Property SOURCE = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); /** @@ -183,7 +183,7 @@ public interface DublinCore { * A name given to the resource. Typically, a Title will be a name by * which the resource is formally known. */ - Property TITLE = Property.internalText( + Property TITLE = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); /** @@ -194,7 +194,7 @@ public interface DublinCore { * [DCMITYPE]). To describe the physical or digital manifestation of * the resource, use the Format element. */ - Property TYPE = Property.internalText( + Property TYPE = Property.internalTextBag( PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java index 57e6a9a3b..ae9305a49 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java @@ -18,9 +18,13 @@ package org.apache.tika.parser.epub; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.InputStream; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -29,6 +33,7 @@ import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Epub; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; @@ -131,4 +136,41 @@ public class EpubParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("cole-voyage-of-life.epub"); assertEquals("pre-paginated", metadataList.get(0).get(Epub.RENDITION_LAYOUT)); } + + @Test + public void testMultipleMetadataValues() throws Exception { + //TIKA_4466 + List<Metadata> metadataList = getRecursiveMetadata("testEPUB_multi-metadata-vals.epub"); + Set<String> publishers = Set.of("Standard Ebooks", "Guternberg"); + Set<String> titles = Set.of("The Inheritors", "An Extravagant Story", "The Inheritors: An Extravagant Story"); + Set<String> contributors = Set.of("The League of Moveable Type", "zikasak", "William Holyoake", "Clare Boothby", + "Graeme Mackreth", "Distributed Proofreaders", "Szymon Szott", "David Reimer"); + Set<String> creators = Set.of("Joseph Conrad", "Ford Madox Ford"); + Set<String> languages = Set.of("en-GB", "en-US"); + Set<String> descriptions = Set.of("A young writer dabbling in journalism meets a strange, otherworldly woman with long-term political goals.", + "additional description"); + Set<String> sources = Set.of("https://www.gutenberg.org/ebooks/14888", "https://archive.org/details/inheritorsanext01fordgoog/"); + Set<String> identifiers = Set.of("https://standardebooks.org/ebooks/joseph-conrad_ford-madox-ford/the-inheritors", + "isbn:0571225470"); + Set<String> subjects = Set.of("Science fiction"); + + Metadata m = metadataList.get(0); + assertEquals(publishers, set(m, TikaCoreProperties.PUBLISHER)); + assertEquals(titles, set(m, TikaCoreProperties.TITLE)); + assertEquals(contributors, set(m, TikaCoreProperties.CONTRIBUTOR)); + assertEquals(creators, set(m, TikaCoreProperties.CREATOR)); + assertEquals(languages, set(m, TikaCoreProperties.LANGUAGE)); + assertEquals(descriptions, set(m, TikaCoreProperties.DESCRIPTION)); + assertEquals(sources, set(m, TikaCoreProperties.SOURCE)); + assertEquals(identifiers, set(m, TikaCoreProperties.IDENTIFIER)); + assertEquals(subjects, set(m, TikaCoreProperties.SUBJECT)); + + assertEquals(2, m.getValues(TikaCoreProperties.RIGHTS).length); + assertTrue(m.get(TikaCoreProperties.RIGHTS).startsWith("The source text and artwork")); + assertEquals("test rights", m.getValues(TikaCoreProperties.RIGHTS)[1]); + } + + private Set<String> set(Metadata m, Property property) { + return new HashSet<>(Arrays.asList(m.getValues(property))); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java index 172bf3b8e..a3c50c6cb 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java @@ -54,6 +54,7 @@ public class DcXMLParser extends XMLParser { getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"), getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"), getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"), + getDublinCoreHandler(metadata, TikaCoreProperties.SOURCE, "source"), getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights")); } diff --git a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java index ed03e2a0e..f7072665d 100644 --- a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java +++ b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java @@ -79,7 +79,7 @@ public abstract class AbstractConverter implements ITikaToXMPConverter { registry.registerNamespace(namespace.uri, namespace.prefix); } catch (XMPException e) { throw new TikaException( - "Namespace needed by converter could not be registiered with XMPCore", e); + "Namespace needed by converter could not be registered with XMPCore", e); } } } diff --git a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java index da25605f7..427ed1989 100644 --- a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java +++ b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java @@ -130,7 +130,7 @@ public class TikaToXMPTest { // general metadata is converted // check simple property - XMPProperty prop = xmp.getProperty(XMPConst.NS_DC, "language"); + XMPProperty prop = xmp.getArrayItem(XMPConst.NS_DC, "language", 1); assertNotNull(prop); assertEquals("language", prop.getValue()); @@ -139,7 +139,7 @@ public class TikaToXMPTest { assertNotNull(prop); assertEquals("title", prop.getValue()); - // OOXML one is not, the namespace has also not been registiered as the converter has not + // OOXML one is not, the namespace has also not been registered as the converter has not // been used XMPMetaFactory.getSchemaRegistry() .registerNamespace(OfficeOpenXMLCore.NAMESPACE_URI, OfficeOpenXMLCore.PREFIX);
