This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a7e6ac7386 fix merge conflicts
a7e6ac7386 is described below
commit a7e6ac73862d0d8d719ace8b9d0cb6d39efb926b
Author: tallison <[email protected]>
AuthorDate: Tue Mar 24 14:51:55 2026 -0400
fix merge conflicts
---
.../tika/parser/image/AbstractImageParser.java | 23 ++++++++--
.../apache/tika/parser/ocr/ImagePreprocessor.java | 4 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 49 ++++++++++++++++------
.../tika/parser/ocr/TesseractOCRParserTest.java | 4 +-
4 files changed, 60 insertions(+), 20 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
index b97371bfe2..37236e7e7f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
@@ -98,13 +98,30 @@ public abstract class AbstractImageParser implements Parser
{
try (TikaInputStream pathStream = TikaInputStream.get(path)) {
//specify ocr content type
+ String originalParserOverride =
+
metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
+ String originalContentType =
metadata.get(Metadata.CONTENT_TYPE);
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
ocrMediaType.toString());
//need to use bodycontenthandler to filter out re-dumping of
metadata
//in xhtmlhandler
- ocrParser.parse(pathStream,
- new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), metadata,
- context);
+ try {
+ ocrParser.parse(pathStream,
+ new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), metadata,
+ context);
+ } finally {
+ if (originalParserOverride == null) {
+
metadata.remove(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
+ } else {
+
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+ originalParserOverride);
+ }
+ if (originalContentType == null) {
+ metadata.remove(Metadata.CONTENT_TYPE);
+ } else {
+ metadata.set(Metadata.CONTENT_TYPE,
originalContentType);
+ }
+ }
}
xhtml.endDocument();
} finally {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
index 0473f3bed3..8932af5ada 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
@@ -56,7 +56,7 @@ class ImagePreprocessor implements Serializable {
double angle = config.isApplyRotation() ? getAngle(sourceFile,
metadata) : 0d;
- if (config.isEnableImagePreprocessing() || config.isApplyRotation() &&
angle != 0) {
+ if (config.isEnableImagePreprocessing() || (config.isApplyRotation()
&& angle != 0)) {
// process the image - parameter values can be set in
TesseractOCRConfig.properties
CommandLine commandLine = new CommandLine(fullImageMagickPath);
if (SystemUtils.IS_OS_WINDOWS) {
@@ -98,12 +98,12 @@ class ImagePreprocessor implements Serializable {
DefaultExecutor executor = DefaultExecutor.builder().get();
try {
executor.execute(commandLine);
+ metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
LOG.warn("ImageMagick failed (commandline: " + commandLine +
")", e);
}
- metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 5c4ded595a..2639b457ae 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -231,8 +231,8 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
// Try running ImageMagick program from there, and see if it exists +
works
String[] checkCmd = {fullImageMagickPath};
- boolean hasImageMagick = ExternalParser.check(checkCmd);
- if (!hasImageMagick) {
+ this.hasImageMagick = ExternalParser.check(checkCmd);
+ if (!this.hasImageMagick) {
LOG.debug("ImageMagick does not appear to be installed " +
"(commandline: " +
fullImageMagickPath + ")");
}
@@ -260,6 +260,7 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
@Override
public void parse(TikaInputStream tis, ContentHandler handler, Metadata
metadata,
ParseContext parseContext) throws IOException,
SAXException, TikaException {
+ normalizeOCRMimeMetadata(metadata);
TesseractOCRConfig config = getConfig(parseContext);
@@ -272,10 +273,14 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
//if you haven't checked yet, and a per file config requests
imagemagick
//and if the default is not to use image processing
- if (!hasCheckedForImageMagick && config.isEnableImagePreprocessing()) {
+ if (!hasCheckedForImageMagick && (config.isEnableImagePreprocessing()
|| config.isApplyRotation())) {
hasImageMagick = hasImageMagick();
}
+ if (hasImageMagick && imagePreprocessor == null) {
+ imagePreprocessor = new
ImagePreprocessor(defaultConfig.getImageMagickPath() + getImageMagickProg());
+ }
+
try (TemporaryResources tmp = new TemporaryResources()) {
TikaInputStream tikaStream = TikaInputStream.get(tis, tmp,
metadata);
@@ -325,6 +330,25 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
return defaultConfig;
}
+ private void normalizeOCRMimeMetadata(Metadata metadata) {
+ String parserOverride =
metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
+ if (parserOverride != null) {
+ MediaType overrideType = MediaType.parse(parserOverride);
+ if (overrideType != null &&
overrideType.getSubtype().startsWith(OCR)) {
+
metadata.remove(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
+ }
+ }
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ if (contentType != null) {
+ MediaType parsedType = MediaType.parse(contentType);
+ if (parsedType != null && parsedType.getSubtype().startsWith(OCR))
{
+ metadata.set(Metadata.CONTENT_TYPE,
+ new MediaType(parsedType.getType(),
+
parsedType.getSubtype().substring(OCR.length())).toString());
+ }
+ }
+ }
+
private ContentHandler getContentHandler(boolean isInlineContent,
ContentHandler handler, Metadata
metadata, ParseContext parseContext) {
if (! isInlineContent) {
@@ -407,23 +431,24 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
}
private void extractOSD(InputStream is, Metadata metadata) throws
IOException {
- Matcher matcher = Pattern.compile("^([^:]+):\\s+(.*)").matcher("");
- try (BufferedReader reader = new BufferedReader(new
InputStreamReader(is,
- UTF_8))) {
+ Matcher matcher = Pattern.compile("^([^:]+):\\s*(.*)").matcher("");
+ try (BufferedReader reader = new BufferedReader(new
InputStreamReader(is, UTF_8))) {
String line = reader.readLine();
while (line != null) {
if (matcher.reset(line).find()) {
String k = matcher.group(1);
- String v = matcher.group(2);
+ String v = matcher.group(2).trim();
+
switch (k) {
case "Page number":
metadata.set(PSM0_PAGE_NUMBER,
Integer.parseInt(v));
break;
case "Orientation in degrees":
- metadata.set(PSM0_ORIENTATION,
Integer.parseInt(v));
- break;
- case "Rotate":
- metadata.set(PSM0_ROTATE, Integer.parseInt(v));
+ case "Rotate": // Handle Tesseract 5.x+
+ int rotationValue = Integer.parseInt(v);
+ metadata.set(PSM0_ORIENTATION, rotationValue);
+ metadata.set(PSM0_ROTATE, rotationValue);
+ metadata.add(IMAGE_ROTATION,
String.valueOf((double) rotationValue));
break;
case "Orientation confidence":
metadata.set(PSM0_ORIENTATION_CONFIDENCE,
Double.parseDouble(v));
@@ -434,8 +459,6 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
case "Script confidence":
metadata.set(PSM0_SCRIPT_CONFIDENCE,
Double.parseDouble(v));
break;
- default:
- LOG.warn("I regret I don't know how to parse {}
with value {}", k, v);
}
}
line = reader.readLine();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c4b89a0db1..b54eaccc17 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -133,7 +133,7 @@ public class TesseractOCRParserTest extends TikaTest {
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
Metadata metadata = getMetadata(MediaType.image("png"));
- String ocr = getText("testRotated+10.png", metadata, parseContext);
+ String ocr = getText("testRotated+10.png", p, metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
assertEquals(10.0,
Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)),
0.01);
@@ -151,7 +151,7 @@ public class TesseractOCRParserTest extends TikaTest {
parseContext.set(TesseractOCRConfig.class, config);
assumeTrue(canRun());
Metadata metadata = getMetadata(MediaType.image("png"));
- String ocr = getText("testRotated-10.png", metadata, parseContext);
+ String ocr = getText("testRotated-10.png", p, metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
assertEquals(-10.0,
Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)),
0.01);