This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new a7e6ac7386 fix merge conflicts
a7e6ac7386 is described below

commit a7e6ac73862d0d8d719ace8b9d0cb6d39efb926b
Author: tallison <[email protected]>
AuthorDate: Tue Mar 24 14:51:55 2026 -0400

    fix merge conflicts
---
 .../tika/parser/image/AbstractImageParser.java     | 23 ++++++++--
 .../apache/tika/parser/ocr/ImagePreprocessor.java  |  4 +-
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 49 ++++++++++++++++------
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  4 +-
 4 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
index b97371bfe2..37236e7e7f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
@@ -98,13 +98,30 @@ public abstract class AbstractImageParser implements Parser {
 
             try (TikaInputStream pathStream = TikaInputStream.get(path)) {
                 //specify ocr content type
+                String originalParserOverride =
+                        metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
+                String originalContentType = metadata.get(Metadata.CONTENT_TYPE);
                 metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
                         ocrMediaType.toString());
                 //need to use bodycontenthandler to filter out re-dumping of metadata
                 //in xhtmlhandler
-                ocrParser.parse(pathStream,
-                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata,
-                        context);
+                try {
+                    ocrParser.parse(pathStream,
+                            new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata,
+                            context);
+                } finally {
+                    if (originalParserOverride == null) {
+                        metadata.remove(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
+                    } else {
+                        metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+                                originalParserOverride);
+                    }
+                    if (originalContentType == null) {
+                        metadata.remove(Metadata.CONTENT_TYPE);
+                    } else {
+                        metadata.set(Metadata.CONTENT_TYPE, originalContentType);
+                    }
+                }
             }
             xhtml.endDocument();
         } finally {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
index 0473f3bed3..8932af5ada 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
@@ -56,7 +56,7 @@ class ImagePreprocessor implements Serializable {
 
         double angle = config.isApplyRotation() ? getAngle(sourceFile, metadata) : 0d;
 
-        if (config.isEnableImagePreprocessing() || config.isApplyRotation() && angle != 0) {
+        if (config.isEnableImagePreprocessing() || (config.isApplyRotation() && angle != 0)) {
             // process the image - parameter values can be set in TesseractOCRConfig.properties
             CommandLine commandLine = new CommandLine(fullImageMagickPath);
             if (SystemUtils.IS_OS_WINDOWS) {
@@ -98,12 +98,12 @@ class ImagePreprocessor implements Serializable {
             DefaultExecutor executor = DefaultExecutor.builder().get();
             try {
                 executor.execute(commandLine);
+                metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
             } catch (SecurityException e) {
                 throw e;
             } catch (Exception e) {
                 LOG.warn("ImageMagick failed (commandline: " + commandLine + ")", e);
             }
-            metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
         }
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 5c4ded595a..2639b457ae 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -231,8 +231,8 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
 
         // Try running ImageMagick program from there, and see if it exists + works
         String[] checkCmd = {fullImageMagickPath};
-        boolean hasImageMagick = ExternalParser.check(checkCmd);
-        if (!hasImageMagick) {
+        this.hasImageMagick = ExternalParser.check(checkCmd);
+        if (!this.hasImageMagick) {
             LOG.debug("ImageMagick does not appear to be installed " + "(commandline: " +
                     fullImageMagickPath + ")");
         }
@@ -260,6 +260,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
     @Override
     public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                       ParseContext parseContext) throws IOException, SAXException, TikaException {
+        normalizeOCRMimeMetadata(metadata);
 
         TesseractOCRConfig config = getConfig(parseContext);
 
@@ -272,10 +273,14 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
 
         //if you haven't checked yet, and a per file config requests imagemagick
         //and if the default is not to use image processing
-        if (!hasCheckedForImageMagick && config.isEnableImagePreprocessing()) {
+        if (!hasCheckedForImageMagick && (config.isEnableImagePreprocessing() || config.isApplyRotation())) {
             hasImageMagick = hasImageMagick();
         }
 
+        if (hasImageMagick && imagePreprocessor == null) {
+            imagePreprocessor = new ImagePreprocessor(defaultConfig.getImageMagickPath() + getImageMagickProg());
+        }
+
         try (TemporaryResources tmp = new TemporaryResources()) {
             TikaInputStream tikaStream = TikaInputStream.get(tis, tmp, metadata);
 
@@ -325,6 +330,25 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
         return defaultConfig;
     }
 
+    private void normalizeOCRMimeMetadata(Metadata metadata) {
+        String parserOverride = metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
+        if (parserOverride != null) {
+            MediaType overrideType = MediaType.parse(parserOverride);
+            if (overrideType != null && overrideType.getSubtype().startsWith(OCR)) {
+                metadata.remove(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
+            }
+        }
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (contentType != null) {
+            MediaType parsedType = MediaType.parse(contentType);
+            if (parsedType != null && parsedType.getSubtype().startsWith(OCR)) {
+                metadata.set(Metadata.CONTENT_TYPE,
+                        new MediaType(parsedType.getType(),
+                                parsedType.getSubtype().substring(OCR.length())).toString());
+            }
+        }
+    }
+
     private ContentHandler getContentHandler(boolean isInlineContent,
                                              ContentHandler handler, Metadata metadata, ParseContext parseContext) {
         if (! isInlineContent) {
@@ -407,23 +431,24 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
     }
 
     private void extractOSD(InputStream is, Metadata metadata) throws IOException {
-        Matcher matcher = Pattern.compile("^([^:]+):\\s+(.*)").matcher("");
-        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is,
-                UTF_8))) {
+        Matcher matcher = Pattern.compile("^([^:]+):\\s*(.*)").matcher("");
+        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8))) {
             String line = reader.readLine();
             while (line != null) {
                 if (matcher.reset(line).find()) {
                     String k = matcher.group(1);
-                    String v = matcher.group(2);
+                    String v = matcher.group(2).trim();
+
                     switch (k) {
                         case "Page number":
                             metadata.set(PSM0_PAGE_NUMBER, Integer.parseInt(v));
                             break;
                         case "Orientation in degrees":
-                            metadata.set(PSM0_ORIENTATION, Integer.parseInt(v));
-                            break;
-                        case "Rotate":
-                            metadata.set(PSM0_ROTATE, Integer.parseInt(v));
+                        case "Rotate": // Handle Tesseract 5.x+
+                            int rotationValue = Integer.parseInt(v);
+                            metadata.set(PSM0_ORIENTATION, rotationValue);
+                            metadata.set(PSM0_ROTATE, rotationValue);
+                            metadata.add(IMAGE_ROTATION, String.valueOf((double) rotationValue));
                             break;
                         case "Orientation confidence":
                             metadata.set(PSM0_ORIENTATION_CONFIDENCE, Double.parseDouble(v));
@@ -434,8 +459,6 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
                         case "Script confidence":
                             metadata.set(PSM0_SCRIPT_CONFIDENCE, Double.parseDouble(v));
                             break;
-                        default:
-                            LOG.warn("I regret I don't know how to parse {} with value {}", k, v);
                     }
                 }
                 line = reader.readLine();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c4b89a0db1..b54eaccc17 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -133,7 +133,7 @@ public class TesseractOCRParserTest extends TikaTest {
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
         Metadata metadata = getMetadata(MediaType.image("png"));
-        String ocr = getText("testRotated+10.png", metadata, parseContext);
+        String ocr = getText("testRotated+10.png", p, metadata, parseContext);
         assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
         assertEquals(10.0, Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)),
                 0.01);
@@ -151,7 +151,7 @@ public class TesseractOCRParserTest extends TikaTest {
         parseContext.set(TesseractOCRConfig.class, config);
         assumeTrue(canRun());
         Metadata metadata = getMetadata(MediaType.image("png"));
-        String ocr = getText("testRotated-10.png", metadata, parseContext);
+        String ocr = getText("testRotated-10.png", p, metadata, parseContext);
         assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
         assertEquals(-10.0, Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)),
                 0.01);

Reply via email to