Hi,

Please find attached some patches with unit tests for the fixes.

Best
Peter
-- 
Peter Marschall
pe...@adpm.de
>From 4768853e9f53c71f03fb5be3875dddec5d9ba019 Mon Sep 17 00:00:00 2001
From: Peter Marschall <pe...@adpm.de>
Date: Sun, 18 Apr 2021 12:49:54 +0200
Subject: [PATCH 4/5] t/158_save_hocr_structure.t: new

Add a test to test more complex aspects of the hOCR generation:
- support for 'ocr_header', 'ocr_caption', 'ocr_footer' elements
  (and their 'ocrx_...' counterparts)
- preservation of 'textangle' and 'baseline' properties
- correct indentation of closing non-leaf elements when followed by siblings

Signed-off-by: Peter Marschall <pe...@adpm.de>
---
 t/158_save_hocr_structure.t | 153 ++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 t/158_save_hocr_structure.t

diff --git a/t/158_save_hocr_structure.t b/t/158_save_hocr_structure.t
new file mode 100644
index 00000000..d6b7b660
--- /dev/null
+++ b/t/158_save_hocr_structure.t
@@ -0,0 +1,153 @@
+use warnings;
+use strict;
+use IPC::System::Simple qw(system capture);
+use Test::More tests => 1;
+
+BEGIN {
+    use Gscan2pdf::Document;
+    use Gtk3 -init;    # Could just call init separately
+}
+
+#########################
+
+Gscan2pdf::Translation::set_domain('gscan2pdf');
+use Log::Log4perl qw(:easy);
+Log::Log4perl->easy_init($WARN);
+my $logger = Log::Log4perl::get_logger;
+Gscan2pdf::Document->setup($logger);
+
+# Create test image
+system(qw(convert rose: test.pnm));
+
+my $slist = Gscan2pdf::Document->new;
+
+# dir for temporary files
+my $dir = File::Temp->newdir;
+$slist->set_dir($dir);
+
+my $hocr = <<'EOS';
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<title></title>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" >
+<meta name='ocr-system' content='tesseract'>
+</head>
+<body>
+<div class='ocr_page' id='page_1' title='image "test.tif"; bbox 0 0 708 1054'>
+<div class='ocr_carea' id='block_1_1' title="bbox 87 578 328 685">
+<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 87 578 328 609">
+<span class='ocr_line' id='line_1_1' title="bbox 87 578 328 609; baseline 0.01 -7; textangle 0; x_size 31; x_descenders 7; x_ascenders 6">
+<span class='ocrx_word' id='word_1_1' title='bbox 87 578 143 602; x_wconf 96'>The</span>
+<span class='ocrx_word' id='word_1_2' title='bbox 154 578 231 609; x_wconf 96'>quick</span>
+<span class='ocrx_word' id='word_1_3' title='bbox 241 578 328 602; x_wconf 96'>brown</span>
+</span>
+</p>
+</div>
+<div class='ocr_carea' id='block_1_2' title="bbox 639 814 708 1054">
+<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 639 814 708 1054">
+<span class='ocr_line' id='line_1_2' title="bbox 639 814 670 1053; textangle 90; x_size 31; x_descenders 7; x_ascenders 6">
+<span class='ocrx_word' id='word_1_4' title='bbox 639 998 663 1053; x_wconf 96'>The</span>
+<span class='ocrx_word' id='word_1_5' title='bbox 639 911 670 987; x_wconf 96'>quick</span>
+<span class='ocrx_word' id='word_1_6' title='bbox 639 814 664 900; x_wconf 96'>brown</span>
+</span>
+</p>
+</div>
+<div class='ocr_carea' id='block_1_3' title='bbox 87 578 328 685'>
+<p class='ocr_par' id='par_1_3' title='bbox 87 578 328 685'>
+<span class='ocr_header' id='header_1_1' title='bbox 88 578 328 609; baseline 0 -7'>
+<span class='ocr_word' id='word_1_7' title='bbox 88 578 143 602; x_wconf 96'>The</span>
+<span class='ocr_word' id='word_1_8' title='bbox 154 578 230 609; x_wconf 96'>quick</span>
+<span class='ocr_word' id='word_1_9' title='bbox 241 578 328 602; x_wconf 96'>brown</span>
+</span>
+<span class='ocr_caption' id='caption_1_1' title='bbox 87 616 302 647; baseline 0 -7i; textangle 0;'>
+<span class='ocr_word' id='word_1_10' title='bbox 87 616 130 640; x_wconf 96'>fox</span>
+<span class='ocr_word' id='word_1_11' title='bbox 139 616 228 647; x_wconf 96'>jumps</span>
+<span class='ocr_word' id='word_1_12' title='bbox 239 622 302 640; x_wconf 96'>over</span>
+</span>
+<span class='ocr_footer' id='footer_1_1' title='bbox 87 654 272 685; baseline -0.005 -7'>
+<span class='ocr_word' id='word_1_13' title='bbox 87 655 132 678; x_wconf 96'>the</span>
+<span class='ocr_word' id='word_1_14' title='bbox 144 654 201 685; x_wconf 96'>lazy</span>
+<span class='ocr_word' id='word_1_15' title='bbox 211 654 272 684; x_wconf 96'>dog.</span>
+</span>
+</p>
+</div>
+</div>
+</body>
+</html>
+EOS
+
+$slist->import_files(
+    paths             => [ 'test.pnm' ],
+    finished_callback => sub {
+        $slist->{data}[0][2]->import_hocr($hocr);
+        $slist->save_hocr(
+            path => 'test.txt',
+            list_of_pages =>
+              [ $slist->{data}[0][2]{uuid} ],
+            finished_callback => sub { Gtk3->main_quit }
+        );
+    }
+);
+Gtk3->main;
+
+my $expected = <<"EOS";
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+  <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+  <meta name='ocr-system' content='gscan2pdf $Gscan2pdf::Document::VERSION' />
+  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocr_word'/>
+ </head>
+ <body>
+  <div class='ocr_page' id='page_1' title='bbox 0 0 708 1054'>
+   <div class='ocr_carea' id='block_1_1' title='bbox 87 578 328 685'>
+    <p class='ocr_par' id='par_1_1' title='bbox 87 578 328 609'>
+     <span class='ocr_line' id='line_1_1' title='bbox 87 578 328 609; baseline 0.01 -7; textangle 0'>
+      <span class='ocr_word' id='word_1_1' title='bbox 87 578 143 602; x_wconf 96'>The</span>
+      <span class='ocr_word' id='word_1_2' title='bbox 154 578 231 609; x_wconf 96'>quick</span>
+      <span class='ocr_word' id='word_1_3' title='bbox 241 578 328 602; x_wconf 96'>brown</span>
+     </span>
+    </p>
+   </div>
+   <div class='ocr_carea' id='block_1_2' title='bbox 639 814 708 1054'>
+    <p class='ocr_par' id='par_1_2' title='bbox 639 814 708 1054'>
+     <span class='ocr_line' id='line_1_2' title='bbox 639 814 670 1053; textangle 90'>
+      <span class='ocr_word' id='word_1_4' title='bbox 639 998 663 1053; x_wconf 96'>The</span>
+      <span class='ocr_word' id='word_1_5' title='bbox 639 911 670 987; x_wconf 96'>quick</span>
+      <span class='ocr_word' id='word_1_6' title='bbox 639 814 664 900; x_wconf 96'>brown</span>
+     </span>
+    </p>
+   </div>
+   <div class='ocr_carea' id='block_1_3' title='bbox 87 578 328 685'>
+    <p class='ocr_par' id='par_1_3' title='bbox 87 578 328 685'>
+     <span class='ocr_header' id='header_1_1' title='bbox 88 578 328 609; baseline 0 -7'>
+      <span class='ocr_word' id='word_1_7' title='bbox 88 578 143 602; x_wconf 96'>The</span>
+      <span class='ocr_word' id='word_1_8' title='bbox 154 578 230 609; x_wconf 96'>quick</span>
+      <span class='ocr_word' id='word_1_9' title='bbox 241 578 328 602; x_wconf 96'>brown</span>
+     </span>
+     <span class='ocr_caption' id='caption_1_1' title='bbox 87 616 302 647; baseline 0 -7; textangle 0'>
+      <span class='ocr_word' id='word_1_10' title='bbox 87 616 130 640; x_wconf 96'>fox</span>
+      <span class='ocr_word' id='word_1_11' title='bbox 139 616 228 647; x_wconf 96'>jumps</span>
+      <span class='ocr_word' id='word_1_12' title='bbox 239 622 302 640; x_wconf 96'>over</span>
+     </span>
+     <span class='ocr_footer' id='footer_1_1' title='bbox 87 654 272 685; baseline -0.005 -7'>
+      <span class='ocr_word' id='word_1_13' title='bbox 87 655 132 678; x_wconf 96'>the</span>
+      <span class='ocr_word' id='word_1_14' title='bbox 144 654 201 685; x_wconf 96'>lazy</span>
+      <span class='ocr_word' id='word_1_15' title='bbox 211 654 272 684; x_wconf 96'>dog.</span>
+     </span>
+    </p>
+   </div>
+  </div>
+ </body>
+</html>
+EOS
+
+is capture(qw(cat test.txt)), $expected, 'saved multipage hOCR';
+
+#########################
+
+unlink 'test.pnm', 'test.txt';
+Gscan2pdf::Document->quit();
-- 
2.30.2

From 90f10b940b87699d00560d8b597e19ed5f77b49f Mon Sep 17 00:00:00 2001
From: Peter Marschall <pe...@adpm.de>
Date: Sun, 18 Apr 2021 15:30:04 +0200
Subject: [PATCH 5/5] t/156_save_hocr_with_encoding.t: remove 'baseline'
 property

Remove the 'baseline' property from the input to avoid issues with patches
preserving properties.
This way the test succeeds independently of those changes.

Signed-off-by: Peter Marschall <pe...@adpm.de>
---
 t/156_save_hocr_with_encoding.t | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/156_save_hocr_with_encoding.t b/t/156_save_hocr_with_encoding.t
index 41b4f7e5..1bdcb245 100644
--- a/t/156_save_hocr_with_encoding.t
+++ b/t/156_save_hocr_with_encoding.t
@@ -41,7 +41,7 @@ my $hocr = <<'EOS';
   <div class='ocr_page' id='page_1' title='image "incas1_modif.jpg"; bbox 0 0 2452 3484; ppageno 0'>
    <div class='ocr_carea' id='block_1_9' title="bbox 1249 2403 2165 3246">
     <p class='ocr_par' dir='ltr' id='par_1_12' title="bbox 1250 2403 2165 3245">
-     <span class='ocr_line' id='line_1_70' title="bbox 1251 3205 2162 3245; baseline 0.001 -9"><span class='ocrx_word' id='word_1_518' title='bbox 1251 3205 1344 3236; x_wconf 92' lang='fra' dir='ltr'>donc</span> <span class='ocrx_word' id='word_1_519' title='bbox 1359 3213 1401 3237; x_wconf 91' lang='fra' dir='ltr'>un</span> <span class='ocrx_word' id='word_1_520' title='bbox 1416 3206 1532 3245; x_wconf 86' lang='fra' dir='ltr'>village</span> <span class='ocrx_word' id='word_1_521' title='bbox 1546 3205 1567 3236; x_wconf 88' lang='fra' dir='ltr'>à</span> <span class='ocrx_word' id='word_1_522' title='bbox 1581 3205 1700 3237; x_wconf 93' lang='fra' dir='ltr'>Cuzco</span> <span class='ocrx_word' id='word_1_523' title='bbox 1714 3205 1740 3245; x_wconf 83' lang='fra'>(&lt;&lt;</span> <span class='ocrx_word' id='word_1_524' title='bbox 1756 3208 1871 3237; x_wconf 92' lang='fra' dir='ltr'>centre</span> <span class='ocrx_word' id='word_1_525' title='bbox 1885 3207 1930 3237; x_wconf 93' lang='fra' dir='ltr'>du</span> <span class='ocrx_word' id='word_1_526' title='bbox 1946 3207 2075 3237; x_wconf 91' lang='fra' dir='ltr'>monde</span> <span class='ocrx_word' id='word_1_527' title='bbox 2090 3219 2105 3232; x_wconf 88' lang='fra'><strong><em>&gt;&gt;</em></strong></span> <span class='ocrx_word' id='word_1_528' title='bbox 2120 3215 2162 3237; x_wconf 93' lang='fra' dir='ltr'>en</span> 
+     <span class='ocr_line' id='line_1_70' title="bbox 1251 3205 2162 3245"><span class='ocrx_word' id='word_1_518' title='bbox 1251 3205 1344 3236; x_wconf 92' lang='fra' dir='ltr'>donc</span> <span class='ocrx_word' id='word_1_519' title='bbox 1359 3213 1401 3237; x_wconf 91' lang='fra' dir='ltr'>un</span> <span class='ocrx_word' id='word_1_520' title='bbox 1416 3206 1532 3245; x_wconf 86' lang='fra' dir='ltr'>village</span> <span class='ocrx_word' id='word_1_521' title='bbox 1546 3205 1567 3236; x_wconf 88' lang='fra' dir='ltr'>à</span> <span class='ocrx_word' id='word_1_522' title='bbox 1581 3205 1700 3237; x_wconf 93' lang='fra' dir='ltr'>Cuzco</span> <span class='ocrx_word' id='word_1_523' title='bbox 1714 3205 1740 3245; x_wconf 83' lang='fra'>(&lt;&lt;</span> <span class='ocrx_word' id='word_1_524' title='bbox 1756 3208 1871 3237; x_wconf 92' lang='fra' dir='ltr'>centre</span> <span class='ocrx_word' id='word_1_525' title='bbox 1885 3207 1930 3237; x_wconf 93' lang='fra' dir='ltr'>du</span> <span class='ocrx_word' id='word_1_526' title='bbox 1946 3207 2075 3237; x_wconf 91' lang='fra' dir='ltr'>monde</span> <span class='ocrx_word' id='word_1_527' title='bbox 2090 3219 2105 3232; x_wconf 88' lang='fra'><strong><em>&gt;&gt;</em></strong></span> <span class='ocrx_word' id='word_1_528' title='bbox 2120 3215 2162 3237; x_wconf 93' lang='fra' dir='ltr'>en</span> 
      </span>
     </p>
    </div>
-- 
2.30.2

Reply via email to