Hi, please find attached some patches with unit tests for the fixes.
Best Peter -- Peter Marschall pe...@adpm.de
>From 4768853e9f53c71f03fb5be3875dddec5d9ba019 Mon Sep 17 00:00:00 2001 From: Peter Marschall <pe...@adpm.de> Date: Sun, 18 Apr 2021 12:49:54 +0200 Subject: [PATCH 4/5] t/158_save_hocr_structure.t: new Add a test to test more complex aspects of the hOCR generation: - support for 'ocr_header', 'ocr_caption', 'ocr_footer' elements (and their 'ocrx_...' counterparts) - preservation of 'textangle' and 'baseline' properties - correct indentation of closing non-leaf elements when followed by siblings Signed-off-by: Peter Marschall <pe...@adpm.de> --- t/158_save_hocr_structure.t | 153 ++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 t/158_save_hocr_structure.t diff --git a/t/158_save_hocr_structure.t b/t/158_save_hocr_structure.t new file mode 100644 index 00000000..d6b7b660 --- /dev/null +++ b/t/158_save_hocr_structure.t @@ -0,0 +1,153 @@ +use warnings; +use strict; +use IPC::System::Simple qw(system capture); +use Test::More tests => 1; + +BEGIN { + use Gscan2pdf::Document; + use Gtk3 -init; # Could just call init separately +} + +######################### + +Gscan2pdf::Translation::set_domain('gscan2pdf'); +use Log::Log4perl qw(:easy); +Log::Log4perl->easy_init($WARN); +my $logger = Log::Log4perl::get_logger; +Gscan2pdf::Document->setup($logger); + +# Create test image +system(qw(convert rose: test.pnm)); + +my $slist = Gscan2pdf::Document->new; + +# dir for temporary files +my $dir = File::Temp->newdir; +$slist->set_dir($dir); + +my $hocr = <<'EOS'; +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> +<title></title> +<meta http-equiv="Content-Type" content="text/html;charset=utf-8" > +<meta name='ocr-system' content='tesseract'> +</head> +<body> +<div class='ocr_page' id='page_1' title='image "test.tif"; bbox 0 0 708 1054'> +<div class='ocr_carea' id='block_1_1' title="bbox 87 578 328 685"> +<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 87 578 328 
609"> +<span class='ocr_line' id='line_1_1' title="bbox 87 578 328 609; baseline 0.01 -7; textangle 0; x_size 31; x_descenders 7; x_ascenders 6"> +<span class='ocrx_word' id='word_1_1' title='bbox 87 578 143 602; x_wconf 96'>The</span> +<span class='ocrx_word' id='word_1_2' title='bbox 154 578 231 609; x_wconf 96'>quick</span> +<span class='ocrx_word' id='word_1_3' title='bbox 241 578 328 602; x_wconf 96'>brown</span> +</span> +</p> +</div> +<div class='ocr_carea' id='block_1_2' title="bbox 639 814 708 1054"> +<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 639 814 708 1054"> +<span class='ocr_line' id='line_1_2' title="bbox 639 814 670 1053; textangle 90; x_size 31; x_descenders 7; x_ascenders 6"> +<span class='ocrx_word' id='word_1_4' title='bbox 639 998 663 1053; x_wconf 96'>The</span> +<span class='ocrx_word' id='word_1_5' title='bbox 639 911 670 987; x_wconf 96'>quick</span> +<span class='ocrx_word' id='word_1_6' title='bbox 639 814 664 900; x_wconf 96'>brown</span> +</span> +</p> +</div> +<div class='ocr_carea' id='block_1_3' title='bbox 87 578 328 685'> +<p class='ocr_par' id='par_1_3' title='bbox 87 578 328 685'> +<span class='ocr_header' id='header_1_1' title='bbox 88 578 328 609; baseline 0 -7'> +<span class='ocr_word' id='word_1_7' title='bbox 88 578 143 602; x_wconf 96'>The</span> +<span class='ocr_word' id='word_1_8' title='bbox 154 578 230 609; x_wconf 96'>quick</span> +<span class='ocr_word' id='word_1_9' title='bbox 241 578 328 602; x_wconf 96'>brown</span> +</span> +<span class='ocr_caption' id='caption_1_1' title='bbox 87 616 302 647; baseline 0 -7i; textangle 0;'> +<span class='ocr_word' id='word_1_10' title='bbox 87 616 130 640; x_wconf 96'>fox</span> +<span class='ocr_word' id='word_1_11' title='bbox 139 616 228 647; x_wconf 96'>jumps</span> +<span class='ocr_word' id='word_1_12' title='bbox 239 622 302 640; x_wconf 96'>over</span> +</span> +<span class='ocr_footer' id='footer_1_1' title='bbox 87 654 272 685; baseline -0.005 -7'> +<span 
class='ocr_word' id='word_1_13' title='bbox 87 655 132 678; x_wconf 96'>the</span> +<span class='ocr_word' id='word_1_14' title='bbox 144 654 201 685; x_wconf 96'>lazy</span> +<span class='ocr_word' id='word_1_15' title='bbox 211 654 272 684; x_wconf 96'>dog.</span> +</span> +</p> +</div> +</div> +</body> +</html> +EOS + +$slist->import_files( + paths => [ 'test.pnm' ], + finished_callback => sub { + $slist->{data}[0][2]->import_hocr($hocr); + $slist->save_hocr( + path => 'test.txt', + list_of_pages => + [ $slist->{data}[0][2]{uuid} ], + finished_callback => sub { Gtk3->main_quit } + ); + } +); +Gtk3->main; + +my $expected = <<"EOS"; +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> + <head> + <meta http-equiv="Content-Type" content="text/html;charset=utf-8" /> + <meta name='ocr-system' content='gscan2pdf $Gscan2pdf::Document::VERSION' /> + <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocr_word'/> + </head> + <body> + <div class='ocr_page' id='page_1' title='bbox 0 0 708 1054'> + <div class='ocr_carea' id='block_1_1' title='bbox 87 578 328 685'> + <p class='ocr_par' id='par_1_1' title='bbox 87 578 328 609'> + <span class='ocr_line' id='line_1_1' title='bbox 87 578 328 609; baseline 0.01 -7; textangle 0'> + <span class='ocr_word' id='word_1_1' title='bbox 87 578 143 602; x_wconf 96'>The</span> + <span class='ocr_word' id='word_1_2' title='bbox 154 578 231 609; x_wconf 96'>quick</span> + <span class='ocr_word' id='word_1_3' title='bbox 241 578 328 602; x_wconf 96'>brown</span> + </span> + </p> + </div> + <div class='ocr_carea' id='block_1_2' title='bbox 639 814 708 1054'> + <p class='ocr_par' id='par_1_2' title='bbox 639 814 708 1054'> + <span class='ocr_line' id='line_1_2' title='bbox 639 814 670 1053; textangle 90'> + <span class='ocr_word' id='word_1_4' 
title='bbox 639 998 663 1053; x_wconf 96'>The</span> + <span class='ocr_word' id='word_1_5' title='bbox 639 911 670 987; x_wconf 96'>quick</span> + <span class='ocr_word' id='word_1_6' title='bbox 639 814 664 900; x_wconf 96'>brown</span> + </span> + </p> + </div> + <div class='ocr_carea' id='block_1_3' title='bbox 87 578 328 685'> + <p class='ocr_par' id='par_1_3' title='bbox 87 578 328 685'> + <span class='ocr_header' id='header_1_1' title='bbox 88 578 328 609; baseline 0 -7'> + <span class='ocr_word' id='word_1_7' title='bbox 88 578 143 602; x_wconf 96'>The</span> + <span class='ocr_word' id='word_1_8' title='bbox 154 578 230 609; x_wconf 96'>quick</span> + <span class='ocr_word' id='word_1_9' title='bbox 241 578 328 602; x_wconf 96'>brown</span> + </span> + <span class='ocr_caption' id='caption_1_1' title='bbox 87 616 302 647; baseline 0 -7; textangle 0'> + <span class='ocr_word' id='word_1_10' title='bbox 87 616 130 640; x_wconf 96'>fox</span> + <span class='ocr_word' id='word_1_11' title='bbox 139 616 228 647; x_wconf 96'>jumps</span> + <span class='ocr_word' id='word_1_12' title='bbox 239 622 302 640; x_wconf 96'>over</span> + </span> + <span class='ocr_footer' id='footer_1_1' title='bbox 87 654 272 685; baseline -0.005 -7'> + <span class='ocr_word' id='word_1_13' title='bbox 87 655 132 678; x_wconf 96'>the</span> + <span class='ocr_word' id='word_1_14' title='bbox 144 654 201 685; x_wconf 96'>lazy</span> + <span class='ocr_word' id='word_1_15' title='bbox 211 654 272 684; x_wconf 96'>dog.</span> + </span> + </p> + </div> + </div> + </body> +</html> +EOS + +is capture(qw(cat test.txt)), $expected, 'saved multipage hOCR'; + +######################### + +unlink 'test.pnm', 'test.txt'; +Gscan2pdf::Document->quit(); -- 2.30.2
From 90f10b940b87699d00560d8b597e19ed5f77b49f Mon Sep 17 00:00:00 2001 From: Peter Marschall <pe...@adpm.de> Date: Sun, 18 Apr 2021 15:30:04 +0200 Subject: [PATCH 5/5] t/156_save_hocr_with_encoding.t: remove 'baseline' property Remove 'baseline' property from input to avoid issues with patches preserving properties. This way the test succeeds independently of those changes. Signed-off-by: Peter Marschall <pe...@adpm.de> --- t/156_save_hocr_with_encoding.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/156_save_hocr_with_encoding.t b/t/156_save_hocr_with_encoding.t index 41b4f7e5..1bdcb245 100644 --- a/t/156_save_hocr_with_encoding.t +++ b/t/156_save_hocr_with_encoding.t @@ -41,7 +41,7 @@ my $hocr = <<'EOS'; <div class='ocr_page' id='page_1' title='image "incas1_modif.jpg"; bbox 0 0 2452 3484; ppageno 0'> <div class='ocr_carea' id='block_1_9' title="bbox 1249 2403 2165 3246"> <p class='ocr_par' dir='ltr' id='par_1_12' title="bbox 1250 2403 2165 3245"> - <span class='ocr_line' id='line_1_70' title="bbox 1251 3205 2162 3245; baseline 0.001 -9"><span class='ocrx_word' id='word_1_518' title='bbox 1251 3205 1344 3236; x_wconf 92' lang='fra' dir='ltr'>donc</span> <span class='ocrx_word' id='word_1_519' title='bbox 1359 3213 1401 3237; x_wconf 91' lang='fra' dir='ltr'>un</span> <span class='ocrx_word' id='word_1_520' title='bbox 1416 3206 1532 3245; x_wconf 86' lang='fra' dir='ltr'>village</span> <span class='ocrx_word' id='word_1_521' title='bbox 1546 3205 1567 3236; x_wconf 88' lang='fra' dir='ltr'>à</span> <span class='ocrx_word' id='word_1_522' title='bbox 1581 3205 1700 3237; x_wconf 93' lang='fra' dir='ltr'>Cuzco</span> <span class='ocrx_word' id='word_1_523' title='bbox 1714 3205 1740 3245; x_wconf 83' lang='fra'>(<<</span> <span class='ocrx_word' id='word_1_524' title='bbox 1756 3208 1871 3237; x_wconf 92' lang='fra' dir='ltr'>centre</span> <span class='ocrx_word' id='word_1_525' title='bbox 1885 3207 1930 3237; x_wconf 93' lang='fra' 
dir='ltr'>du</span> <span class='ocrx_word' id='word_1_526' title='bbox 1946 3207 2075 3237; x_wconf 91' lang='fra' dir='ltr'>monde</span> <span class='ocrx_word' id='word_1_527' title='bbox 2090 3219 2105 3232; x_wconf 88' lang='fra'><strong><em>>></em></strong></span> <span class='ocrx_word' id='word_1_528' title='bbox 2120 3215 2162 3237; x_wconf 93' lang='fra' dir='ltr'>en</span> + <span class='ocr_line' id='line_1_70' title="bbox 1251 3205 2162 3245"><span class='ocrx_word' id='word_1_518' title='bbox 1251 3205 1344 3236; x_wconf 92' lang='fra' dir='ltr'>donc</span> <span class='ocrx_word' id='word_1_519' title='bbox 1359 3213 1401 3237; x_wconf 91' lang='fra' dir='ltr'>un</span> <span class='ocrx_word' id='word_1_520' title='bbox 1416 3206 1532 3245; x_wconf 86' lang='fra' dir='ltr'>village</span> <span class='ocrx_word' id='word_1_521' title='bbox 1546 3205 1567 3236; x_wconf 88' lang='fra' dir='ltr'>à</span> <span class='ocrx_word' id='word_1_522' title='bbox 1581 3205 1700 3237; x_wconf 93' lang='fra' dir='ltr'>Cuzco</span> <span class='ocrx_word' id='word_1_523' title='bbox 1714 3205 1740 3245; x_wconf 83' lang='fra'>(<<</span> <span class='ocrx_word' id='word_1_524' title='bbox 1756 3208 1871 3237; x_wconf 92' lang='fra' dir='ltr'>centre</span> <span class='ocrx_word' id='word_1_525' title='bbox 1885 3207 1930 3237; x_wconf 93' lang='fra' dir='ltr'>du</span> <span class='ocrx_word' id='word_1_526' title='bbox 1946 3207 2075 3237; x_wconf 91' lang='fra' dir='ltr'>monde</span> <span class='ocrx_word' id='word_1_527' title='bbox 2090 3219 2105 3232; x_wconf 88' lang='fra'><strong><em>>></em></strong></span> <span class='ocrx_word' id='word_1_528' title='bbox 2120 3215 2162 3237; x_wconf 93' lang='fra' dir='ltr'>en</span> </span> </p> </div> -- 2.30.2