This patch adds the -reflow option to pdftohtml, which produces nicer
HTML output, and replaces the broken patch that I submitted as
http://lists.freedesktop.org/archives/poppler/2008-September/004115.html
The patch has no significant effect on -complex and -xml mode, and -nomerge
still works. The patch also fixes this bugzilla bug for poppler:
https://bugs.freedesktop.org/show_bug.cgi?id=12522

Thanks,
        Warren
--- HtmlOutputDev.cc    Wed Oct  1 15:38:03 2008
+++ HtmlOutputDev.cc    Wed Oct  1 15:38:14 2008
@@ -53,6 +53,7 @@
 extern GBool xml;
 extern GBool showHidden;
 extern GBool noMerge;
+extern GBool reFlow;
 
 static GooString* basename(GooString* str){
   
@@ -381,10 +382,13 @@
 void HtmlPage::coalesce() {
   HtmlString *str1, *str2;
   HtmlFont *hfont1, *hfont2;
-  double space, horSpace, vertSpace, vertOverlap;
-  GBool addSpace, addLineBreak;
+  double space, horSpace, vertSpace;
+  GBool addSpace;
+  GBool nextLine;                      // is str2 on the next line below?
+  GBool addNewline;                    // should we output a newline?
   int n, i;
   double curX, curY;
+  double lineStartX=0.0;               // x-value of last line start
 
 #if 0 //~ for debugging
   for (str1 = yxStrings; str1; str1 = str1->yxNext) {
@@ -452,51 +456,65 @@
     str1->htext->insert(0, ls);
     delete ls;
   }
-  curX = str1->xMin; curY = str1->yMin;
+  lineStartX= curX = str1->xMin; curY = str1->yMin;
 
   while (str1 && (str2 = str1->yxNext)) {
     hfont2 = getFont(str2);
     space = str1->yMax - str1->yMin;
     horSpace = str2->xMin - str1->xMax;
-    addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
+    // Determine if str2 is on the line below the current line
+    addNewline=nextLine = !noMerge && (fabs(str2->yMin - str1->yMin) > 5.0);
     vertSpace = str2->yMin - str1->yMax;
 
+    // Heuristic: if the last character in str1 is a hyphen,
+    // turn off addNewline. This will "glue" hyphenated words
+    // that have been split over multiple lines.
+    if (reFlow && str1->text[str1->len -1] == '-') {
+       addNewline=0;
+       // Also remove the hyphen
+       str1->len--;
+       str1->htext->del(str1->htext->getLength() - 1, 1);
+    }
+
 //printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
 
-    if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax)
-    {
-       vertOverlap = str1->yMax - str2->yMin;
-    } else
-    if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax)
-    {
-       vertOverlap = str2->yMax - str1->yMin;
-    } else
+    // Is str2 a new paragraph?
+    if (nextLine && (
+            // Is this an indented new line?
+            (str2->xMin > lineStartX + 3.0)
+            // Or is there a blank line between this and the last line?
+         || (vertSpace > 0.5 * space)
+           // Or it is XML output, so we always separate each line
+        || xml ))
     {
-       vertOverlap = 0;
-    } 
-    
-    if (
-       (
-        (
-         (
-          (rawOrder && vertOverlap > 0.5 * space) 
-          ||
-          (!rawOrder && str2->yMin < str1->yMax)
-         ) &&
-         (horSpace > -0.5 * space && horSpace < space)
-        ) ||
-                (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak)
-       ) &&
-       (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex 
mode fonts must be the same, in other modes fonts do not metter
-       str1->dir == str2->dir // text direction the same
-       ) 
-    {
-//      printf("yes\n");
+      // A new paragraph, so keep strings separate
+//    printf("new paragraph\n"); 
+      GBool finish_a = str1->getLink() != NULL;
+      GBool finish_bold   = hfont1->isBold();
+      GBool finish_italic = hfont1->isItalic();
+      CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
+     
+      str1->xMin = curX; str1->yMin = curY; 
+      str1 = str2;
+      curX = str1->xMin; curY = str1->yMin;
+      lineStartX= str1->xMin;
+      hfont1 = hfont2;
+      if( hfont1->isBold() )
+       str1->htext->insert(0,"<b>",3);
+      if( hfont1->isItalic() )
+       str1->htext->insert(0,"<i>",3);
+      if( str1->getLink() != NULL ) {
+       GooString *ls = str1->getLink()->getLinkStart();
+       str1->htext->insert(0, ls);
+       delete ls;
+      }
+    } else {
+//    printf("same paragraph\n");
       n = str1->len + str2->len;
       if ((addSpace = horSpace > 0.1 * space)) {
         ++n;
       }
-      if (addLineBreak) {
+      if (nextLine) {
         ++n;
       }
   
@@ -507,18 +525,21 @@
                                        str1->size * sizeof(double));
       if (addSpace) {
                  str1->text[str1->len] = 0x20;
-                 str1->htext->append(xml?" ":"&nbsp;");
+                 str1->htext->append((xml || reFlow) ? " " : "&nbsp;");
                  str1->xRight[str1->len] = str2->xMin;
                  ++str1->len;
       }
-      if (addLineBreak) {
-         str1->text[str1->len] = '\n';
-         str1->htext->append("<br>");
-         str1->xRight[str1->len] = str2->xMin;
-         ++str1->len;
+      if (nextLine) {
+          if (addNewline) {
+           str1->text[str1->len] = '\n';
+           str1->htext->append(reFlow ? "\n" : "<br>");
+           str1->xRight[str1->len] = str2->xMin;
+           ++str1->len;
+          }
          str1->yMin = str2->yMin;
          str1->yMax = str2->yMax;
          str1->xMax = str2->xMax;
+         lineStartX= str2->xMin;
          int fontLineSize = hfont1->getLineSize();
          int curLineSize = (int)(vertSpace + space); 
          if( curLineSize != fontLineSize )
@@ -570,26 +591,6 @@
       }
       str1->yxNext = str2->yxNext;
       delete str2;
-    } else { // keep strings separate
-//      printf("no\n"); 
-      GBool finish_a = str1->getLink() != NULL;
-      GBool finish_bold   = hfont1->isBold();
-      GBool finish_italic = hfont1->isItalic();
-      CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
-     
-      str1->xMin = curX; str1->yMin = curY; 
-      str1 = str2;
-      curX = str1->xMin; curY = str1->yMin;
-      hfont1 = hfont2;
-      if( hfont1->isBold() )
-       str1->htext->insert(0,"<b>",3);
-      if( hfont1->isItalic() )
-       str1->htext->insert(0,"<i>",3);
-      if( str1->getLink() != NULL ) {
-       GooString *ls = str1->getLink()->getLinkStart();
-       str1->htext->insert(0, ls);
-       delete ls;
-      }
     }
   }
   str1->xMin = curX; str1->yMin = curY;
@@ -692,7 +693,7 @@
   
   if( !noframes )
   {  
-      fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" 
link=\"blue\">\n",pageFile); 
+      fputs("</HEAD>\n<BODY vlink=\"blue\" link=\"blue\">\n",pageFile); 
   }
   
   if( !ignore ) 
@@ -760,10 +761,10 @@
                str=new GooString(tmp->htext); 
                fputs(str->getCString(),f);
                delete str;      
-               fputs("<br>\n",f);  
+               fputs(reFlow ? "<p>\n" : "<br>\n",f);  
       }
     }
-       fputs("<hr>\n",f);  
+    if (!reFlow) fputs("<hr>\n",f);  
   }
 }
 
@@ -997,7 +998,7 @@
       
       dumpMetaVars(page);
       fprintf(page,"</HEAD>\n");
-      fprintf(page,"<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" 
link=\"blue\">\n");
+      fprintf(page,"<BODY vlink=\"blue\" link=\"blue\">\n");
     }
   }
   ok = gTrue; 
@@ -1444,11 +1445,11 @@
              GooString *str=GooString::fromInt(page);
              /*                complex         simple
                frames          file-4.html     files.html#4
-               noframes        file.html#4     file.html#4
+               noframes        #4              #4
               */
              if (noframes)
              {
-                 file->append(".html#");
+                 file= new GooString("#");
                  file->append(str);
              }
              else
@@ -1566,7 +1567,7 @@
                if (noframes)
                {
                        output = page; 
-                       fputs("<hr>\n", output);
+                       if (!reFlow) fputs("<hr>\n", output);
                }
                else
                {
@@ -1583,7 +1584,7 @@
  
        GBool done = newOutlineLevel(output, outlines, catalog);
        if (done && !complexMode)
-       fputs("<hr>\n", output);
+       if (!reFlow) fputs("<hr>\n", output);
        
        if (bClose)
        {
--- pdftohtml.cc        2008/09/30 00:18:37     1.1
+++ pdftohtml.cc        2008/10/01 05:47:12
@@ -54,6 +54,7 @@
 GBool noDrm=gFalse;
 
 GBool showHidden = gFalse;
+GBool reFlow = gFalse;         // Output "reflow" paragraphs
 GBool noMerge = gFalse;
 static char ownerPassword[33] = "";
 static char userPassword[33] = "";
@@ -92,12 +93,14 @@
    "zoom the pdf document (default 1.5)"},
   {"-xml",    argFlag,    &xml,         0,
    "output for XML post-processing"},
+  {"-reflow", argFlag,   &reFlow,   0,
+   "output reflow paragraphs"},
   {"-hidden", argFlag,   &showHidden,   0,
    "output hidden text"},
   {"-nomerge", argFlag, &noMerge, 0,
    "do not merge paragraphs"},   
   {"-enc",    argString,   textEncName,    sizeof(textEncName),
-   "output text encoding name"},
+   "output text encoding name (UTF-8, Latin1 etc)"},
   {"-dev",    argString,   gsDevice,       sizeof(gsDevice),
    "output device name for Ghostscript (png16m, jpeg etc)"},
   {"-v",      argFlag,     &printVersion,  0,
@@ -234,7 +237,7 @@
    { 
        complexMode = gTrue;
        noframes = gTrue;
-       noMerge = gTrue;
+       noMerge = gFalse;
    }
 
   // get page range
--- pdftohtml.1 2008/09/30 00:18:37     1.1
+++ pdftohtml.1 2008/10/01 05:42:30
@@ -52,11 +52,16 @@
 .B \-zoom <fp>
 zoom the pdf document (default 1.5)
 .TP
+.B \-reflow
+join paragraph lines together and separate paragraphs with a <p> tag. With
+this flag off, both the lines within a paragraph and the paragraphs
+themselves are separated by <br> tags.
+.TP
 .B \-xml
 output for XML post-processing
 .TP
 .B \-enc <string>
-output text encoding name
+output text encoding name (UTF-8, Latin1 etc)
 .TP
 .B \-opw <string>
 owner password (for encrypted files)
@@ -71,7 +76,7 @@
 output device name for Ghostscript (png16m, jpeg etc)
 .TP
 .B \-nomerge
-do not merge paragraphs
+separate output HTML lines with newline characters
 .TP
 .B \-nodrm
 override document DRM settings
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to