cpp/poppler-page.cpp       |   10 -
 glib/poppler-page.cc       |    5 
 poppler/ArthurOutputDev.cc |    1 
 poppler/CairoOutputDev.cc  |    2 
 poppler/Gfx.cc             |    6 
 poppler/PSOutputDev.cc     |    6 
 poppler/TextOutputDev.cc   |  360 ++++++++++++++++++++++++++++-----------------
 poppler/TextOutputDev.h    |   18 +-
 qt4/src/poppler-page.cc    |   12 -
 test/perf-test.cc          |    2 
 utils/pdftotext.1          |    4 
 utils/pdftotext.cc         |   10 +
 12 files changed, 270 insertions(+), 166 deletions(-)

New commits:
commit e17f09563276ee25b6acfc127b6ea360da650030
Author: Albert Astals Cid <[email protected]>
Date:   Mon Feb 6 00:25:53 2012 +0100

    [xpdf303] TextOutputDev and associated changes

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 4e2f730..d72a477 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -208,24 +208,24 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
     double rect_right = r.right();
     double rect_bottom = r.bottom();
 
-    TextOutputDev td(NULL, gTrue, gFalse, gFalse);
+    TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse);
     d->doc->doc->displayPage(&td, d->index + 1, 72, 72, rotation_value, false, 
true, false);
     TextPage *text_page = td.takeText();
 
     switch (direction) {
     case search_from_top:
         found = text_page->findText(&u[0], len,
-                    gTrue, gTrue, gFalse, gFalse, sCase, gFalse,
+                    gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse,
                     &rect_left, &rect_top, &rect_right, &rect_bottom);
         break;
     case search_next_result:
         found = text_page->findText(&u[0], len,
-                    gFalse, gTrue, gTrue, gFalse, sCase, gFalse,
+                    gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse,
                     &rect_left, &rect_top, &rect_right, &rect_bottom);
         break;
     case search_previous_result:
         found = text_page->findText(&u[0], len,
-                    gFalse, gTrue, gTrue, gFalse, sCase, gTrue,
+                    gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse,
                     &rect_left, &rect_top, &rect_right, &rect_bottom);
         break;
     }
@@ -267,7 +267,7 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
 {
     std::auto_ptr<GooString> s;
     const GBool use_raw_order = (layout_mode == raw_order_layout);
-    TextOutputDev td(0, gFalse, use_raw_order, gFalse);
+    TextOutputDev td(0, gFalse, 0, use_raw_order, gFalse);
     d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false);
     if (r.is_empty()) {
         const PDFRectangle *rect = d->page->getCropBox();
diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 7b98625..a95ff6a 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -263,7 +263,7 @@ poppler_page_get_text_page (PopplerPage *page)
     TextOutputDev *text_dev;
     Gfx           *gfx;
 
-    text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+    text_dev = new TextOutputDev (NULL, gTrue, 0, gFalse, gFalse);
     gfx = page->page->createGfx(text_dev,
                                72.0, 72.0, 0,
                                gFalse, /* useMediaBox */
@@ -888,6 +888,7 @@ poppler_page_find_text (PopplerPage *page,
                             gFalse, gTrue, // startAtTop, stopAtBottom
                             gFalse, gFalse, // startAtLast, stopAtLast
                             gFalse, gFalse, // caseSensitive, backwards
+                            gFalse, // wholeWord
                             &xMin, &yMin, &xMax, &yMax))
     {
       match = poppler_rectangle_new ();
@@ -1064,7 +1065,7 @@ poppler_page_render_to_ps (PopplerPage   *page,
                                     ps_file->first_page, ps_file->last_page,
                                     psModePS, (int)ps_file->paper_width,
                                     (int)ps_file->paper_height, 
ps_file->duplex,
-                                    0, 0, 0, 0, gFalse, gFalse);
+                                    0, 0, 0, 0, gFalse);
 
 
   ps_file->document->doc->displayPage (ps_file->out, page->index + 1, 72.0, 
72.0,
diff --git a/poppler/ArthurOutputDev.cc b/poppler/ArthurOutputDev.cc
index 92bc84f..301232b 100644
--- a/poppler/ArthurOutputDev.cc
+++ b/poppler/ArthurOutputDev.cc
@@ -292,6 +292,7 @@ void ArthurOutputDev::updateFont(GfxState *state)
   m_font = NULL;
   fileName = NULL;
   tmpBuf = NULL;
+  fontLoc = NULL;
 
   if (!(gfxFont = state->getFont())) {
     goto err1;
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index dc5698e..340b3a3 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -246,7 +246,7 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state) {
 void CairoOutputDev::endPage() {
   if (text) {
     text->endPage();
-    text->coalesce(gTrue, gFalse);
+    text->coalesce(gTrue, 0, gFalse);
   }
 }
 
diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index 4fa9431..46e9c4a 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -2866,7 +2866,7 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
   GfxColor colorA, colorB;
   double xa, ya, xb, yb, ra, rb;
   double ta, tb, sa, sb;
-  double sz, sMin, sMax, h;
+  double sMin, sMax, h;
   double sLeft, sRight, sTop, sBottom, sZero, sDiag;
   GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero;
   GBool haveSMin, haveSMax;
@@ -2888,18 +2888,14 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
   if (h == 0) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else if (r1 - r0 == 0) {
     enclosed = gFalse;
     theta = 0;
-    sz = 0; // make gcc happy
   } else if (fabs(r1 - r0) >= h) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else {
     enclosed = gFalse;
-    sz = -r0 / (r1 - r0);
     theta = asin((r1 - r0) / h);
   }
 
diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc
index 7dbac8a..d91cae9 100644
--- a/poppler/PSOutputDev.cc
+++ b/poppler/PSOutputDev.cc
@@ -4178,7 +4178,7 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading,
   double xMin, yMin, xMax, yMax;
   double x0, y0, r0, x1, y1, r1, t0, t1;
   double xa, ya, ra;
-  double sz, sMin, sMax, h, ta;
+  double sMin, sMax, h, ta;
   double sLeft, sRight, sTop, sBottom, sZero, sDiag;
   GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero;
   GBool haveSMin, haveSMax;
@@ -4206,18 +4206,14 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading,
   if (h == 0) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else if (r1 - r0 == 0) {
     enclosed = gFalse;
     theta = 0;
-    sz = 0; // make gcc happy
   } else if (fabs(r1 - r0) >= h) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else {
     enclosed = gFalse;
-    sz = -r0 / (r1 - r0);
     theta = asin((r1 - r0) / h);
   }
   if (enclosed) {
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 7a0b8d9..531617d 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -663,7 +663,7 @@ void TextPool::addWord(TextWord *word) {
 
   // insert the new word
   if (cursor && wordBaseIdx == cursorBaseIdx &&
-      word->primaryCmp(cursor) > 0) {
+      word->primaryCmp(cursor) >= 0) {
     w0 = cursor;
     w1 = cursor->next;
   } else {
@@ -1053,7 +1053,7 @@ void TextLineFrag::computeCoords(GBool oneRot) {
        xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
        yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
        yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
-       base = blk->yMin + base * (blk->yMax - blk->yMin);
+       base = blk->yMin + d4 * (blk->yMax - blk->yMin);
        break;
       case 1:
        xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
@@ -1277,15 +1277,15 @@ void TextBlock::addWord(TextWord *word) {
   }
 }
 
-void TextBlock::coalesce(UnicodeMap *uMap) {
+void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) {
   TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
   TextLine *line, *line0, *line1;
   int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
   int baseIdx, bestWordBaseIdx, idx0, idx1;
   double minBase, maxBase;
-  double fontSize, delta, priDelta, secDelta;
+  double fontSize, wordSpacing, delta, priDelta, secDelta;
   TextLine **lineArray;
-  GBool found;
+  GBool found, overlap;
   int col1, col2;
   int i, j, k;
 
@@ -1295,11 +1295,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
     while (word0) {
       priDelta = dupMaxPriDelta * word0->fontSize;
       secDelta = dupMaxSecDelta * word0->fontSize;
-      if (rot == 0 || rot == 3) {
-       maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
-      } else {
-       maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
-      }
+      maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
       found = gFalse;
       word1 = word2 = NULL; // make gcc happy
       for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
@@ -1396,6 +1392,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
     maxBase = word0->base + maxIntraLineDelta * fontSize;
     minBaseIdx = pool->getBaseIdx(minBase);
     maxBaseIdx = pool->getBaseIdx(maxBase);
+    wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
 
     // find the rest of the words in this line
     while (1) {
@@ -1404,25 +1401,32 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
       // this line
       bestWordBaseIdx = 0;
       bestWord0 = bestWord1 = NULL;
-      for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+      overlap = gFalse;
+      for (baseIdx = minBaseIdx;
+          !overlap && baseIdx <= maxBaseIdx;
+          ++baseIdx) {
        for (word0 = NULL, word1 = pool->getPool(baseIdx);
             word1;
             word0 = word1, word1 = word1->next) {
          if (word1->base >= minBase &&
-             word1->base <= maxBase &&
-             (delta = lastWord->primaryDelta(word1)) >=
-               minCharSpacing * fontSize) {
-           if (delta < maxWordSpacing * fontSize &&
-               (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
-             bestWordBaseIdx = baseIdx;
-             bestWord0 = word0;
-             bestWord1 = word1;
+             word1->base <= maxBase) {
+           delta = lastWord->primaryDelta(word1);
+           if (delta < minCharSpacing * fontSize) {
+             overlap = gTrue;
+             break;
+           } else {
+             if (delta < wordSpacing &&
+                 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
+               bestWordBaseIdx = baseIdx;
+               bestWord0 = word0;
+               bestWord1 = word1;
+             }
+             break;
            }
-           break;
          }
        }
       }
-      if (!bestWord1) {
+      if (overlap || !bestWord1) {
        break;
       }
 
@@ -1469,52 +1473,79 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
 
   // column assignment
   nColumns = 0;
-  for (i = 0; i < nLines; ++i) {
-    line0 = lineArray[i];
-    col1 = 0;
-    for (j = 0; j < i; ++j) {
-      line1 = lineArray[j];
-      if (line1->primaryDelta(line0) >= 0) {
-       col2 = line1->col[line1->len] + 1;
-      } else {
-       k = 0; // make gcc happy
-       switch (rot) {
-       case 0:
-         for (k = 0;
-              k < line1->len &&
-                line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-              ++k) ;
-         break;
-       case 1:
-         for (k = 0;
-              k < line1->len &&
-                line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-              ++k) ;
-         break;
-       case 2:
-         for (k = 0;
-              k < line1->len &&
-                line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-              ++k) ;
-         break;
-       case 3:
-         for (k = 0;
-              k < line1->len &&
-                line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-              ++k) ;
-         break;
-       }
-       col2 = line1->col[k];
+  if (fixedPitch) {
+    for (i = 0; i < nLines; ++i) {
+      line0 = lineArray[i];
+      col1 = 0; // make gcc happy
+      switch (rot) {
+      case 0:
+       col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
+       break;
+      case 1:
+       col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
+       break;
+      case 2:
+       col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
+       break;
+      case 3:
+       col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
+       break;
       }
-      if (col2 > col1) {
-       col1 = col2;
+      for (k = 0; k <= line0->len; ++k) {
+       line0->col[k] += col1;
+      }
+      if (line0->col[line0->len] > nColumns) {
+       nColumns = line0->col[line0->len];
       }
     }
-    for (k = 0; k <= line0->len; ++k) {
-      line0->col[k] += col1;
-    }
-    if (line0->col[line0->len] > nColumns) {
-      nColumns = line0->col[line0->len];
+  } else {
+    for (i = 0; i < nLines; ++i) {
+      line0 = lineArray[i];
+      col1 = 0;
+      for (j = 0; j < i; ++j) {
+       line1 = lineArray[j];
+       if (line1->primaryDelta(line0) >= 0) {
+         col2 = line1->col[line1->len] + 1;
+       } else {
+         k = 0; // make gcc happy
+         switch (rot) {
+         case 0:
+           for (k = 0;
+                k < line1->len &&
+                  line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+                ++k) ;
+           break;
+         case 1:
+           for (k = 0;
+                k < line1->len &&
+                  line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+                ++k) ;
+           break;
+         case 2:
+           for (k = 0;
+                k < line1->len &&
+                  line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+                ++k) ;
+           break;
+         case 3:
+           for (k = 0;
+                k < line1->len &&
+                  line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+                ++k) ;
+           break;
+         }
+         col2 = line1->col[k];
+       }
+       if (col2 > col1) {
+         col1 = col2;
+       }
+      }
+      for (k = 0; k <= line0->len; ++k) {
+       line0->col[k] += col1;
+      }
+      if (line0->col[line0->len] > nColumns) {
+       nColumns = line0->col[line0->len];
+      }
     }
   }
   gfree(lineArray);
@@ -2111,6 +2142,8 @@ void TextPage::clear() {
     gfree(blocks);
   }
   deleteGooList(fonts, TextFontInfo);
+  deleteGooList(underlines, TextUnderline);
+  deleteGooList(links, TextLink);
 
   curWord = NULL;
   charPos = 0;
@@ -2128,6 +2161,8 @@ void TextPage::clear() {
   rawWords = NULL;
   rawLastWord = NULL;
   fonts = new GooList();
+  underlines = new GooList();
+  links = new GooList();
 }
 
 void TextPage::updateFont(GfxState *state) {
@@ -2426,7 +2461,7 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
   links->append(new TextLink(xMin, yMin, xMax, yMax, link));
 }
 
-void TextPage::coalesce(GBool physLayout, GBool doHTML) {
+void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) {
   UnicodeMap *uMap;
   TextPool *pool;
   TextWord *word0, *word1, *word2;
@@ -2454,7 +2489,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   blkList = NULL;
   lastBlk = NULL;
   nBlocks = 0;
-  primaryRot = -1;
+  primaryRot = 0;
 
 #if 0 // for debugging
   printf("*** initial words ***\n");
@@ -2918,7 +2953,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
       //~ addition to primary rotation
 
       // coalesce the block, and add it to the list
-      blk->coalesce(uMap);
+      blk->coalesce(uMap, fixedPitch);
       if (lastBlk) {
        lastBlk->next = blk;
       } else {
@@ -2926,11 +2961,12 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
       }
       lastBlk = blk;
       count[rot] += blk->charCount;
-      if (primaryRot < 0 || count[rot] > count[primaryRot]) {
-       primaryRot = rot;
-      }
       ++nBlocks;
     }
+
+    if (count[rot] > count[primaryRot]) {
+      primaryRot = rot;
+    }
   }
 
 #if 0 // for debugging
@@ -2992,75 +3028,108 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   // sort blocks into xy order for column assignment
   if (blocks)
     gfree (blocks);
-  blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
-  for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
-    blocks[i] = blk;
-  }
-  qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+  if (physLayout && fixedPitch) {
 
-  // column assignment
-  for (i = 0; i < nBlocks; ++i) {
-    blk0 = blocks[i];
-    col1 = 0;
-    for (j = 0; j < i; ++j) {
-      blk1 = blocks[j];
-      col2 = 0; // make gcc happy
+    blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
+    for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+      blocks[i] = blk;
+      col1 = 0; // make gcc happy
       switch (primaryRot) {
       case 0:
-       if (blk0->xMin > blk1->xMax) {
-         col2 = blk1->col + blk1->nColumns + 3;
-       } else if (blk1->xMax == blk1->xMin) {
-         col2 = blk1->col;
-       } else {
-         col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
-                                   (blk1->xMax - blk1->xMin)) *
-                                  blk1->nColumns);
-       }
+       col1 = (int)(blk->xMin / fixedPitch + 0.5);
        break;
       case 1:
-       if (blk0->yMin > blk1->yMax) {
-         col2 = blk1->col + blk1->nColumns + 3;
-       } else if (blk1->yMax == blk1->yMin) {
-         col2 = blk1->col;
-       } else {
-         col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
-                                   (blk1->yMax - blk1->yMin)) *
-                                  blk1->nColumns);
-       }
+       col1 = (int)(blk->yMin / fixedPitch + 0.5);
        break;
       case 2:
-       if (blk0->xMax < blk1->xMin) {
-         col2 = blk1->col + blk1->nColumns + 3;
-       } else if (blk1->xMin == blk1->xMax) {
-         col2 = blk1->col;
-       } else {
-         col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
-                                   (blk1->xMin - blk1->xMax)) *
-                                  blk1->nColumns);
-       }
+       col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
        break;
       case 3:
-       if (blk0->yMax < blk1->yMin) {
-         col2 = blk1->col + blk1->nColumns + 3;
-       } else if (blk1->yMin == blk1->yMax) {
-         col2 = blk1->col;
-       } else {
-         col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
-                                   (blk1->yMin - blk1->yMax)) *
-                                  blk1->nColumns);
-       }
+       col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
        break;
       }
-      if (col2 > col1) {
-       col1 = col2;
+      blk->col = col1;
+      for (line = blk->lines; line; line = line->next) {
+       for (j = 0; j <= line->len; ++j) {
+         line->col[j] += col1;
+       }
       }
     }
-    blk0->col = col1;
-    for (line = blk0->lines; line; line = line->next) {
-      for (j = 0; j <= line->len; ++j) {
-       line->col[j] += col1;
+
+  } else {
+
+    // sort blocks into xy order for column assignment
+    blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
+    for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+      blocks[i] = blk;
+    }
+    qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+
+    // column assignment
+    for (i = 0; i < nBlocks; ++i) {
+      blk0 = blocks[i];
+      col1 = 0;
+      for (j = 0; j < i; ++j) {
+       blk1 = blocks[j];
+       col2 = 0; // make gcc happy
+       switch (primaryRot) {
+       case 0:
+         if (blk0->xMin > blk1->xMax) {
+           col2 = blk1->col + blk1->nColumns + 3;
+         } else if (blk1->xMax == blk1->xMin) {
+           col2 = blk1->col;
+         } else {
+           col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
+                                     (blk1->xMax - blk1->xMin)) *
+                                    blk1->nColumns);
+         }
+         break;
+       case 1:
+         if (blk0->yMin > blk1->yMax) {
+           col2 = blk1->col + blk1->nColumns + 3;
+         } else if (blk1->yMax == blk1->yMin) {
+           col2 = blk1->col;
+         } else {
+           col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
+                                     (blk1->yMax - blk1->yMin)) *
+                                    blk1->nColumns);
+         }
+         break;
+       case 2:
+         if (blk0->xMax < blk1->xMin) {
+           col2 = blk1->col + blk1->nColumns + 3;
+         } else if (blk1->xMin == blk1->xMax) {
+           col2 = blk1->col;
+         } else {
+           col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
+                                     (blk1->xMin - blk1->xMax)) *
+                                    blk1->nColumns);
+         }
+         break;
+       case 3:
+         if (blk0->yMax < blk1->yMin) {
+           col2 = blk1->col + blk1->nColumns + 3;
+         } else if (blk1->yMin == blk1->yMax) {
+           col2 = blk1->col;
+         } else {
+           col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
+                                     (blk1->yMin - blk1->yMax)) *
+                                    blk1->nColumns);
+         }
+         break;
+       }
+       if (col2 > col1) {
+         col1 = col2;
+       }
+      }
+      blk0->col = col1;
+      for (line = blk0->lines; line; line = line->next) {
+       for (j = 0; j <= line->len; ++j) {
+         line->col[j] += col1;
+       }
       }
     }
+
   }
 
 #if 0 // for debugging
@@ -3070,7 +3139,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
           blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
           blk->nColumns);
     for (line = blk->lines; line; line = line->next) {
-      printf("  line:\n");
+      printf("  line: col[0]=%d\n", line->col[0]);
       for (word0 = line->words; word0; word0 = word0->next) {
        printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f 
space=%d: '",
               word0->xMin, word0->xMax, word0->yMin, word0->yMax,
@@ -3470,6 +3539,7 @@ GBool TextPage::findText(Unicode *s, int len,
                         GBool startAtTop, GBool stopAtBottom,
                         GBool startAtLast, GBool stopAtLast,
                         GBool caseSensitive, GBool backward,
+                        GBool wholeWord,
                         double *xMin, double *yMin,
                         double *xMax, double *yMax) {
   TextBlock *blk;
@@ -3527,25 +3597,35 @@ GBool TextPage::findText(Unicode *s, int len,
     blk = blocks[i];
 
     // check: is the block above the top limit?
-    if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
+    // (this only works if the page's primary rotation is zero --
+    // otherwise the blocks won't be sorted in the useful order)
+    if (!startAtTop && primaryRot == 0 &&
+       (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
       continue;
     }
 
     // check: is the block below the bottom limit?
-    if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
+    // (this only works if the page's primary rotation is zero --
+    // otherwise the blocks won't be sorted in the useful order)
+    if (!stopAtBottom && primaryRot == 0 &&
+       (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
       break;
     }
 
     for (line = blk->lines; line; line = line->next) {
 
       // check: is the line above the top limit?
-      if (!startAtTop &&
+      // (this only works if the page's primary rotation is zero --
+      // otherwise the lines won't be sorted in the useful order)
+      if (!startAtTop && primaryRot == 0 &&
          (backward ? line->yMin > yStart : line->yMin < yStart)) {
        continue;
       }
 
       // check: is the line below the bottom limit?
-      if (!stopAtBottom &&
+      // (this only works if the page's primary rotation is zero --
+      // otherwise the lines won't be sorted in the useful order)
+      if (!stopAtBottom && primaryRot == 0 &&
          (backward ? line->yMin < yStop : line->yMin > yStop)) {
        continue;
       }
@@ -3564,9 +3644,9 @@ GBool TextPage::findText(Unicode *s, int len,
        for (k = 0; k < m; ++k) {
          txt[k] = unicodeToUpper(line->normalized[k]);
          }
-         } else {
+      } else {
        txt = line->normalized;
-         }
+      }
 
       // search each position in this line
       j = backward ? m - len : 0;
@@ -5211,9 +5291,11 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
 }
 
 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
-                            GBool rawOrderA, GBool append) {
+                            double fixedPitchA, GBool rawOrderA,
+                            GBool append) {
   text = NULL;
   physLayout = physLayoutA;
+  fixedPitch = physLayout ? fixedPitchA : 0;
   rawOrder = rawOrderA;
   doHTML = gFalse;
   ok = gTrue;
@@ -5246,11 +5328,13 @@ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
 }
 
 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
-                            GBool physLayoutA, GBool rawOrderA) {
+                            GBool physLayoutA, double fixedPitchA,
+                            GBool rawOrderA) {
   outputFunc = func;
   outputStream = stream;
   needClose = gFalse;
   physLayout = physLayoutA;
+  fixedPitch = physLayout ? fixedPitchA : 0;
   rawOrder = rawOrderA;
   doHTML = gFalse;
   text = new TextPage(rawOrderA);
@@ -5277,12 +5361,16 @@ void TextOutputDev::startPage(int pageNum, GfxState *state) {
 
 void TextOutputDev::endPage() {
   text->endPage();
-  text->coalesce(physLayout, doHTML);
+  text->coalesce(physLayout, fixedPitch, doHTML);
   if (outputStream) {
     text->dump(outputStream, outputFunc, physLayout);
   }
 }
 
+void TextOutputDev::restoreState(GfxState *state) {
+  text->updateFont(state);
+}
+
 void TextOutputDev::updateFont(GfxState *state) {
   text->updateFont(state);
 }
@@ -5465,10 +5553,12 @@ GBool TextOutputDev::findText(Unicode *s, int len,
                              GBool startAtTop, GBool stopAtBottom,
                              GBool startAtLast, GBool stopAtLast,
                              GBool caseSensitive, GBool backward,
+                             GBool wholeWord,
                              double *xMin, double *yMin,
                              double *xMax, double *yMax) {
   return text->findText(s, len, startAtTop, stopAtBottom,
-                       startAtLast, stopAtLast, caseSensitive, backward,
+                       startAtLast, stopAtLast,
+                       caseSensitive, backward, wholeWord,
                        xMin, yMin, xMax, yMax);
 }
 
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index fd34c8b..e31876b 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -333,7 +333,7 @@ public:
 
   void addWord(TextWord *word);
 
-  void coalesce(UnicodeMap *uMap);
+  void coalesce(UnicodeMap *uMap, double fixedPitch);
 
   // Update this block's priMin and priMax values, looking at <blk>.
   void updatePriMinMax(TextBlock *blk);
@@ -521,7 +521,7 @@ public:
   void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);
 
   // Coalesce strings that look like parts of the same line.
-  void coalesce(GBool physLayout, GBool doHTML);
+  void coalesce(GBool physLayout, double fixedPitch, GBool doHTML);
 
   // Find a string.  If <startAtTop> is true, starts looking at the
   // top of the page; else if <startAtLast> is true, starts looking
@@ -534,6 +534,7 @@ public:
                 GBool startAtTop, GBool stopAtBottom,
                 GBool startAtLast, GBool stopAtLast,
                 GBool caseSensitive, GBool backward,
+                GBool wholeWord,
                 double *xMin, double *yMin,
                 double *xMax, double *yMax);
 
@@ -676,14 +677,16 @@ public:
   // is maintained.  If <rawOrder> is true, the text is kept in
   // content stream order.
   TextOutputDev(char *fileName, GBool physLayoutA,
-               GBool rawOrderA, GBool append);
+               double fixedPitchA, GBool rawOrderA,
+               GBool append);
 
   // Create a TextOutputDev which will write to a generic stream.  If
   // <physLayoutA> is true, the original physical layout of the text
   // is maintained.  If <rawOrder> is true, the text is kept in
   // content stream order.
   TextOutputDev(TextOutputFunc func, void *stream,
-               GBool physLayoutA, GBool rawOrderA);
+               GBool physLayoutA, double fixedPitchA,
+               GBool rawOrderA);
 
   // Destructor.
   virtual ~TextOutputDev();
@@ -719,6 +722,9 @@ public:
   // End a page.
   virtual void endPage();
 
+  //----- save/restore graphics state
+  virtual void restoreState(GfxState *state);
+
   //----- update text state
   virtual void updateFont(GfxState *state);
 
@@ -754,6 +760,7 @@ public:
                 GBool startAtTop, GBool stopAtBottom,
                 GBool startAtLast, GBool stopAtLast,
                 GBool caseSensitive, GBool backward,
+                GBool wholeWord,
                 double *xMin, double *yMin,
                 double *xMax, double *yMax);
 
@@ -804,6 +811,9 @@ private:
   TextPage *text;              // text for the current page
   GBool physLayout;            // maintain original physical layout when
                                //   dumping text
+  double fixedPitch;           // if physLayout is true and this is non-zero,
+                               //   assume fixed-pitch characters with this
+                               //   width
   GBool rawOrder;              // keep text in content stream order
   GBool doHTML;                        // extra processing for HTML conversion
   GBool ok;                    // set up ok?
diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc
index 9dc1d15..398a69b 100644
--- a/qt4/src/poppler-page.cc
+++ b/qt4/src/poppler-page.cc
@@ -330,7 +330,7 @@ QString Page::text(const QRectF &r, TextLayout textLayout) const
   QString result;
   
   const GBool rawOrder = textLayout == RawOrderLayout;
-  output_dev = new TextOutputDev(0, gFalse, rawOrder, gFalse);
+  output_dev = new TextOutputDev(0, gFalse, 0, rawOrder, gFalse);
   m_page->parentDoc->doc->displayPageSlice(output_dev, m_page->index + 1, 72, 
72,
       0, false, true, false, -1, -1, -1, -1);
   if (r.isNull())
@@ -371,19 +371,19 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
   int rotation = (int)rotate * 90;
 
   // fetch ourselves a textpage
-  TextOutputDev td(NULL, gTrue, gFalse, gFalse);
+  TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse);
   m_page->parentDoc->doc->displayPage( &td, m_page->index + 1, 72, 72, 
rotation, false, true, false );
   TextPage *textPage=td.takeText();
 
   if (direction == FromTop)
     found = textPage->findText( u.data(), len, 
-            gTrue, gTrue, gFalse, gFalse, sCase, gFalse, &sLeft, &sTop, 
&sRight, &sBottom );
+            gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse, &sLeft, 
&sTop, &sRight, &sBottom );
   else if ( direction == NextResult )
     found = textPage->findText( u.data(), len, 
-            gFalse, gTrue, gTrue, gFalse, sCase, gFalse, &sLeft, &sTop, 
&sRight, &sBottom );
+            gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse, &sLeft, 
&sTop, &sRight, &sBottom );
   else if ( direction == PreviousResult )
     found = textPage->findText( u.data(), len, 
-            gFalse, gTrue, gTrue, gFalse, sCase, gTrue, &sLeft, &sTop, 
&sRight, &sBottom );
+            gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse, &sLeft, &sTop, 
&sRight, &sBottom );
 
   textPage->decRefCnt();
 
@@ -414,7 +414,7 @@ QList<TextBox*> Page::textList(Rotation rotate) const
   
   QList<TextBox*> output_list;
   
-  output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse);
+  output_dev = new TextOutputDev(0, gFalse, 0, gFalse, gFalse);
   
   int rotation = (int)rotate * 90;
 
diff --git a/test/perf-test.cc b/test/perf-test.cc
index b6fb8f8..6bdda97 100644
--- a/test/perf-test.cc
+++ b/test/perf-test.cc
@@ -840,7 +840,7 @@ static void RenderPdfAsText(const char *fileName)
 
     LogInfo("started: %s\n", fileName);
 
-    TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, gFalse, gFalse);
+    TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, 0, gFalse, 
gFalse);
     if (!textOut->isOk()) {
         delete textOut;
         return;
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 587f1a9..88fbf70 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -53,6 +53,10 @@ Maintain (as best as possible) the original physical layout of the
 text.  The default is to \'undo' physical layout (columns,
 hyphenation, etc.) and output the text in reading order.
 .TP
+.BI \-fixed " number"
+Assume fixed-pitch (or tabular) text, with the specified character
+width (in points).  This forces physical layout mode.
+.TP
 .B \-raw
 Keep the text in content stream order.  This is a hack which often
 "undoes" column formatting, etc.  Use of raw mode is no longer
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 2e7b32e..a170f1b 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -68,6 +68,7 @@ static int w = 0;
 static int h = 0;
 static GBool bbox = gFalse;
 static GBool physLayout = gFalse;
+static double fixedPitch = 0;
 static GBool rawOrder = gFalse;
 static GBool htmlMeta = gFalse;
 static char textEncName[128] = "";
@@ -97,6 +98,8 @@ static const ArgDesc argDesc[] = {
    "height of crop area in pixels (default is 0)"},
   {"-layout",  argFlag,     &physLayout,    0,
    "maintain original physical layout"},
+  {"-fixed",   argFP,       &fixedPitch,    0,
+   "assume fixed-pitch (or tabular) text"},
   {"-raw",     argFlag,     &rawOrder,      0,
    "keep strings in content stream order"},
   {"-htmlmeta", argFlag,   &htmlMeta,       0,
@@ -197,6 +200,9 @@ int main(int argc, char *argv[]) {
   }
 
   fileName = new GooString(argv[1]);
+  if (fixedPitch) {
+    physLayout = gTrue;
+  }
 
   if (textEncName[0]) {
     globalParams->setTextEncoding(textEncName);
@@ -333,7 +339,7 @@ int main(int argc, char *argv[]) {
 
   // write text file
   if (bbox) {
-    textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta);
+    textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, 
htmlMeta);
     if (!(f = fopen(textFileName->getCString(), "ab"))) {
       error(errIO, -1, "Couldn't open text file '{0:t}' for append", 
textFileName);
       exitCode = 2;
@@ -367,7 +373,7 @@ int main(int argc, char *argv[]) {
     fclose(f);
   } else {
     textOut = new TextOutputDev(textFileName->getCString(),
-                               physLayout, rawOrder, htmlMeta);
+                               physLayout, fixedPitch, rawOrder, htmlMeta);
     if (textOut->isOk()) {
       if ((w==0) && (h==0) && (x==0) && (y==0)) {
        doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 
0,
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to