Poppler does not select table text in "order". It detects tables as columns, because poppler uses the distance between pieces of text to decide what is a column, so tables are selected in column order when the "logical way" is by rows.
Another selection problem caused by that heuristic appears when you have a PDF with columns that are close together or text with spaces. I looked at acroread to see how it handles column and table selection, and I realized that it selects text in "order", that is, in the order in which the text was put into the PDF file. To check that, I created a text PDF file with Inkscape. So the selection logic is simple: we select the word nearest to the first selection point and the word nearest to the last selection point, and every word between those two words (in text order, no matter where the words are on screen) is selected too. I have implemented that logic [1] and it seems to work better than the current one. I made a video showing the new logic in action [2]. To implement it I use the TextWordList in TextPage, and to get that list in the right order I create the TextOutputDev with rawOrder for selection. I have changed that only in the glib frontend, so other frontends may not work correctly. So the main implementation problem is finding the first and last indexes in the word list that delimit the selection, and that is an easy algorithm. For RTL documents I reverse the word list line by line and adjust the word selection indexes, so the algorithm works with RTL too. So, what do you think about this new selection algorithm? It seems to work better than the current one, and it is simpler, but I don't know whether I have overlooked something about selection, or perhaps performance... I attach the patch; it is divided into two commits, and the commit messages may not be *correct*. [1] http://github.com/danigm/poppler/commits/selection [2] http://www.youtube.com/watch?v=9bRH1yLCs4o
>From a5dc3df399b63ec546c89dd17cde380d7255bb66 Mon Sep 17 00:00:00 2001 From: Daniel Garcia <[email protected]> Date: Thu, 2 Sep 2010 11:20:07 +0200 Subject: [PATCH 1/2] Selecting text in raw order --- glib/poppler-page.cc | 2 +- poppler/TextOutputDev.cc | 245 ++++++++++++++++++++++++++++++++++++++++++++-- poppler/TextOutputDev.h | 8 ++ 3 files changed, 245 insertions(+), 10 deletions(-) diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index bc95e65..287726b 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -240,7 +240,7 @@ poppler_page_get_text_page (PopplerPage *page) TextOutputDev *text_dev; Gfx *gfx; - text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse); + text_dev = new TextOutputDev (NULL, gTrue, gTrue, gFalse); gfx = page->page->createGfx(text_dev, 72.0, 72.0, 0, gFalse, /* useMediaBox */ diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 576bcc9..c7107ad 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -416,6 +416,27 @@ inline int TextWord::primaryCmp(TextWord *word) { return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } +inline int TextWord::secondaryCmp(TextWord *word) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = yMin - word->yMin; + break; + case 1: + cmp = xMin - word->xMin; + break; + case 2: + cmp = word->yMax - yMax; + break; + case 3: + cmp = word->xMax - xMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 
1 : 0; +} + double TextWord::primaryDelta(TextWord *word) { double delta; @@ -2361,6 +2382,24 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { if (rawOrder) { primaryRot = 0; primaryLR = gTrue; + + // determine the primary direction + lrCount = 0; + TextWordList *wordlist = makeWordList(gFalse); + if (wordlist->getLength()) { + for (word0 = wordlist->get(0); word0; word0 = word0->next) { + for (i = 0; i < word0->len; ++i) { + if (unicodeTypeL(word0->text[i])) { + ++lrCount; + } else if (unicodeTypeR(word0->text[i])) { + --lrCount; + } + } + } + primaryLR = lrCount >= 0; + } + delete wordlist; + return; } @@ -4105,6 +4144,7 @@ public: PDFRectangle *selection); virtual void visitWord (TextWord *word, int begin, int end, PDFRectangle *selection); + void drawRegion (PDFRectangle *region); private: OutputDev *out; @@ -4184,6 +4224,21 @@ void TextSelectionPainter::visitLine (TextLine *line, state->clearPath(); } +void TextSelectionPainter::drawRegion (PDFRectangle *region) +{ + state->setFillColor(box_color); + out->updateFillColor(state); + + state->moveTo(region->x1, region->y1); + state->lineTo(region->x2, region->y1); + state->lineTo(region->x2, region->y2); + state->lineTo(region->x1, region->y2); + state->closePath(); + + out->fill(state); + state->clearPath(); +} + void TextSelectionPainter::visitWord (TextWord *word, int begin, int end, PDFRectangle *selection) { @@ -4543,6 +4598,73 @@ void TextPage::visitSelection(TextSelectionVisitor *visitor, } } +void TextPage::getSelectionWordLimits(PDFRectangle *selection, + SelectionStyle style, + double scale, + int *first, + int *last, + int *first_c, + int *last_c) { + TextWordList *wordlist = makeWordList(gFalse); + TextWord *word = NULL; + double distance, minor=-1, minor1=-1; + double xmin, ymin, xmax, ymax; + double x1, y1, x2, y2; + int tmp; + + x1 = selection->x1; + x2 = selection->x2; + + y1 = selection->y1; + y2 = selection->y2; + + for (int i=0; i<wordlist->getLength(); i++) { + word = 
wordlist->get(i); + + for (int j=0; j<word->getLength(); j++) { + word->getCharBBox(j, &xmin, &ymin, &xmax, &ymax); + + distance = fabs(x1 - xmin) + fabs(y1 - ymin); + if (minor < 0 || distance < minor) { + *first = i; + *first_c = j; + minor = distance; + } + + distance = fabs(x1 - xmin) + fabs(y1 - ymax); + if (minor < 0 || distance < minor) { + *first = i; + *first_c = j; + minor = distance; + } + + distance = fabs(x2 - xmax) + fabs(y2 - ymax); + if (minor1 < 0 || distance < minor1) { + *last = i; + *last_c = j; + minor1 = distance; + } + } + } + if (*first > *last) { + tmp = *last; + *last = *first; + *first = tmp; + + tmp = *last_c; + *last_c = *first_c; + *first_c = tmp; + } + + if (*first == *last && *first_c > *last_c) { + tmp = *last_c; + *last_c = *first_c; + *first_c = tmp; + } + + delete wordlist; +} + void TextPage::drawSelection(OutputDev *out, double scale, int rotation, @@ -4550,30 +4672,135 @@ void TextPage::drawSelection(OutputDev *out, SelectionStyle style, GfxColor *glyph_color, GfxColor *box_color) { - TextSelectionPainter painter(this, scale, rotation, - out, box_color, glyph_color); + TextSelectionPainter painter(this, scale, rotation, + out, box_color, glyph_color); + int first, last, first_c, last_c, begin, end; + TextWordList *wordlist = makeWordList(gFalse); + TextWord *word = NULL; + PDFRectangle *rect; + GooList *rlist; + + getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c); + rlist = getSelectionRegion(selection, style, scale); + for(int i=0; i<rlist->getLength(); i++) { + rect = (PDFRectangle *)rlist->get(i); + painter.drawRegion(rect); + } + + for(int i=first; i<=last; i++) { + word = wordlist->get(i); + if (i == first && i == last) { + begin = first_c; + end = last_c + 1; + } else if (i == first) { + begin = first_c; + end = word->getLength(); + } else if (i == last) { + begin = 0; + end = last_c + 1; + } else { + begin = 0; + end = word->getLength(); + } + + painter.visitWord(word, begin, end, 
selection); + } - visitSelection(&painter, selection, style); + delete wordlist; } GooList *TextPage::getSelectionRegion(PDFRectangle *selection, SelectionStyle style, double scale) { - TextSelectionSizer sizer(this, scale); + GooList *ret = new GooList(); + PDFRectangle *rect = NULL; + TextWordList *wordlist = makeWordList(gFalse); + TextWord *word=NULL, *prevword=NULL; + int first=0, last=0, first_c=0, last_c=0; + double xmin, ymin, xmax, ymax; + double xmin1, ymin1, xmax1, ymax1; + + getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c); + + for (int i=first; i<=last; i++) { + word = wordlist->get(i); + if (prevword && !word->secondaryCmp(prevword) && rect) { + if (i == last) { + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + } + else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + } + rect->x2 = xmax; + continue; + } + + if (i == first && i == last) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + rect = new PDFRectangle(xmin1, ymin1, xmax, ymax); + ret->append(rect); + } else if (i == first) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + rect = new PDFRectangle(xmin1, ymin1, xmax, ymax); + ret->append(rect); + } else if (i == last) { + word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + rect = new PDFRectangle(xmin, ymin, xmax1, ymax1); + ret->append(rect); + } else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + rect = new PDFRectangle(xmin, ymin, xmax, ymax); + ret->append(rect); + } + prevword = word; + } - visitSelection(&sizer, selection, style); + delete wordlist; - return sizer.getRegion(); + return ret; } GooString *TextPage::getSelectionText(PDFRectangle *selection, SelectionStyle style) { - TextSelectionDumper dumper(this); + GooString *ret = new GooString(); + TextWordList *wordlist = makeWordList(gFalse); + TextWord 
*word=NULL, *prevword=NULL; + int first=0, last=0, first_c=0, last_c=0; + UnicodeMap *uMap; + // get the output encoding + if (!(uMap = globalParams->getTextEncoding())) { + return ret; + } + + getSelectionWordLimits(selection, style, 1, &first, &last, &first_c, &last_c); - visitSelection(&dumper, selection, style); + for (int i=first; i<=last; i++) { + word = wordlist->get(i); + if (prevword) { + if (word->secondaryCmp(prevword)) { + ret->append('\n'); + } else { + ret->append(' '); + } + } + if (i == first && i == last) { + dumpFragment(word->text + first_c, last_c - first_c, uMap, ret); + } else if (i == first) { + dumpFragment(word->text + first_c, word->len - first_c, uMap, ret); + } else if (i == last) { + dumpFragment(word->text, last_c, uMap, ret); + } else { + dumpFragment(word->text, word->len, uMap, ret); + } + prevword = word; + } + delete wordlist; - return dumper.getText(); + return ret; } GBool TextPage::findCharRange(int pos, int length, diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 438aee4..27d0d6a 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -125,6 +125,7 @@ public: // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>), // based on a primary-axis comparison, e.g., x ordering if rot=0. int primaryCmp(TextWord *word); + int secondaryCmp(TextWord *word); // Return the distance along the primary axis between <this> and // <word>. 
@@ -581,6 +582,13 @@ private: void clear(); void assignColumns(TextLineFrag *frags, int nFrags, GBool rot); int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s); + void getSelectionWordLimits(PDFRectangle *selection, + SelectionStyle style, + double scale, + int *first, + int *last, + int *first_c, + int *last_c); GBool rawOrder; // keep text in content stream order -- 1.7.2.2.169.gb5442 >From 8a5ffeecc01b192387fed1ab6f895765a4b583e7 Mon Sep 17 00:00:00 2001 From: Daniel Garcia <[email protected]> Date: Thu, 2 Sep 2010 15:00:20 +0200 Subject: [PATCH 2/2] List of words ordered in raw mode and RTL --- poppler/TextOutputDev.cc | 184 +++++++++++++++++++++++++++++++++++----------- 1 files changed, 140 insertions(+), 44 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index c7107ad..e2bb187 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -1881,15 +1881,28 @@ TextWordList::TextWordList(TextPage *text, GBool physLayout) { TextFlow *flow; TextBlock *blk; TextLine *line; - TextWord *word; + TextWord *word, *prevword=NULL; TextWord **wordArray; int nWords, i; words = new GooList(); if (text->rawOrder) { - for (word = text->rawWords; word; word = word->next) { - words->append(word); + if (text->primaryLR) { + for (word = text->rawWords; word; word = word->next) { + words->append(word); + } + } else { + i = 0; + for (word = text->rawWords; word; word = word->next) { + if (prevword) { + if (word->secondaryCmp(prevword)) { + i = getLength(); + } + } + words->insert(i, word); + prevword = word; + } } } else if (physLayout) { @@ -4606,7 +4619,7 @@ void TextPage::getSelectionWordLimits(PDFRectangle *selection, int *first_c, int *last_c) { TextWordList *wordlist = makeWordList(gFalse); - TextWord *word = NULL; + TextWord *word=NULL; double distance, minor=-1, minor1=-1; double xmin, ymin, xmax, ymax; double x1, y1, x2, y2; @@ -4624,21 +4637,21 @@ void TextPage::getSelectionWordLimits(PDFRectangle *selection, for 
(int j=0; j<word->getLength(); j++) { word->getCharBBox(j, &xmin, &ymin, &xmax, &ymax); - distance = fabs(x1 - xmin) + fabs(y1 - ymin); + distance = fabs(x1 - xmin) + 10*fabs(y1 - ymin); if (minor < 0 || distance < minor) { *first = i; *first_c = j; minor = distance; } - distance = fabs(x1 - xmin) + fabs(y1 - ymax); + distance = fabs(x1 - xmin) + 10*fabs(y1 - ymax); if (minor < 0 || distance < minor) { *first = i; *first_c = j; minor = distance; } - distance = fabs(x2 - xmax) + fabs(y2 - ymax); + distance = fabs(x2 - xmax) + 10*fabs(y2 - ymax); if (minor1 < 0 || distance < minor1) { *last = i; *last_c = j; @@ -4646,6 +4659,38 @@ void TextPage::getSelectionWordLimits(PDFRectangle *selection, } } } + + switch (style) { + case selectionStyleGlyph: + break; + case selectionStyleLine: + for (int i=*first; i>=0; i--) { + word = wordlist->get(i); + if (!word->secondaryCmp(wordlist->get(*first))) { + *first = i; + } + } + for (int i=*last; i<wordlist->getLength(); i++) { + word = wordlist->get(i); + if (!word->secondaryCmp(wordlist->get(*last))) { + *last = i; + } + } + case selectionStyleWord: + *first_c = wordlist->get(*first)->getLength() - 1; + if (primaryLR) { + *last_c = wordlist->get(*last)->getLength() - 1; + } else { + *last_c = 0; + } + if (last == first) { + *last_c = wordlist->get(*last)->getLength() - 1; + *first_c = 0; + } + break; + default: break; + } + if (*first > *last) { tmp = *last; *last = *first; @@ -4689,18 +4734,34 @@ void TextPage::drawSelection(OutputDev *out, for(int i=first; i<=last; i++) { word = wordlist->get(i); - if (i == first && i == last) { - begin = first_c; - end = last_c + 1; - } else if (i == first) { - begin = first_c; - end = word->getLength(); - } else if (i == last) { - begin = 0; - end = last_c + 1; + if (primaryLR) { + if (i == first && i == last) { + begin = first_c; + end = last_c + 1; + } else if (i == first) { + begin = first_c; + end = word->getLength(); + } else if (i == last) { + begin = 0; + end = last_c + 1; + } else { 
+ begin = 0; + end = word->getLength(); + } } else { - begin = 0; - end = word->getLength(); + if (i == first && i == last) { + begin = first_c; + end = last_c + 1; + } else if (i == first) { + begin = 0; + end = first_c + 1; + } else if (i == last) { + begin = last_c; + end = word->getLength(); + } else { + begin = 0; + end = word->getLength(); + } } painter.visitWord(word, begin, end, selection); @@ -4731,30 +4792,52 @@ GooList *TextPage::getSelectionRegion(PDFRectangle *selection, else { word->getBBox(&xmin, &ymin, &xmax, &ymax); } - rect->x2 = xmax; + + if (primaryLR) { + rect->x2 = xmax; + } else { + rect->x1 = xmin; + } + prevword = word; continue; } - if (i == first && i == last) { - word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); - word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); - rect = new PDFRectangle(xmin1, ymin1, xmax, ymax); - ret->append(rect); - } else if (i == first) { - word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); - word->getBBox(&xmin, &ymin, &xmax, &ymax); - rect = new PDFRectangle(xmin1, ymin1, xmax, ymax); - ret->append(rect); - } else if (i == last) { - word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1); - word->getBBox(&xmin, &ymin, &xmax, &ymax); - rect = new PDFRectangle(xmin, ymin, xmax1, ymax1); - ret->append(rect); + if (primaryLR) { + if (i == first && i == last) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else if (i == first) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else if (i == last) { + word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmax = xmax1; ymax = ymax1; + } else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + } } else { - word->getBBox(&xmin, &ymin, &xmax, &ymax); - rect = new PDFRectangle(xmin, ymin, xmax, ymax); - 
ret->append(rect); + if (i == first && i == last) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else if (i == first) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmax = xmax1; ymax = ymax1; + } else if (i == last) { + word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + } } + + rect = new PDFRectangle(xmin, ymin, xmax, ymax); + ret->append(rect); prevword = word; } @@ -4787,15 +4870,28 @@ GooString *TextPage::getSelectionText(PDFRectangle *selection, ret->append(' '); } } - if (i == first && i == last) { - dumpFragment(word->text + first_c, last_c - first_c, uMap, ret); - } else if (i == first) { - dumpFragment(word->text + first_c, word->len - first_c, uMap, ret); - } else if (i == last) { - dumpFragment(word->text, last_c, uMap, ret); + if (primaryLR) { + if (i == first && i == last) { + dumpFragment(word->text + first_c, last_c+1 - first_c, uMap, ret); + } else if (i == first) { + dumpFragment(word->text + first_c, word->len - first_c, uMap, ret); + } else if (i == last) { + dumpFragment(word->text, last_c+1, uMap, ret); + } else { + dumpFragment(word->text, word->len, uMap, ret); + } } else { - dumpFragment(word->text, word->len, uMap, ret); + if (i == first && i == last) { + dumpFragment(word->text + first_c, last_c+1 - first_c, uMap, ret); + } else if (i == first) { + dumpFragment(word->text, first_c+1, uMap, ret); + } else if (i == last) { + dumpFragment(word->text + last_c, word->len, uMap, ret); + } else { + dumpFragment(word->text, word->len, uMap, ret); + } } + prevword = word; } delete wordlist; -- 1.7.2.2.169.gb5442
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
