glib/poppler-page.cc | 168 ++++++++++++++++-------------- poppler/TextOutputDev.cc | 261 +++++++++++++++++++++++++---------------------- poppler/TextOutputDev.h | 4 3 files changed, 235 insertions(+), 198 deletions(-)
New commits: commit c55b577ce69ad4bb69f5261b3e120e92c9fdb3d0 Author: Carlos Garcia Campos <[email protected]> Date: Tue Jun 25 10:01:38 2013 +0200 glib: Use TextPage::getSelectionWords to build text layout and attributes This way we can make sure that the list of words used in poppler_page_get_text_layout and poppler_page_get_text_attributes is the same as the one used in poppler_page_get_text. This fixes the mismatch between the number of characters in the text returned by poppler_page_get_text and the number of characters returned by poppler_page_get_text_layout in some documents. diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index 631edb5..9115b78 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -1979,66 +1979,66 @@ poppler_page_get_text_layout (PopplerPage *page, guint *n_rectangles) { TextPage *text; - TextWordList *wordlist; - TextWord *word, *nextword; PopplerRectangle *rect; - int i, j; + PDFRectangle selection; + int i, j, k; guint offset = 0; guint n_rects = 0; gdouble x1, y1, x2, y2; gdouble x3, y3, x4, y4; + GooList **word_list; + int n_lines; g_return_val_if_fail (POPPLER_IS_PAGE (page), FALSE); *n_rectangles = 0; + poppler_page_get_size (page, &selection.x2, &selection.y2); text = poppler_page_get_text_page (page); - wordlist = text->makeWordList (gFalse); + word_list = text->getSelectionWords (&selection, selectionStyleGlyph, &n_lines); + if (!word_list) + return FALSE; - if (wordlist->getLength () <= 0) + n_rects += n_lines - 1; + for (i = 0; i < n_lines; i++) { - delete wordlist; - return FALSE; - } - - // Getting the array size - for (i = 0; i < wordlist->getLength (); i++) - { - word = wordlist->get (i); - n_rects += word->getLength (); - if (!word->getNext () || word->getSpaceAfter ()) - n_rects++; + GooList *line_words = word_list[i]; + n_rects += line_words->getLength() - 1; + for (j = 0; j < line_words->getLength(); j++) + { + TextWord *word = (TextWord *)line_words->get(j); + n_rects += word->getLength(); + } } - 
n_rects--; - *n_rectangles = n_rects; *rectangles = g_new (PopplerRectangle, n_rects); + *n_rectangles = n_rects; - // Calculating each char position - for (i = 0; i < wordlist->getLength (); i++) + for (i = 0; i < n_lines; i++) { - word = wordlist->get (i); - for (j = 0; j < word->getLength (); j++) + GooList *line_words = word_list[i]; + for (j = 0; j < line_words->getLength(); j++) { + TextWord *word = (TextWord *)line_words->get(j); + for (k = 0; k < word->getLength(); k++) + { + rect = *rectangles + offset; + word->getCharBBox (k, + &(rect->x1), + &(rect->y1), + &(rect->x2), + &(rect->y2)); + offset++; + } + rect = *rectangles + offset; - word->getCharBBox (j, - &(rect->x1), - &(rect->y1), - &(rect->x2), - &(rect->y2)); - offset++; - } + word->getBBox (&x1, &y1, &x2, &y2); - // adding spaces and break lines - rect = *rectangles + offset; - word->getBBox (&x1, &y1, &x2, &y2); + if (j < line_words->getLength() - 1) + { + TextWord *next_word = (TextWord *)line_words->get(j + 1); - nextword = word->getNext (); - if (nextword) - { - if (word->getSpaceAfter ()) - { - nextword->getBBox (&x3, &y3, &x4, &y4); + next_word->getBBox(&x3, &y3, &x4, &y4); // space is from one word to other and with the same height as // first word. 
rect->x1 = x2; @@ -2046,20 +2046,23 @@ poppler_page_get_text_layout (PopplerPage *page, rect->x2 = x3; rect->y2 = y2; offset++; - } - } - else if (offset < n_rects) + } + } + + if (i < n_lines - 1 && offset > 0) { - // end of line - rect->x1 = x2; - rect->y1 = y2; - rect->x2 = x2; - rect->y2 = y2; - offset++; - } + // end of line + rect->x1 = x2; + rect->y1 = y2; + rect->x2 = x2; + rect->y2 = y2; + offset++; + } + + delete line_words; } - delete wordlist; + gfree (word_list); return TRUE; } @@ -2122,53 +2125,62 @@ GList * poppler_page_get_text_attributes (PopplerPage *page) { TextPage *text; - TextWordList *wordlist; + PDFRectangle selection; + GooList **word_list; + int n_lines; PopplerTextAttributes *attrs = NULL; - gint i, offset = 0; + TextWord *word, *prev_word = NULL; + gint word_i, prev_word_i; + gint i, j; + gint offset = 0; GList *attributes = NULL; g_return_val_if_fail (POPPLER_IS_PAGE (page), NULL); + poppler_page_get_size (page, &selection.x2, &selection.y2); text = poppler_page_get_text_page (page); - wordlist = text->makeWordList (gFalse); + word_list = text->getSelectionWords (&selection, selectionStyleGlyph, &n_lines); + if (!word_list) + return NULL; - if (wordlist->getLength () <= 0) + for (i = 0; i < n_lines; i++) { - delete wordlist; - return NULL; - } - - TextWord *word, *prev_word = NULL; - gint word_i, prev_word_i; + GooList *line_words = word_list[i]; + for (j = 0; j < line_words->getLength(); j++) + { + word = (TextWord *)line_words->get(j); - // Calculating each word attributes - for (i = 0; i < wordlist->getLength (); i++) - { - word = wordlist->get (i); + for (word_i = 0; word_i < word->getLength (); word_i++) + { + if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i)) + { + attrs = poppler_text_attributes_new_from_word (word, word_i); + attrs->start_index = offset; + attributes = g_list_prepend (attributes, attrs); + } + attrs->end_index = offset; + offset++; + prev_word = word; + prev_word_i = word_i; + 
} - for (word_i = 0; word_i < word->getLength (); word_i++) - { - if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i)) + if (j < line_words->getLength() - 1) { - attrs = poppler_text_attributes_new_from_word (word, word_i); - attrs->start_index = offset; - attributes = g_list_prepend (attributes, attrs); + attrs->end_index = offset; + offset++; } - attrs->end_index = offset; - offset++; - prev_word = word; - prev_word_i = word_i; - } - if (!word->getNext () || word->getSpaceAfter ()) + } + + if (i < n_lines - 1) { attrs->end_index = offset; offset++; } + + delete line_words; } - if (attrs) - attrs->end_index--; - delete wordlist; + gfree (word_list); return g_list_reverse(attributes); } commit fc534f571315c064005515c19d7d70ad3af1563e Author: Carlos Garcia Campos <[email protected]> Date: Tue Jun 25 10:05:01 2013 +0200 TextOutputDev: add a method to TextPage to get the selection as a list of words Returns a list of lines of words. diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 4ef5963..928e95a 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -4043,6 +4043,7 @@ public: void endPage(); GooString *getText(void); + GooList **getWordList(int *nLines); private: @@ -4177,6 +4178,29 @@ GooString *TextSelectionDumper::getText (void) return text; } +GooList **TextSelectionDumper::getWordList(int *nLinesOut) +{ + int i, j; + + if (nLines == 0) + return NULL; + + GooList **wordList = (GooList **)gmallocn(nLines, sizeof(GooList *)); + + for (i = 0; i < nLines; i++) { + GooList *lineWords = lines[i]; + wordList[i] = new GooList(); + for (j = 0; j < lineWords->getLength(); j++) { + TextWordSelection *sel = (TextWordSelection *)lineWords->get(j); + wordList[i]->append(sel->word); + } + } + + *nLinesOut = nLines; + + return wordList; +} + class TextSelectionSizer : public TextSelectionVisitor { public: TextSelectionSizer(TextPage *page, double scale); @@ -4751,6 +4775,18 @@ GooString 
*TextPage::getSelectionText(PDFRectangle *selection, return dumper.getText(); } +GooList **TextPage::getSelectionWords(PDFRectangle *selection, + SelectionStyle style, + int *nLines) +{ + TextSelectionDumper dumper(this); + + visitSelection(&dumper, selection, style); + dumper.endPage(); + + return dumper.getWordList(nLines); +} + GBool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) { diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 664f9d1..6269f1c 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -563,6 +563,10 @@ public: GooString *getSelectionText(PDFRectangle *selection, SelectionStyle style); + GooList **getSelectionWords(PDFRectangle *selection, + SelectionStyle style, + int *nLines); + // Find a string by character position and length. If found, sets // the text bounding rectangle and returns true; otherwise returns // false. commit a924246b7534e86165f8e9ab6c60d56b73a17b94 Author: Carlos Garcia Campos <[email protected]> Date: Tue Jun 25 09:57:48 2013 +0200 TextOutputDev: simplify the text selection dumper Build a list of lines of words and don't try to format the text when detecting tables, simply add the words and lines in the right order. 
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 2872f02..4ef5963 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -4039,26 +4039,55 @@ public: int edge_end, PDFRectangle *selection); virtual void visitWord (TextWord *word, int begin, int end, - PDFRectangle *selection) { }; + PDFRectangle *selection); + void endPage(); GooString *getText(void); private: - TextLineFrag *frags; - int nFrags, fragsSize; + + void startLine(); + void finishLine(); + + GooList **lines; + int nLines, linesSize; + GooList *words; + int tableId; + TextBlock *currentBlock; }; TextSelectionDumper::TextSelectionDumper(TextPage *page) : TextSelectionVisitor(page) { - fragsSize = 256; - frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag)); - nFrags = 0; + linesSize = 256; + lines = (GooList **)gmallocn(linesSize, sizeof(GooList *)); + nLines = 0; + + tableId = -1; + currentBlock = NULL; + words = NULL; } TextSelectionDumper::~TextSelectionDumper() { - gfree(frags); + for (int i = 0; i < nLines; i++) + deleteGooList(lines[i], TextWordSelection); + gfree(lines); +} + +void TextSelectionDumper::startLine() +{ + finishLine(); + words = new GooList(); +} + +void TextSelectionDumper::finishLine() +{ + if (words && words->getLength() > 0) + lines[nLines++] = words; + else if (words) + delete words; + words = NULL; } void TextSelectionDumper::visitLine (TextLine *line, @@ -4068,130 +4097,84 @@ void TextSelectionDumper::visitLine (TextLine *line, int edge_end, PDFRectangle *selection) { - if (nFrags == fragsSize) { - fragsSize *= 2; - frags = (TextLineFrag *) grealloc(frags, fragsSize * sizeof(TextLineFrag)); + TextLineFrag frag; + + if (nLines == linesSize) { + linesSize *= 2; + lines = (GooList **)grealloc(lines, linesSize * sizeof(GooList *)); + } + + frag.init(line, edge_begin, edge_end - edge_begin); + + if (tableId >= 0 && frag.line->blk->tableId < 0) { + finishLine(); + + tableId = -1; + currentBlock = NULL; } - frags[nFrags].init(line, 
edge_begin, edge_end - edge_begin); - ++nFrags; + if (frag.line->blk->tableId >= 0) { // a table + if (tableId == -1) { + tableId = frag.line->blk->tableId; + currentBlock = frag.line->blk; + } + + if (currentBlock == frag.line->blk) { // the same block + startLine(); + } else { // another block + if (currentBlock->tableEnd) { // previous block ended its row + startLine(); + } + currentBlock = frag.line->blk; + } + } else { // not a table + startLine(); + } +} +void TextSelectionDumper::visitWord (TextWord *word, int begin, int end, + PDFRectangle *selection) +{ + words->append(new TextWordSelection(word, begin, end)); +} + +void TextSelectionDumper::endPage() +{ + finishLine(); } GooString *TextSelectionDumper::getText (void) { - GooString *s; - TextLineFrag *frag; + GooString *text; int i, j; UnicodeMap *uMap; char space[8], eol[16]; int spaceLen, eolLen; - GooList *strings = NULL; - int actual_table = -1; - int actual_line = -1; - int last_length = 0; - TextBlock *actual_block = NULL; - - s = new GooString(); - uMap = globalParams->getTextEncoding(); + text = new GooString(); - if (uMap == NULL) - return s; + if (!(uMap = globalParams->getTextEncoding())) + return text; spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); - if (nFrags > 0) { - for (i = 0; i < nFrags; ++i) { - frag = &frags[i]; - - if (actual_table >= 0 && frag->line->blk->tableId < 0) { - for (j = 0; j < strings->getLength (); j++) { - s->append ((GooString*) strings->get (j)); - s->append (eol, eolLen); - delete ((GooString*) strings->get (j)); - } - delete strings; - strings = NULL; - actual_table = -1; - actual_line = -1; - actual_block = NULL; - } - - // a table - if (frag->line->blk->tableId >= 0) { - if (actual_table == -1) { - strings = new GooList(); - actual_table = frag->line->blk->tableId; - actual_block = frag->line->blk; - actual_line = -1; - } - - // the same block - if (actual_block == frag->line->blk) { - actual_line++; - if 
(actual_line >= strings->getLength ()) { - GooString *t = new GooString (); - // add some spaces to have this block correctly aligned - if (actual_line > 0) - for (j = 0; j < ((GooString*) (strings->get (actual_line - 1)))->getLength() - last_length - 1; j++) - t->append (space, spaceLen); - strings->append (t); - } - } - // another block - else { - // previous block ended its row - if (actual_block->tableEnd) { - for (j = 0; j < strings->getLength (); j++) { - s->append ((GooString*) strings->get (j)); - s->append (eol, eolLen); - delete ((GooString*) strings->get (j)); - } - delete strings; - - strings = new GooList(); - GooString *t = new GooString (); - strings->append (t); - } - actual_block = frag->line->blk; - actual_line = 0; - } - - page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line))); - last_length = frag->len; + for (i = 0; i < nLines; i++) { + GooList *lineWords = lines[i]; + for (j = 0; j < lineWords->getLength(); j++) { + TextWordSelection *sel = (TextWordSelection *)lineWords->get(j); - if (!frag->line->blk->tableEnd) { - ((GooString*) strings->get (actual_line))->append (space, spaceLen); - } - } - // not a table - else { - page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s); - if (i < nFrags - 1) { - s->append (eol, eolLen); - } - } - } - - if (strings != NULL) { - for (j = 0; j < strings->getLength (); j++) { - s->append((GooString*) strings->get (j)); - s->append(eol, eolLen); - delete ((GooString*) strings->get (j)); - } - delete strings; - strings = NULL; - actual_table = -1; - actual_line = -1; - actual_block = NULL; + page->dumpFragment (sel->word->text + sel->begin, sel->end - sel->begin, uMap, text); + if (j < lineWords->getLength() - 1) + text->append(space, spaceLen); } + if (i < nLines - 1) + text->append(eol, eolLen); } uMap->decRefCnt(); - return s; + return text; } class TextSelectionSizer : public TextSelectionVisitor { @@ -4763,6 +4746,7 @@ GooString 
*TextPage::getSelectionText(PDFRectangle *selection, TextSelectionDumper dumper(this); visitSelection(&dumper, selection, style); + dumper.endPage(); return dumper.getText(); } commit c849094a2daf896d085937adff1f7659a09da062 Author: Carlos Garcia Campos <[email protected]> Date: Mon Jun 24 18:29:11 2013 +0200 TextOutputDev: Move TextSelection class from TextSelectionPainter to TextSelectionVisitor So that it can be used by other TextSelectionVisitor implementations. Also renamed it as TextWordSelection since it contains a word selection. diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index fe051f6..2872f02 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3999,6 +3999,21 @@ public: PDFRectangle *selection) = 0; protected: + + class TextWordSelection { + public: + TextWordSelection(TextWord *word, int begin, int end) + : word(word), + begin(begin), + end(end) + { + } + + TextWord *word; + int begin; + int end; + }; + TextPage *page; }; @@ -4265,20 +4280,6 @@ private: GfxState *state; GooList *selectionList; Matrix ctm, ictm; - - class TextSelection { - public: - TextSelection(TextWord *word, int begin, int end) - : word(word), - begin(begin), - end(end) - { - } - - TextWord *word; - int begin; - int end; - }; }; TextSelectionPainter::TextSelectionPainter(TextPage *page, @@ -4310,7 +4311,7 @@ TextSelectionPainter::TextSelectionPainter(TextPage *page, TextSelectionPainter::~TextSelectionPainter() { - deleteGooList(selectionList, TextSelection); + deleteGooList(selectionList, TextWordSelection); delete state; } @@ -4350,7 +4351,7 @@ void TextSelectionPainter::visitLine (TextLine *line, void TextSelectionPainter::visitWord (TextWord *word, int begin, int end, PDFRectangle *selection) { - selectionList->append(new TextSelection(word, begin, end)); + selectionList->append(new TextWordSelection(word, begin, end)); } void TextSelectionPainter::endPage() @@ -4362,7 +4363,7 @@ void TextSelectionPainter::endPage() 
out->updateFillColor(state); for (int i = 0; i < selectionList->getLength(); i++) { - TextSelection *sel = (TextSelection *) selectionList->get(i); + TextWordSelection *sel = (TextWordSelection *) selectionList->get(i); int begin = sel->begin; while (begin < sel->end) { _______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
