cpp/poppler-page.cpp | 91 +++++++++++++++++++++++++++++++++++++++++++++ cpp/poppler-page.h | 58 ++++++++++++++++++++++++++++ cpp/poppler-private.h | 9 ++++ cpp/tests/poppler-dump.cpp | 34 ++++++++++++++++ 4 files changed, 192 insertions(+)
New commits: commit 2740b3aca81a6a8c690540fc141e5923a1fff460 Author: Albert Astals Cid <[email protected]> Date: Tue Feb 27 00:47:04 2018 +0100 cpp: Add since diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index df5cb36a..93a13d18 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -1,6 +1,7 @@ /* * Copyright (C) 2009-2010, Pino Toscano <[email protected]> * Copyright (C) 2018, Suzuki Toshiya <[email protected]> + * Copyright (C) 2018, Albert Astals Cid <[email protected]> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -111,6 +112,8 @@ public: up-to-down), the std::vector contains the text in the proper order. + \since 0.63 + \note The page object owns the text_box objects as unique_ptr, the caller is not needed to free them. commit 42a6b8651f040f0960802e705b1aea82a956a63b Author: suzuki toshiya <[email protected]> Date: Tue Feb 27 00:46:18 2018 +0100 cpp: Add page::text_list diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 8913c8eb..83d48f07 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -2,6 +2,7 @@ * Copyright (C) 2009-2010, Pino Toscano <[email protected]> * Copyright (C) 2017, Albert Astals Cid <[email protected]> * Copyright (C) 2017, Jason Alan Palmer <[email protected]> + * Copyright (C) 2018, Suzuki Toshiya <[email protected]> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -285,3 +286,93 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const } return ustring::from_utf8(s->getCString()); } + +/* + * text_box object for page::text_list() + */ +text_box::~text_box() = default; + +text_box::text_box(text_box_data *data) : m_data{data} +{ +} + +ustring text_box::text() const +{ + return m_data->text; +} + +rectf text_box::bbox() const +{ + return m_data->bbox; +} + +rectf text_box::char_bbox(size_t i) const +{ + if (i < m_data->char_bboxes.size()) + return m_data->char_bboxes[i]; + return rectf(0, 0, 0, 0); +} + +bool text_box::has_space_after() const +{ + return m_data->has_space_after; +} + +std::vector<text_box> page::text_list() const +{ + std::vector<text_box> output_list; + + /* config values are same with Qt5 Page::TextList() */ + std::unique_ptr<TextOutputDev> output_dev{ + new TextOutputDev(nullptr, /* char* fileName */ + gFalse, /* GBool physLayoutA */ + 0, /* double fixedPitchA */ + gFalse, /* GBool rawOrderA */ + gFalse) /* GBool append */ + }; + + /* + * config values are same with Qt5 Page::TextList(), + * but rotation is fixed to zero. + * Few people use non-zero values. + */ + d->doc->doc->displayPageSlice(output_dev.get(), + d->index + 1, /* page */ + 72, 72, 0, /* hDPI, vDPI, rot */ + gFalse, gFalse, gFalse, /* useMediaBox, crop, printing */ + -1, -1, -1, -1, /* sliceX, sliceY, sliceW, sliceH */ + nullptr, nullptr, /* abortCheckCbk(), abortCheckCbkData */ + nullptr, nullptr, /* annotDisplayDecideCbk(), annotDisplayDecideCbkData */ + gTrue); /* copyXRef */ + + if (std::unique_ptr< TextWordList > word_list{output_dev->makeWordList()}) { + + output_list.reserve(word_list->getLength()); + for (int i = 0; i < word_list->getLength(); i ++) { + TextWord *word = word_list->get(i); + + std::unique_ptr<GooString> gooWord{word->getText()}; + ustring ustr = detail::unicode_GooString_to_ustring(gooWord.get()); + + double xMin, yMin, xMax, yMax; + word->getBBox(&xMin, &yMin, &xMax, &yMax); + + text_box tb{new text_box_data{ + ustr, + {xMin, yMin, xMax-xMin, yMax-yMin}, + {}, + word->hasSpaceAfter() == gTrue + }}; + + tb.m_data->char_bboxes.reserve(word->getLength()); + for (int j = 0; j < word->getLength(); j ++) { + word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax); + tb.m_data->char_bboxes.push_back({xMin, yMin, xMax-xMin, yMax-yMin}); + } + + output_list.push_back(std::move(tb)); + } + } + + return output_list; +} diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 7b4298a1..df5cb36a 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2009-2010, Pino Toscano <[email protected]> + * Copyright (C) 2018, Suzuki Toshiya <[email protected]> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,9 +23,45 @@ #include "poppler-global.h" #include "poppler-rectangle.h" +#include <memory> + namespace poppler { +struct text_box_data; +class POPPLER_CPP_EXPORT text_box +{ + friend class page; +public: + text_box(text_box&&) = default; + text_box& operator=(text_box&&) = default; + + ~text_box(); + + ustring text() const; + rectf bbox() const; + + /** + Get a bbox for the i-th glyph + + This method returns a rectf of the bounding box for + the i-th glyph in the text_box. + + \note The text_box object owns the rectf objects, + the caller is not needed to free them. + + \warning For too large glyph index, rectf(0,0,0,0) + is returned. The number of the glyphs and ustring + codepoints might be different in some complex scripts. + */ + rectf char_bbox(size_t i) const; + bool has_space_after() const; +private: + text_box(text_box_data *data); + + std::unique_ptr<text_box_data> m_data; +}; + class document; class document_private; class page_private; @@ -63,6 +100,24 @@ public: ustring text(const rectf &rect = rectf()) const; ustring text(const rectf &rect, text_layout_enum layout_mode) const; + /** + Returns a list of text of the page + + This method returns a std::vector of text_box that contain all + the text of the page, with roughly one text word of text + per text_box item. + + For text written in western languages (left-to-right and + up-to-down), the std::vector contains the text in the proper + order. + + \note The page object owns the text_box objects as unique_ptr, + the caller is not needed to free them. + + \warning This method is not tested with Asian scripts + */ + std::vector<text_box> text_list() const; + private: page(document_private *doc, int index); diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h index 147073d9..3753567f 100644 --- a/cpp/poppler-private.h +++ b/cpp/poppler-private.h @@ -3,6 +3,7 @@ * Copyright (C) 2013 Adrian Johnson <[email protected]> * Copyright (C) 2014, Hans-Peter Deifel <[email protected]> * Copyright (C) 2016 Jakub Alba <[email protected]> + * Copyright (C) 2018, Suzuki Toshiya <[email protected]> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -67,6 +68,14 @@ void delete_all(const Collection &c) delete_all(c.begin(), c.end()); } +struct text_box_data +{ + ustring text; + rectf bbox; + std::vector<rectf> char_bboxes; + bool has_space_after; +}; + } #endif diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index c147aad7..a1a68251 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -2,6 +2,7 @@ * Copyright (C) 2009-2010, Pino Toscano <[email protected]> * Copyright (C) 2017, 2018, Albert Astals Cid <[email protected]> * Copyright (C) 2017, Jason Alan Palmer <[email protected]> + * Copyright (C) 2018, Suzuki Toshiya <[email protected]> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -50,6 +51,7 @@ bool show_embedded_files = false; bool show_pages = false; bool show_help = false; char show_text[32]; +bool show_text_list = false; poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout; static const ArgDesc the_args[] = { @@ -71,6 +73,8 @@ static const ArgDesc the_args[] = { "show pages information" }, { "--show-text", argString, &show_text, sizeof(show_text), "show text (physical|raw) extracted from all pages" }, + { "--show-text-list", argFlag, &show_text_list, 0, + "show text list (experimental)" }, { "-h", argFlag, &show_help, 0, "print usage information" }, { "--help", argFlag, &show_help, 0, @@ -323,6 +327,28 @@ static void print_page_text(poppler::page *p) std::cout << std::endl; } +static void print_page_text_list(poppler::page *p) +{ + if (!p) { + std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl; + std::cout << std::endl; + return; + } + auto text_list = p->text_list(); + + std::cout << "---" << std::endl; + for (size_t i = 0; i < text_list.size(); i ++) { + poppler::rectf bbox = text_list[i].bbox(); + poppler::ustring ustr = text_list[i].text(); + std::cout << "[" << ustr << "] @ "; + std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )"; + std::cout << std::endl; + + } + std::cout << "---" << std::endl; +} + + int main(int argc, char *argv[]) { if (!parseArgs(the_args, &argc, argv) @@ -398,6 +424,14 @@ int main(int argc, char *argv[]) print_page_text(p.get()); } } + if (show_text_list) { + const int pages = doc->pages(); + for (int i = 0; i < pages; ++i) { + std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl; + std::unique_ptr<poppler::page> p(doc->create_page(i)); + print_page_text_list(p.get()); + } + } return 0; } _______________________________________________ poppler mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/poppler
