 cpp/poppler-page.cpp       |   27 +++++++++++++++++++++-
 cpp/poppler-page.h         |    5 ++++
 cpp/tests/poppler-dump.cpp |   53 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 76 insertions(+), 9 deletions(-)
New commits: commit b9333529bba43a71655fdbf1919ba515f7df9ca3 Author: Pino Toscano <[email protected]> Date: Wed Sep 15 17:23:54 2010 +0200 [cpp/tests] poppler-dump: convert out_ustring() to an operator<<(std::ostream&) so we have a chance to better output the bytearray of a string to the stream diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index e9a068c..cb0ee75 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -81,9 +81,13 @@ static void error(const std::string &msg) exit(1); } -static std::string out_ustring(const poppler::ustring &str) +std::ostream& operator<<(std::ostream& stream, const poppler::ustring &str) { - return str.to_latin1(); + const poppler::byte_array ba = str.to_utf8(); + for (unsigned int i = 0; i < ba.size(); ++i) { + stream << (char)(ba[i]); + } + return stream; } static std::string out_date(std::time_t date) @@ -174,7 +178,7 @@ static void print_info(poppler::document *doc) const std::vector<std::string> keys = doc->info_keys(); std::vector<std::string>::const_iterator key_it = keys.begin(), key_end = keys.end(); for (; key_it != key_end; ++key_it) { - std::cout << std::setw(out_width) << *key_it << ": " << out_ustring(doc->info_key(*key_it)) << std::endl; + std::cout << std::setw(out_width) << *key_it << ": " << doc->info_key(*key_it) << std::endl; } std::cout << std::setw(out_width) << "Date (creation)" << ": " << out_date(doc->info_date("CreationDate")) << std::endl; std::cout << std::setw(out_width) << "Date (modification)" << ": " << out_date(doc->info_date("ModDate")) << std::endl; @@ -205,14 +209,14 @@ static void print_perm(poppler::document *doc) static void print_metadata(poppler::document *doc) { std::cout << std::setw(out_width) << "Metadata" << ":" << std::endl - << out_ustring(doc->metadata()) << std::endl; + << doc->metadata() << std::endl; std::cout << std::endl; } static void print_toc_item(poppler::toc_item *item, int indent) { std::cout << std::setw(indent * 2) << " " - << "+ 
" << out_ustring(item->title()) << " (" << item->is_open() << ")" + << "+ " << item->title() << " (" << item->is_open() << ")" << std::endl; poppler::toc_item::iterator it = item->children_begin(), it_end = item->children_end(); for (; it != it_end; ++it) { @@ -271,7 +275,13 @@ static void print_embedded_files(poppler::document *doc) << " " << std::setw(20) << out_date(f->creation_date()) << " " << std::setw(20) << out_date(f->modification_date()) << std::endl - << " " << (f->description().empty() ? std::string("<no description>") : out_ustring(f->description())) + << " "; + if (f->description().empty()) { + std::cout << "<no description>"; + } else { + std::cout << f->description(); + } + std::cout << std::endl << " " << std::setw(35) << (f->checksum().empty() ? std::string("<no checksum>") : out_hex_string(f->checksum())) << " " << (f->mime_type().empty() ? std::string("<no mime type>") : f->mime_type()) @@ -287,7 +297,7 @@ static void print_embedded_files(poppler::document *doc) static void print_page(poppler::page *p) { std::cout << std::setw(out_width) << "Rect" << ": " << p->page_rect() << std::endl; - std::cout << std::setw(out_width) << "Label" << ": " << out_ustring(p->label()) << std::endl; + std::cout << std::setw(out_width) << "Label" << ": " << p->label() << std::endl; std::cout << std::setw(out_width) << "Duration" << ": " << p->duration() << std::endl; std::cout << std::setw(out_width) << "Orientation" << ": " << out_page_orientation(p->orientation()) << std::endl; std::cout << std::endl; @@ -295,7 +305,7 @@ static void print_page(poppler::page *p) static void print_page_text(poppler::page *p) { - std::cout << out_ustring(p->text(p->page_rect(), show_text_layout)) << std::endl; + std::cout << p->text(p->page_rect(), show_text_layout) << std::endl; std::cout << std::endl; } commit a44f711b4412332875337e9fb7509f18db806ddc Author: Pino Toscano <[email protected]> Date: Wed Sep 15 16:44:30 2010 +0200 [cpp/tests] poppler-dump: add a "--show-text 
<physical|raw>" option ... to show the text of a page in the specified layout diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index 104aaa4..e9a068c 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -23,6 +23,7 @@ #include <poppler-toc.h> #include <cstdlib> +#include <cstring> #include <ctime> #include <iomanip> #include <iostream> @@ -44,6 +45,8 @@ bool show_fonts = false; bool show_embedded_files = false; bool show_pages = false; bool show_help = false; +char show_text[32]; +poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout; static const ArgDesc the_args[] = { { "--show-all", argFlag, &show_all, 0, @@ -62,6 +65,8 @@ static const ArgDesc the_args[] = { "show the document-level embedded files" }, { "--show-pages", argFlag, &show_pages, 0, "show pages information" }, + { "--show-text", argString, &show_text, sizeof(show_text), + "show text (physical|raw) extracted from all pages" }, { "-h", argFlag, &show_help, 0, "print usage information" }, { "--help", argFlag, &show_help, 0, @@ -288,6 +293,12 @@ static void print_page(poppler::page *p) std::cout << std::endl; } +static void print_page_text(poppler::page *p) +{ + std::cout << out_ustring(p->text(p->page_rect(), show_text_layout)) << std::endl; + std::cout << std::endl; +} + int main(int argc, char *argv[]) { if (!parseArgs(the_args, &argc, argv) @@ -296,6 +307,16 @@ int main(int argc, char *argv[]) exit(1); } + if (show_text[0]) { + if (!memcmp(show_text, "physical", 9)) { + show_text_layout = poppler::page::physical_layout; + } else if (!memcmp(show_text, "raw", 4)) { + show_text_layout = poppler::page::raw_order_layout; + } else { + error(std::string("unrecognized text mode: '") + show_text + "'"); + } + } + std::string file_name(argv[1]); std::auto_ptr<poppler::document> doc(poppler::document::load_from_file(file_name)); @@ -345,6 +366,14 @@ int main(int argc, char *argv[]) print_page(p.get()); } } + if (show_text[0]) { + const int 
pages = doc->pages(); + for (int i = 0; i < pages; ++i) { + std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl; + std::auto_ptr<poppler::page> p(doc->create_page(i)); + print_page_text(p.get()); + } + } return 0; } commit 0094c9372b5b439af2564d83d6fb7439f4bdba88 Author: Pino Toscano <[email protected]> Date: Wed Sep 15 13:19:13 2010 +0200 [cpp] add a new page::text() for specifying a layout mode add a new text_layout_enum enum for the layout mode, used by the new text() make the old text() implementation call the new one with the old value (= physical) add & adapt the apidox accordingly diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 1bfb8d4..4e2f730 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -60,6 +60,12 @@ page_private::~page_private() The direction/action to follow when performing a text search. */ +/** + \enum poppler::page::text_layout_enum + + A layout of the text of a page. +*/ + page::page(document_private *doc, int index) : d(new page_private(doc, index)) @@ -234,7 +240,7 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction } /** - Returns the text in the page. + Returns the text in the page, in its physical layout. \param r if not empty, it will be extracted the text in it; otherwise, the text of the whole page @@ -243,8 +249,25 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction */ ustring page::text(const rectf &r) const { + return text(r, physical_layout); +} + +/** + Returns the text in the page. 
+ + \param rect if not empty, it will be extracted the text in it; otherwise, the + text of the whole page + \param layout_mode the layout of the text + + \returns the text of the page in the specified rect or in the whole page + + \since 0.16 + */ +ustring page::text(const rectf &r, text_layout_enum layout_mode) const +{ std::auto_ptr<GooString> s; - TextOutputDev td(0, gFalse, gFalse, gFalse); + const GBool use_raw_order = (layout_mode == raw_order_layout); + TextOutputDev td(0, gFalse, use_raw_order, gFalse); d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false); if (r.is_empty()) { const PDFRectangle *rect = d->page->getCropBox(); diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 89fdea6..7b4298a 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -44,6 +44,10 @@ public: search_next_result, search_previous_result }; + enum text_layout_enum { + physical_layout, + raw_order_layout + }; ~page(); @@ -57,6 +61,7 @@ public: bool search(const ustring &text, rectf &r, search_direction_enum direction, case_sensitivity_enum case_sensitivity, rotation_enum rotation = rotate_0) const; ustring text(const rectf &rect = rectf()) const; + ustring text(const rectf &rect, text_layout_enum layout_mode) const; private: page(document_private *doc, int index); _______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
