Hello mpsuzuki,

again a commented diff is attached.

Personally, I would like to keep the return value of page::text_list a
proper value type without pointers into the object to keep the ownership
simpler.

Best regards,
Adam

Am 27.04.2018 um 04:24 schrieb suzuki toshiya:
> Hi,
> 
> Although nobody has commented on this, I updated this patch
> to work with my latest patch(es) for cpp encoding issue.
> 
> Regards,
> mpsuzuki
> 
> suzuki toshiya wrote:
>> Hi,
>>
>> Recently I heard some people wants to retrieve the list
>> of words from PDF, as cpp's poppler::page::text_list(),
>> but with the font information (e.g. the familyname of
>> the font).
>>
>> Considering that often the office document or academic
>> articles use different fonts for the section titles and
>> the main text, it would be reasonable for the people to
>> expect as "I want to retrieve the text boxes, but only
>> the text boxes written by Helvetica-Bold".
>>
>> What is the right way to do such? During the developmet
>> of poppler::page::text_list(), once I've tried to do such.
>> https://github.com/mpsuzuki/poppler/commit/8ce2556a62a90c034d7cea8b1dfd26715d03a8f0
>> (note: this patch was written before the stabilization
>> of unique_ptr utilization. more fix is expected in future)
>>
>> However, I feel it's slightly too big. Its changes are
>> not only for cpp frontend codes, but also for poppler/FontInfo.{cc,h}
>> and poppler/TextOutputDev.{cc,h}. I want to ask a few
>> questions...
>>
>> Q-1) a request for text_box with font info fits to poppler's
>> scope? is there any better library to request such feature?
>>
>> Q-2) if this request fits to poppler's scope, the enhancement
>> of the cpp frontend poppler::page::text_list() is the way to
>> go? having different API for such purpose is better?
>>
>> Q-3) my current patch modifies FontInfo and TextOutputDev
>> of libpoppler itself. such modification is acceptable?
>>
>> I appreciate if the maintainers can give some comments.
>>
>> Regards,
>> mpsuzuki
>>
>> _______________________________________________
>> poppler mailing list
>> [email protected]
>> https://lists.freedesktop.org/mailman/listinfo/poppler
>>
>>
>> _______________________________________________
>> poppler mailing list
>> [email protected]
>> https://lists.freedesktop.org/mailman/listinfo/poppler
diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h
new file mode 100644
index 0000000..09a3f7f
--- /dev/null
+++ b/cpp/poppler-font-private.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2009, Pino Toscano <[email protected]>
+ * Copyright (C) 2015, Tamas Szekeres <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "poppler-font.h"
+
+#include "poppler-document-private.h"
+
+#include "FontInfo.h"
+
+#include <algorithm>
+
+using namespace poppler;

I think using namespaces in headers is generally frowned upon, even if the header is private.

+class poppler::font_info_private
+{
+public:
+    font_info_private()
+        : type(font_info::unknown)
+        , is_embedded(false)
+        , is_subset(false)
+    {
+    }
+    font_info_private(FontInfo *fi)
+        : type((font_info::type_enum)fi->getType())
+        , is_embedded(fi->getEmbedded())
+        , is_subset(fi->getSubset())
+    {
+        if (fi->getName()) {
+            font_name = fi->getName()->getCString();
+        }
+        if (fi->getFile()) {
+            font_file = fi->getFile()->getCString();
+        }
+
+        ref.num = fi->getRef().num;
+        ref.gen = fi->getRef().gen;
+        emb_ref.num = fi->getEmbRef().num;
+        emb_ref.gen = fi->getEmbRef().gen;
+    }
+
+    std::string font_name;
+    std::string font_file;
+    font_info::type_enum type : 5;
+    bool is_embedded : 1;
+    bool is_subset : 1;
+    Ref ref;
+    Ref emb_ref;
+};
+
+
+class poppler::font_iterator_private
+{
+public:
+    font_iterator_private(int start_page, document_private *dd)
+        : font_info_scanner(dd->doc, start_page)
+        , total_pages(dd->doc->getNumPages())
+        , current_page((std::max)(start_page, 0))
+    {
+    }
+    ~font_iterator_private()
+    {
+    }
+
+    FontInfoScanner font_info_scanner;
+    int total_pages;
+    int current_page;
+};
diff --git a/cpp/poppler-font.cpp b/cpp/poppler-font.cpp
index fcf0f7a..289cbea 100644
--- a/cpp/poppler-font.cpp
+++ b/cpp/poppler-font.cpp
@@ -21,60 +21,14 @@
 
 #include "poppler-document-private.h"
 
+#include "poppler-font-private.h"
+
 #include "FontInfo.h"
 
 #include <algorithm>
 
 using namespace poppler;
 
-class poppler::font_info_private
-{
-public:
-    font_info_private()
-        : type(font_info::unknown)
-        , is_embedded(false)
-        , is_subset(false)
-    {
-    }
-    font_info_private(FontInfo *fi)
-        : type((font_info::type_enum)fi->getType())
-        , is_embedded(fi->getEmbedded())
-        , is_subset(fi->getSubset())
-    {
-        if (fi->getName()) {
-            font_name = fi->getName()->getCString();
-        }
-        if (fi->getFile()) {
-            font_file = fi->getFile()->getCString();
-        }
-    }
-
-    std::string font_name;
-    std::string font_file;
-    font_info::type_enum type : 5;
-    bool is_embedded : 1;
-    bool is_subset : 1;
-};
-
-
-class poppler::font_iterator_private
-{
-public:
-    font_iterator_private(int start_page, document_private *dd)
-        : font_info_scanner(dd->doc, start_page)
-        , total_pages(dd->doc->getNumPages())
-        , current_page((std::max)(start_page, 0))
-    {
-    }
-    ~font_iterator_private()
-    {
-    }
-
-    FontInfoScanner font_info_scanner;
-    int total_pages;
-    int current_page;
-};
-
 /**
  \class poppler::font_info poppler-font.h "poppler/cpp/poppler-font.h"
 
@@ -205,7 +159,7 @@ font_iterator::~font_iterator()
 /**
  Returns the fonts of the current page and advances to the next one.
  */
-std::vector<font_info> font_iterator::next()
+std::vector<font_info> font_iterator::next(bool doSubst)
 {
     if (!has_next()) {
         return std::vector<font_info>();
@@ -213,7 +167,7 @@ std::vector<font_info> font_iterator::next()
 
     ++d->current_page;
 
-    GooList *items = d->font_info_scanner.scan(1);
+    GooList *items = d->font_info_scanner.scan(1, doSubst);
     if (!items) {
         return std::vector<font_info>();
     }
@@ -225,6 +179,11 @@ std::vector<font_info> font_iterator::next()
     return fonts;
 }
 
+std::vector<font_info> font_iterator::next()
+{
+    return font_iterator::next(true);
+}
+

Couldn't you just use bool doSubst = true above or would that break ABI compatibility?

 /**
  \returns whether the iterator has more pages to advance to
 */
diff --git a/cpp/poppler-font.h b/cpp/poppler-font.h
index 854b7a4..8b372e4 100644
--- a/cpp/poppler-font.h
+++ b/cpp/poppler-font.h
@@ -67,6 +67,7 @@ private:
 
     font_info_private *d;
     friend class font_iterator;
+    friend class page;
 };
 
 
@@ -75,6 +76,7 @@ class POPPLER_CPP_EXPORT font_iterator : public poppler::noncopyable
 public:
     ~font_iterator();
 
+    std::vector<font_info> next(bool doSubst);
     std::vector<font_info> next();
     bool has_next() const;
     int current_page() const;
@@ -84,6 +86,8 @@ private:
 
     font_iterator_private *d;
     friend class document;
+    friend class page;
+    friend class page_private;
 };
 
 }
diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index e0c3446..19e3eb2 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -29,6 +29,7 @@ namespace poppler
 
 class document_private;
 class page_transition;
+class font_info;
 
 class page_private
 {
@@ -46,6 +47,10 @@ public:
 
     static inline page_private* get(const poppler::page *p)
     { return const_cast<poppler::page *>(p)->d; }
+
+    std::vector<font_info> font_info_cache;
+
+    size_t fill_font_info_cache();
 };
 
 }
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index da74759..df44060 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -26,6 +26,8 @@
 #include "poppler-document-private.h"
 #include "poppler-page-private.h"
 #include "poppler-private.h"
+#include "poppler-font-private.h"
+#include "poppler-font.h"
 
 #include "TextOutputDev.h"
 
@@ -48,6 +50,23 @@ page_private::~page_private()
     delete transition;
 }
 
+size_t page_private::fill_font_info_cache()
+{
+    if (font_info_cache.size() > 0)
+	return font_info_cache.size();
+
+    poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc);

Use of std::unique_ptr<poppler::font_iterator> or better yet, why not just on the stack:
poppler::font_iterator font_iterator{index, doc}

+    if (font_iterator->has_next()) {
+	/* do not trigger time-consuming substitution */
+	font_info_cache = font_iterator->next(false);
+    }
+
+    delete font_iterator;
+
+    return font_info_cache.size();
+}
+
 /**
  \class poppler::page poppler-page.h "poppler/cpp/poppler-page.h"
 
@@ -321,8 +340,25 @@ bool text_box::has_space_after() const
     return m_data->has_space_after;
 }
 
+int text_box::get_wmode(int i) const
+{
+    return m_data->wmodes[i];
+}
+
+double text_box::get_font_size() const
+{
+    return m_data->font_size;
+}
+
+std::string text_box::get_font_name(int i) const
+{
+    return m_data->font_infos[i]->name();
+}
+
 std::vector<text_box> page::text_list() const
 {
+    d->fill_font_info_cache();
+

Maybe it would suffice to just keep the font_info_cache alive in this method
as a local variable so it does not continue to take up memory in the page
without the user being able to drop it (except for dropping the page object)?
This would also not increase the size of page_private for people who do not
use this feature. (The only downside would of course be that the font infos are
not cached between separate calls to page::text_list, but I am not sure how useful
that is in any case.)

     std::vector<text_box>  output_list;
 
     /* config values are same with Qt5 Page::TextList() */
@@ -366,6 +402,7 @@ std::vector<text_box> page::text_list() const
                 {},
                 word->hasSpaceAfter() == gTrue
             }};
+            tb.m_data->font_size = word->getFontSize();
 
             tb.m_data->char_bboxes.reserve(word->getLength());
             for (int j = 0; j < word->getLength(); j ++) {
@@ -373,6 +410,18 @@ std::vector<text_box> page::text_list() const
                 tb.m_data->char_bboxes.push_back({xMin, yMin, xMax-xMin, yMax-yMin});
             }
 
+            tb.m_data->wmodes.reserve(word->getLength());
+            tb.m_data->font_infos.reserve(word->getLength());
+            for (int j = 0; j < word->getLength(); j ++) {
+                TextFontInfo* text_font_info = word->getFontInfo(j);
+                tb.m_data->wmodes.push_back(text_font_info->getWMode());
+                for (size_t k = 0; k < d->font_info_cache.size(); k ++) {
+                    if (text_font_info->matches(d->font_info_cache[k].d->ref)) {
+                        tb.m_data->font_infos.push_back(&(d->font_info_cache[k]));
+                    }
+                }
+            }
+
             output_list.push_back(std::move(tb));
         }
     }
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 93a13d1..5037af5 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -57,6 +57,12 @@ public:
      */
     rectf     char_bbox(size_t i) const;
     bool      has_space_after() const;
+
+    /* new functions missing in Qt frontend */
I am not sure whether this belongs here and if the maintainers of the Qt frontend would agree that something is missing?

+    int         get_wmode(int i = 0) const;
+    double      get_font_size() const;
+    std::string get_font_name(int i = 0) const; /* sometimes in legacy encoding */
+
 private:
     text_box(text_box_data *data);
 
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 501c4fc..f85c747 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -70,6 +70,7 @@ void delete_all(const Collection &c)
     delete_all(c.begin(), c.end());
 }
 
+class font_info;
 struct text_box_data
 {
     ~text_box_data();
@@ -78,6 +79,14 @@ struct text_box_data
     rectf bbox;
     std::vector<rectf> char_bboxes;
     bool has_space_after;
+
+    /*
+     * font_info objects are collected in document object,
+     * no need to duplicate here, just refer it
+     */

This implies that the return value of page::text_list is only
valid as long as the document object exists which quite a change
w.r.t. lifetime semantics and should be part of the public documentation.

+    std::vector<int> wmodes;
+    double font_size;
+    std::vector<font_info*> font_infos;
 };
 
 }
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index a1a6825..de0e5c3 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -340,8 +340,12 @@ static void print_page_text_list(poppler::page *p)
     for (size_t i = 0; i < text_list.size(); i ++) {
         poppler::rectf bbox = text_list[i].bbox();
         poppler::ustring ustr = text_list[i].text();
+        std::string font_name = text_list[i].get_font_name(0);
+        double font_size = text_list[i].get_font_size();
+        int wmode = text_list[i].get_wmode();
         std::cout << "[" << ustr << "] @ ";
         std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
+        std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
         std::cout << std::endl;
 
     }
diff --git a/poppler/FontInfo.cc b/poppler/FontInfo.cc
index 18f21f4..3eb6d91 100644
--- a/poppler/FontInfo.cc
+++ b/poppler/FontInfo.cc
@@ -50,7 +50,7 @@ FontInfoScanner::FontInfoScanner(PDFDoc *docA, int firstPage) {
 FontInfoScanner::~FontInfoScanner() {
 }
 
-GooList *FontInfoScanner::scan(int nPages) {
+GooList *FontInfoScanner::scan(int nPages, bool doSubst) {
   GooList *result;
   Page *page;
   Dict *resDict;
@@ -74,14 +74,14 @@ GooList *FontInfoScanner::scan(int nPages) {
     if (!page) continue;
 
     if ((resDict = page->getResourceDictCopy(xrefA))) {
-      scanFonts(xrefA, resDict, result);
+      scanFonts(xrefA, resDict, doSubst, result);
       delete resDict;
     }
     annots = page->getAnnots();
     for (int i = 0; i < annots->getNumAnnots(); ++i) {
       Object obj1 = annots->getAnnot(i)->getAppearanceResDict();
       if (obj1.isDict()) {
-        scanFonts(xrefA, obj1.getDict(), result);
+        scanFonts(xrefA, obj1.getDict(), doSubst, result);
       }
     }
   }
@@ -92,7 +92,11 @@ GooList *FontInfoScanner::scan(int nPages) {
   return result;
 }
 
-void FontInfoScanner::scanFonts(XRef *xrefA, Dict *resDict, GooList *fontsList) {
+GooList *FontInfoScanner::scan(int nPages) {
+  return FontInfoScanner::scan(nPages, true);
+}
+
+void FontInfoScanner::scanFonts(XRef *xrefA, Dict *resDict, bool doSubst, GooList *fontsList) {
   GfxFontDict *gfxFontDict;
   GfxFont *font;
 
@@ -115,7 +119,7 @@ void FontInfoScanner::scanFonts(XRef *xrefA, Dict *resDict, GooList *fontsList)
 
         // add this font to the list if not already found
         if (fonts.find(fontRef.num) == fonts.end()) {
-          fontsList->append(new FontInfo(font, xrefA));
+          fontsList->append(new FontInfo(font, xrefA, doSubst));
           fonts.insert(fontRef.num);
         }
       }
@@ -145,15 +149,18 @@ void FontInfoScanner::scanFonts(XRef *xrefA, Dict *resDict, GooList *fontsList)
         if (obj2.isStream()) {
           Object resObj = obj2.streamGetDict()->lookup("Resources");
           if (resObj.isDict() && resObj.getDict() != resDict) {
-            scanFonts(xrefA, resObj.getDict(), fontsList);
+            scanFonts(xrefA, resObj.getDict(), doSubst, fontsList);
           }
         }
       }
     }
   }
 }
+void FontInfoScanner::scanFonts(XRef *xrefA, Dict *resDict, GooList *fontsList) {
+  return FontInfoScanner::scanFonts(xrefA, resDict, true, fontsList);
+}
 
-FontInfo::FontInfo(GfxFont *font, XRef *xref) {
+FontInfo::FontInfo(GfxFont *font, XRef *xref, bool doSubst) {
   const GooString *origName;
 
   fontRef = *font->getID();
@@ -178,7 +185,7 @@ FontInfo::FontInfo(GfxFont *font, XRef *xref) {
 
   file = nullptr;
   substituteName = nullptr;
-  if (!emb)
+  if (!emb && doSubst)
   {
     SysFontType dummy;
     int dummy2;
diff --git a/poppler/FontInfo.h b/poppler/FontInfo.h
index f650146..d6d3e8c 100644
--- a/poppler/FontInfo.h
+++ b/poppler/FontInfo.h
@@ -50,7 +50,7 @@ public:
   };
     
   // Constructor.
-  FontInfo(GfxFont *fontA, XRef *xrefA);
+  FontInfo(GfxFont *fontA, XRef *xrefA, bool doSubst = true);
   // Copy constructor
   FontInfo(FontInfo& f);
   // Destructor.
@@ -91,6 +91,7 @@ public:
   ~FontInfoScanner();
 
   GooList *scan(int nPages);
+  GooList *scan(int nPages, bool doSubst);

Since this is internal you should be able to use a default parameter bool doSubst = true here.
 
 private:
 
@@ -99,6 +100,7 @@ private:
   std::set<int> fonts;
   std::set<int> visitedObjects;
 
+  void scanFonts(XRef *xrefA, Dict *resDict, bool doSubst, GooList *fontsList);
   void scanFonts(XRef *xrefA, Dict *resDict, GooList *fontsList);

Maybe also a default parameter after reoderding?

 };
 
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 4a3070a..fa8918f 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -324,10 +324,26 @@ TextFontInfo::~TextFontInfo() {
 #endif
 }
 
+GBool TextFontInfo::matches(Ref ref) {
+  return ( gfxFont->getID()->num == ref.num && gfxFont->getID()->gen == ref.gen);
+}
+
+GBool TextFontInfo::matches(Ref *ref) {
+  return ( gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen);
+}
+

Why doesn't the first one call the second one? Are both really necessary?

 GBool TextFontInfo::matches(GfxState *state) {
   return state->getFont() == gfxFont;
 }
 
+GBool TextFontInfo::matches(GfxFont *gf) {
+  return (gfxFont->getID()->num == gf->getID()->num && gfxFont->getID()->gen == gf->getID()->gen);
+}
+
+GBool TextFontInfo::matches(FontInfo *fontInfo) {
+  return (fontInfo->getRef().num == gfxFont->getID()->num && fontInfo->getRef().gen == gfxFont->getID()->gen);
+}
+
 GBool TextFontInfo::matches(TextFontInfo *fontInfo) {
   return gfxFont == fontInfo->gfxFont;
 }
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 092acd6..8244989 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -40,6 +40,7 @@
 #include "goo/gtypes.h"
 #include "GfxFont.h"
 #include "GfxState.h"
+#include "FontInfo.h"
 #include "OutputDev.h"
 
 class GooString;
@@ -83,7 +84,11 @@ public:
   TextFontInfo(const TextFontInfo &) = delete;
   TextFontInfo& operator=(const TextFontInfo &) = delete;
 
+  GBool matches(Ref ref);
+  GBool matches(Ref *ref);
   GBool matches(GfxState *state);
+  GBool matches(GfxFont *gfxFont);
+  GBool matches(FontInfo *fontInfo);
   GBool matches(TextFontInfo *fontInfo);
 
   // Get the font ascent, or a default value if the font is not set

Attachment: signature.asc
Description: OpenPGP digital signature

_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to