On Sat, Jan 08, 2011 at 10:26:43AM +0100, Carlos Garcia Campos wrote: > Thanks for the patch!. Comments inline below > > > From 389d49e3413ce09601b574308bd6bbd46044e6b3 Mon Sep 17 00:00:00 2001 > > From: danigm <[email protected]> > > Date: Wed, 5 Jan 2011 14:07:59 +0100 > > Subject: [PATCH] [glib] Added poppler_page_get_raw_text function > > > > --- > > glib/poppler-page.cc | 54 > > +++++++++++++++++++++++++++++++++++++++++++++++++- > > glib/poppler-page.h | 1 + > > 2 files changed, 54 insertions(+), 1 deletions(-) > > > > diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc > > index a8e6b2d..8966f7e 100644 > > --- a/glib/poppler-page.cc > > +++ b/glib/poppler-page.cc > > @@ -2117,7 +2117,7 @@ poppler_page_get_crop_box (PopplerPage *page, > > PopplerRectangle *rect) > > * This array must be freed with g_free () when done. > > * > > * The position in the array represents an offset in the text returned by > > - * poppler_page_get_text() > > + * poppler_page_get_raw_text() > > Why? if they are compatible is because they return the same, I guess > get_text_layout() wants the text in reading order.
The thing is get_text_layout now uses wordlist to get the text layout
so it represents the offset returned by get_raw_text, no get_text.
Maybe we should change the name and create a new one with read order
text layout. So we will have get_text_layout and get_raw_text_layout.
>
> > * Return value: %TRUE if the page contains text, %FALSE otherwise
> > *
> > @@ -2200,3 +2200,55 @@ poppler_page_get_text_layout (PopplerPage
> > *page,
> >
> > return TRUE;
> > }
> > +
> > +/**
> > + * poppler_page_get_raw_text:
> > + * @page: A #PopplerPage
>
> You should explain here what raw_text() exactly is, and why it is
> different from get_text().
>
> > + * Return value: a pointer to the text page in raw order
> > + * as a string
>
> This is new API, add Since: 0.18 here and remember to add the symbol
> to glib/reference/poppler-sections.txt
>
> > + **/
> > +char *
> > +poppler_page_get_raw_text (PopplerPage *page)
> > +{
> > + TextPage *text;
> > + TextWordList *wordlist;
> > + TextWord *word, *nextword;
> > + char *craw_text;
> > + GooString *raw_text;
> > + int i = 0;
> > +
> > + raw_text = new GooString();
> > +
> > + g_return_val_if_fail (POPPLER_IS_PAGE (page), FALSE);
>
> s/FALSE/NULL/
>
> > + text = poppler_page_get_text_page (page);
> > + wordlist = text->makeWordList (gFalse);
> > +
> > + if (wordlist->getLength () <= 0)
> > + return NULL;
>
> You are leaking wordlist and raw_text in this early return. Delete the
> wordlist when length <= 0 and create raw_text after the if.
>
> > + for (i = 0; i < wordlist->getLength (); i++)
> > + {
> > + word = wordlist->get (i);
> > + raw_text->append (word->getText ());
>
> word->getText() returns a new allocated GooString and
> GooString::append() copies the given string, so you are leaking the
> GooString here.
>
> > + nextword = word->getNext ();
> > + if (nextword)
> > + {
> > + raw_text->append (' ');
> > + }
> > + else
> > + {
> > + raw_text->append ('\n');
> > + }
>
> Don't use braces for single line clauses. Here you could use something
> like
>
> raw_text->append (nextword ? ' ' : '\n');
>
> > + }
> > +
> > + craw_text = g_strdup (raw_text->getCString ());
>
> We can avoid this g_strdup() by using a GString instead of a
> GooString.
>
> GString *raw_text = g_string_new (NULL);
>
> raw_text = g_string_append_len (raw_text, wordText->getCString(),
> wordText->getLength());
> raw_text = g_string_append_c (raw_text, nextword ? ' ' : '\n');
>
> craw_text = g_string_free (raw_text, FALSE);
>
> > + delete wordlist;
> > + delete raw_text;
> > +
> > + return craw_text;
> > +}
> > diff --git a/glib/poppler-page.h b/glib/poppler-page.h
> > index d40c0ee..333cb23 100644
> > --- a/glib/poppler-page.h
> > +++ b/glib/poppler-page.h
> > @@ -128,6 +128,7 @@ void poppler_page_get_crop_box
> > (PopplerPage *page,
> > gboolean poppler_page_get_text_layout (PopplerPage
> > *page,
> > PopplerRectangle
> > **rectangles,
> > guint
> > *n_rectangles);
> > +char *poppler_page_get_raw_text (PopplerPage
> > *page);
> >
> > /* A rectangle on a page, with coordinates in PDF points. */
> > #define POPPLER_TYPE_RECTANGLE (poppler_rectangle_get_type ())
I've made all the previous changes. Here's the new patch.
From bc4cc4cab9ef566643d6b39f15955a75413fd3b8 Mon Sep 17 00:00:00 2001 From: danigm <[email protected]> Date: Wed, 5 Jan 2011 14:07:59 +0100 Subject: [PATCH] [glib] Added poppler_page_get_raw_text function --- glib/poppler-page.cc | 56 ++++++++++++++++++++++++++++++++++- glib/poppler-page.h | 1 + glib/reference/poppler-sections.txt | 1 + 3 files changed, 57 insertions(+), 1 deletions(-) diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index a8e6b2d..cca807c 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -2117,7 +2117,7 @@ poppler_page_get_crop_box (PopplerPage *page, PopplerRectangle *rect) * This array must be freed with g_free () when done. * * The position in the array represents an offset in the text returned by - * poppler_page_get_text() + * poppler_page_get_raw_text() * * Return value: %TRUE if the page contains text, %FALSE otherwise * @@ -2200,3 +2200,57 @@ poppler_page_get_text_layout (PopplerPage *page, return TRUE; } + +/** + * poppler_page_get_raw_text: + * @page: A #PopplerPage + * + * Raw text is the page text, order by its internal position inside + * the pdf document. + * + * Return value: a pointer to the text page in raw order + * as a string + * + * Since: 0.18 + **/ +char * +poppler_page_get_raw_text (PopplerPage *page) +{ + TextPage *text; + TextWordList *wordlist; + TextWord *word, *nextword; + char *craw_text; + GooString *wordText; + GString *raw_text; + int i = 0; + + g_return_val_if_fail (POPPLER_IS_PAGE (page), NULL); + + text = poppler_page_get_text_page (page); + wordlist = text->makeWordList (gFalse); + + if (wordlist->getLength () <= 0) + { + delete wordlist; + return NULL; + } + + raw_text = g_string_new (NULL); + + for (i = 0; i < wordlist->getLength (); i++) + { + word = wordlist->get (i); + wordText = word->getText (); + raw_text = g_string_append_len (raw_text, wordText->getCString(), wordText->getLength()); + delete wordText; + + nextword = word->getNext (); + raw_text = g_string_append_c (raw_text, nextword ? ' ' : '\n'); + } + + craw_text = g_string_free (raw_text, FALSE); + + delete wordlist; + + return craw_text; +} diff --git a/glib/poppler-page.h b/glib/poppler-page.h index d40c0ee..333cb23 100644 --- a/glib/poppler-page.h +++ b/glib/poppler-page.h @@ -128,6 +128,7 @@ void poppler_page_get_crop_box (PopplerPage *page, gboolean poppler_page_get_text_layout (PopplerPage *page, PopplerRectangle **rectangles, guint *n_rectangles); +char *poppler_page_get_raw_text (PopplerPage *page); /* A rectangle on a page, with coordinates in PDF points. */ #define POPPLER_TYPE_RECTANGLE (poppler_rectangle_get_type ()) diff --git a/glib/reference/poppler-sections.txt b/glib/reference/poppler-sections.txt index 6ad9ee2..bea4dfe 100644 --- a/glib/reference/poppler-sections.txt +++ b/glib/reference/poppler-sections.txt @@ -38,6 +38,7 @@ poppler_page_get_selected_text poppler_page_find_text poppler_page_get_text poppler_page_get_text_layout +poppler_page_get_raw_text poppler_page_get_link_mapping poppler_page_free_link_mapping poppler_page_get_image_mapping -- 1.7.3.4.742.g987cd
pgpc4M6nnAY0f.pgp
Description: PGP signature
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
