Hi all,
I used poppler_page_get_selection_region() to find the rectangle
of each and every line in a page.
From those rectangles I find the blocks, and then I find the columns of the
page. From the number of columns of the page, I am able to sort the blocks,
so that the selection is very good.
Right now in poppler the selection is a bit of a problem. After doing all
this, it looks almost like Adobe Reader's selection.
Please give me suggestions on improving this.
I attached two files with this mail.
getcol.c is able to sort the blocks in single-column and multi-column PDFs.
getcolumn.c builds on that sorting and uses it to do the selection.
Thanks
--
A Srinivas
#include <poppler.h>
#include <poppler-document.h>
#include <poppler-page.h>
#include <string.h>
#include <stdlib.h>
#include <gtk/gtk.h>
#include <sys/time.h>
/* Zero-based index of the page that expose_event_cb renders; changed by the
   "<<" / ">>" button handlers. */
int page_no; // pageno
/* Page count of the document, refreshed by expose_event_cb on each redraw. */
int npages; // num pages
/* Number of text columns detected on the current page. */
int pdf_column = 1; // default single column
/* One rectangle per detected column; allocated by page_set_columns_boundary
   and read by find_column and the block-merging helpers. */
PopplerRectangle **column_rect; // find the boundaries of the columns
/* Selection drag rectangle: (bx1, by1) is written by the button-press handler,
   (bx2, by2) by the motion and button-release handlers. */
gdouble bx1 = 0, by1=0, bx2=0, by2=0;
/* Id returned by g_signal_connect for the motion handler installed on button
   press; used to disconnect it on button release. */
gint handler_id;
/* Wall-clock timestamps taken at button press and button release. */
struct timeval start_time, end_time;
/*
 * Tell whether two rectangles overlap with a non-empty interior.
 * Rectangles that only touch along an edge are reported as not overlapping.
 */
gboolean
block_overlap (PopplerRectangle *block, PopplerRectangle *area)
{
    /* Separated horizontally (or just touching). */
    if (block->x1 >= area->x2 || block->x2 <= area->x1)
        return FALSE;

    /* Separated vertically (or just touching). */
    if (block->y1 >= area->y2 || block->y2 <= area->y1)
        return FALSE;

    return TRUE;
}
/*
 * Overlap test with a 3-unit vertical tolerance: BLOCK still counts as
 * overlapping AREA when its bottom edge ends less than 3 units above the
 * top edge of AREA.  The test is therefore not symmetric in its arguments.
 */
gboolean
rectangle_overlap (PopplerRectangle *block, PopplerRectangle *area)
{
    /* Separated horizontally (or just touching). */
    if (block->x1 >= area->x2 || block->x2 <= area->x1)
        return FALSE;

    /* BLOCK starts at or below the bottom of AREA. */
    if (block->y1 >= area->y2)
        return FALSE;

    /* BLOCK ends 3 or more units above the top of AREA. */
    if (block->y2 <= area->y1 - 3)
        return FALSE;

    return TRUE;
}
/*
 * Feed one more line rectangle into the block currently being accumulated.
 *
 * blocks  - in/out list of finished blocks (PopplerRectangle *).
 * block   - the block being accumulated; updated in place.
 * newrect - the next line rectangle; only read, never stored.
 * flag    - TRUE when NEWRECT is the last rectangle of the page.
 *
 * If NEWRECT overlaps BLOCK (see rectangle_overlap) BLOCK grows to the union
 * of the two.  Otherwise, or when FLAG is set, the accumulated block is
 * finished: it is merged into every already-finished block it overlaps, or
 * appended as a new list entry when it overlaps none, and BLOCK restarts from
 * NEWRECT.  When NEWRECT is both the last rectangle and a new block, BLOCK
 * itself (the caller's pointer) is appended as well.
 */
void find_block(GList **blocks, PopplerRectangle *block,PopplerRectangle *newrect,gboolean flag)
{
    gboolean starts_new_block = FALSE;
    gboolean absorbed = FALSE;
    PopplerRectangle finished;
    GList *node;

    if (rectangle_overlap (block, newrect)) {
        /* Grow the accumulating block to cover the new line. */
        if (block->x1 > newrect->x1) block->x1 = newrect->x1;
        if (block->x2 < newrect->x2) block->x2 = newrect->x2;
        if (block->y1 > newrect->y1) block->y1 = newrect->y1;
        if (block->y2 < newrect->y2) block->y2 = newrect->y2;
    } else {
        starts_new_block = TRUE;
    }

    if (!flag && !starts_new_block)
        return;

    /* Snapshot the finished block on the stack; a heap copy is made only if
       it really has to be appended (the original leaked it otherwise). */
    finished = *block;

    /* The accumulator restarts from the new line. */
    block->x1 = newrect->x1;
    block->y1 = newrect->y1;
    block->x2 = newrect->x2;
    block->y2 = newrect->y2;

    /* Merge into every existing block that overlaps the finished one. */
    for (node = *blocks; node != NULL; node = node->next) {
        PopplerRectangle *existing = (PopplerRectangle *) node->data;

        if (!rectangle_overlap (existing, &finished))
            continue;
        if (finished.x1 < existing->x1) existing->x1 = finished.x1;
        if (finished.y1 < existing->y1) existing->y1 = finished.y1;
        if (finished.x2 > existing->x2) existing->x2 = finished.x2;
        if (finished.y2 > existing->y2) existing->y2 = finished.y2;
        absorbed = TRUE;
    }

    if (!absorbed) {
        PopplerRectangle *copy = g_new (PopplerRectangle, 1);

        *copy = finished;
        *blocks = g_list_append (*blocks, copy);
    }

    /* Last line of the page and it opened a block of its own. */
    if (flag && starts_new_block)
        *blocks = g_list_append (*blocks, block);
}
/*
 * TRUE when the horizontal extents [x1, x2] of the two rectangles intersect
 * (touching counts).  Only the x coordinates are examined.
 */
gboolean y_axis_overlap (PopplerRectangle *a, PopplerRectangle *b)
{
    if (a->x1 > b->x2)      /* a lies entirely to the right of b */
        return FALSE;
    if (a->x2 < b->x1)      /* a lies entirely to the left of b */
        return FALSE;
    return TRUE;
}
/*
 * TRUE when the vertical extents [y1, y2] of the two rectangles intersect
 * (touching counts).  Only the y coordinates are examined.
 */
gboolean x_axis_overlap (PopplerRectangle *a, PopplerRectangle *b)
{
    if (a->y1 > b->y2)      /* a lies entirely below b */
        return FALSE;
    if (a->y2 < b->y1)      /* a lies entirely above b */
        return FALSE;
    return TRUE;
}
/*
 * Return the index of the first column rectangle in column_rect that
 * overlaps A (see block_overlap), or -1 when A overlaps no column.
 * The original fell off the end without returning a value in that case.
 */
gint find_column(PopplerRectangle *a)
{
    int i;

    for (i = 0; i < pdf_column; i++) {
        if (block_overlap (a, column_rect[i]))
            return i;
    }
    return -1;
}
/*
 * GCompareFunc-style ordering for blocks inside one group: by column index
 * first, then top edge (y1), then left edge (x1).
 * Returns -1, 0 or 1.  Every path now returns a value; coordinates that do
 * not compare either way are treated as equal.
 */
gint sort_groups(PopplerRectangle *a, PopplerRectangle *b)
{
    int c1 = find_column (a);
    int c2 = find_column (b);

    if (c1 != c2)
        return (c1 < c2) ? -1 : 1;

    if (a->y1 < b->y1) return -1;
    if (a->y1 > b->y1) return 1;

    if (a->x1 < b->x1) return -1;
    if (a->x1 > b->x1) return 1;

    return 0;
}
/*
 * GCompareFunc-style ordering by top edge (y1), ascending.
 * Returns -1 when A comes first, 1 when B comes first, 0 otherwise, so
 * there is always a defined return value.
 */
gint sort_yaxis(PopplerRectangle *a, PopplerRectangle *b)
{
    if (a->y1 < b->y1)
        return -1;
    if (a->y1 > b->y1)
        return 1;
    return 0;
}
/*
 * GCompareFunc-style ordering by block height (y2 - y1), tallest first.
 * Returns -1 when A is taller, 1 when B is taller, 0 otherwise, so there is
 * always a defined return value.
 */
gint compare_func (PopplerRectangle *a, PopplerRectangle *b)
{
    gdouble heightA = a->y2 - a->y1;
    gdouble heightB = b->y2 - b->y1;

    if (heightA > heightB)
        return -1;
    if (heightA < heightB)
        return 1;
    return 0;
}
/* TRUE when all four coordinates of the two rectangles compare exactly equal. */
gboolean rect_equal (PopplerRectangle *a, PopplerRectangle *b)
{
    if (a->x1 != b->x1) return FALSE;
    if (a->y1 != b->y1) return FALSE;
    if (a->x2 != b->x2) return FALSE;
    if (a->y2 != b->y2) return FALSE;
    return TRUE;
}
/*
 * TRUE when RECT ends strictly above the top edge of the group delimiter
 * GRPRECT, i.e. RECT belongs to the group that GRPRECT closes.
 */
gboolean check_group(PopplerRectangle *rect, PopplerRectangle *grprect)
{
    return (rect->y2 < grprect->y1) ? TRUE : FALSE;
}
/*
 * Decide whether the line rectangle AREA should be treated as lying inside
 * BLOCK vertically.
 *
 * AREA is accepted at once when its whole y-range lies inside BLOCK's.
 * Otherwise AREA is shrunk to 80% of its height and tested again:
 *   topbottom == TRUE  -> the top 20% is cut off    (y1 moves down);
 *   otherwise          -> the bottom 20% is cut off (y2 moves up).
 *
 * Side effect: on that second path AREA is modified in place and stays
 * modified whatever the result.
 */
gboolean
check_rectangle_overlap (PopplerRectangle *block, PopplerRectangle *area, gboolean topbottom)
{
    gdouble height;

    if (area->y1 >= block->y1 && area->y2 <= block->y2)
        return TRUE;

    height = area->y2 - area->y1;
    if (topbottom == TRUE) {
        area->y1 = area->y2 - (height*0.8);
    } else {
        area->y2 = area->y2 - (height*0.2);
    }

    if (area->y1 < block->y1)
        return FALSE;
    if (area->y2 > block->y2)
        return FALSE;
    return TRUE;
}
void
page_boundary (GList *blocks, double *xmin, double *ymin, double *xmax, double *ymax, int *minx_index, int *maxx_index, int *factor)
{
int i,j;
int length = g_list_length (blocks);
for (i=0; i<length; i++)
{
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (blocks, i);
if(*xmin > pr->x1) { *xmin = pr->x1; *minx_index = i; }
if(*ymin > pr->y1) { *ymin = pr->y1; }
if(*xmax < pr->x2) { *xmax = pr->x2; *maxx_index = i; }
if(*ymax < pr->y2) { *ymax = pr->y2; }
}
PopplerRectangle *minxblock = (PopplerRectangle *) g_list_nth_data (blocks, *minx_index);
if (minxblock && (minxblock->y2 - minxblock->y1) < 60) {
PopplerRectangle *temp = g_new (PopplerRectangle, 1);
for(i=*xmin; i<(*xmin+(minxblock->x2 - minxblock->x1)); i++) {
temp->x1 = *xmin;
temp->x2 = i;
temp->y1 = minxblock->y1;
temp->y2 = minxblock->y2;
int overlap = 0;
for(j=0; j<length; j++) {
PopplerRectangle *pr = (PopplerRectangle *) g_list_nth_data (blocks, j);
if (y_axis_overlap(temp, pr))
overlap++;
if (overlap > 1)
break;
}
if (overlap > 1) {
*factor = i - *xmin;
break;
}
}
}
PopplerRectangle *maxxblock = (PopplerRectangle *) g_list_nth_data (blocks, *maxx_index);
if (maxxblock && (maxxblock->y2 - maxxblock->y1) < 60) {
PopplerRectangle *temp = g_new (PopplerRectangle, 1);
for(i=*xmax; i>maxxblock->x1; i--) {
temp->x1 = i;
temp->x2 = *xmax;
temp->y1 = minxblock->y1;
temp->y2 = minxblock->y2;
int overlap = 0;
for(j=0; j<length; j++) {
PopplerRectangle *pr = (PopplerRectangle *) g_list_nth_data (blocks, j);
if (y_axis_overlap(temp, pr))
overlap++;
if (overlap > 1)
break;
}
if (overlap > 1) {
*factor = *factor + *xmax - i;
break;
}
}
}
}
/*
 * Estimate the number of text columns on the page.
 *
 * The first block in BLOCKS wider than 60 units is taken as one column; the
 * result is the largest count n-1 such that (n-1) of those widths still fit
 * below PAGE_WIDTH, with a minimum of 1.  The caller in this file sorts
 * BLOCKS tallest-first beforehand, so that block is the tallest wide one.
 *
 * Returns 1 (single column) when the list is empty or holds no block wider
 * than 60 units; the original returned an undefined value in that case.
 */
gint
page_find_column(GList *blocks, int page_width)
{
    GList *node;

    for (node = blocks; node != NULL; node = node->next) {
        PopplerRectangle *tmp = (PopplerRectangle *) node->data;
        gdouble width = (tmp->x2 - tmp->x1);
        int n = 2;

        if (width <= 60)
            continue;

        while (n*width < page_width)
            n++;
        return n-1;
    }
    return 1;
}
/*
 * (Re)build the global column_rect array: PDF_COLUMN equally wide
 * rectangles laid side by side, starting at XMIN + FACTOR, each
 * (PAGE_WIDTH - FACTOR) / PDF_COLUMN wide and spanning YMIN..YMAX.
 *
 * The array built by the previous call is released first; the original
 * overwrote the global pointer and leaked it on every redraw.
 */
void
page_set_columns_boundary (gdouble page_width, int factor, int pdf_column, double xmin, double ymin, double ymax)
{
    /* Number of entries in the array this function allocated last time. */
    static int allocated_columns = 0;
    int i;
    gdouble col_width = (page_width - factor)/pdf_column;

    if (column_rect != NULL) {
        for (i = 0; i < allocated_columns; i++)
            g_free (column_rect[i]);
        g_free (column_rect);
    }

    column_rect = g_new (PopplerRectangle*, pdf_column);
    allocated_columns = pdf_column;

    xmin += factor;
    for (i = 0; i < pdf_column; i++) {
        column_rect[i] = g_new (PopplerRectangle, 1);
        column_rect[i]->x1 = xmin;
        column_rect[i]->y1 = ymin;
        column_rect[i]->x2 = xmin+col_width;
        column_rect[i]->y2 = ymax;
        xmin = xmin+col_width;
    }
}
/*
 * Within each column, replace every pair of blocks whose vertical extents
 * intersect (x_axis_overlap) by their common bounding box: both rectangles
 * are overwritten with the union, so duplicates remain in the list.
 *
 * A block is assigned to the first column that contains its x-range
 * entirely.  When no column contains it, the column index found for the
 * previously examined block is kept (the indices are not reset).
 */
void
page_merge_small_blocks(GList *blocks, int pdf_column)
{
    int col, outer_col = 0, inner_col = 0;
    GList *outer, *inner;

    for (outer = blocks; outer != NULL; outer = outer->next) {
        PopplerRectangle *first = (PopplerRectangle *) outer->data;

        for (col = 0; col < pdf_column; col++) {
            if (first->x1 >= column_rect[col]->x1 && first->x2 <= column_rect[col]->x2) {
                outer_col = col;
                break;
            }
        }

        for (inner = blocks; inner != NULL; inner = inner->next) {
            PopplerRectangle *second;
            gdouble left, top, right, bottom;

            if (inner == outer)
                continue;
            second = (PopplerRectangle *) inner->data;

            for (col = 0; col < pdf_column; col++) {
                if (second->x1 >= column_rect[col]->x1 && second->x2 <= column_rect[col]->x2) {
                    inner_col = col;
                    break;
                }
            }

            if (outer_col != inner_col)
                continue;
            if (!x_axis_overlap (first, second))
                continue;

            /* Both rectangles become the union of the pair. */
            left   = first->x1 < second->x1 ? first->x1 : second->x1;
            top    = first->y1 < second->y1 ? first->y1 : second->y1;
            right  = first->x2 > second->x2 ? first->x2 : second->x2;
            bottom = first->y2 > second->y2 ? first->y2 : second->y2;
            first->x1 = second->x1 = left;
            first->y1 = second->y1 = top;
            first->x2 = second->x2 = right;
            first->y2 = second->y2 = bottom;
        }
    }
}
/*
 * Collect the blocks that do not fit inside any single column rectangle
 * ("intersection blocks"), add a final sentinel rectangle, and return the
 * list sorted by top edge (sort_yaxis).
 *
 * blocks      - all blocks of the page; the returned list shares their
 *               PopplerRectangle pointers.
 * pdf_column  - number of entries in column_rect.
 * page_width  - used as the x2 of the sentinel.
 * page_height - no longer used (kept for interface compatibility).
 * num_ips     - out: number of entries in the returned list, i.e. the
 *               number of intersection blocks plus one for the sentinel.
 *
 * The sentinel closes the last group and must lie below every block.  The
 * original placed it at 2 * page_height, but the caller passes the height of
 * the text area (ymax - ymin), so on a page whose text starts low the
 * sentinel could end up above the blocks and they were dropped from every
 * group.  G_MAXDOUBLE is below any finite coordinate.
 */
GList*
page_find_intersection_blocks(GList *blocks, int pdf_column, gdouble page_width, gdouble page_height, int *num_ips)
{
    int col;
    GList *node;
    GList *intersection_blocks = NULL;
    PopplerRectangle *lastgroup;

    (void) page_height;
    *num_ips = 1;

    for (node = blocks; node != NULL; node = node->next) {
        PopplerRectangle *pr = (PopplerRectangle *) node->data;
        gboolean inside_a_column = FALSE;

        for (col = 0; col < pdf_column; col++) {
            if (pr->x1 >= column_rect[col]->x1 && pr->x2 <= column_rect[col]->x2) {
                inside_a_column = TRUE;
                break;
            }
        }
        if (!inside_a_column) {
            intersection_blocks = g_list_append (intersection_blocks, pr);
            (*num_ips)++;
        }
    }

    lastgroup = g_new (PopplerRectangle, 1);
    lastgroup->x1 = 0;
    lastgroup->y1 = G_MAXDOUBLE;
    lastgroup->x2 = page_width;
    lastgroup->y2 = G_MAXDOUBLE;
    intersection_blocks = g_list_append (intersection_blocks, lastgroup);

    return g_list_sort (intersection_blocks, (GCompareFunc) sort_yaxis);
}
/*
 * Distribute copies of the blocks over NUM_IPS groups.
 *
 * A block goes into the group of the first delimiter in INTERSECTION_BLOCKS
 * for which check_group() holds; a block that matches no delimiter is left
 * out.  When INTERSECTION_BLOCKS is empty, the first NUM_IPS blocks are put
 * into one group each.
 *
 * Returns a newly allocated array of NUM_IPS lists holding newly allocated
 * PopplerRectangle copies; the caller owns the array and its contents.
 *
 * Fixes over the original: a block is copied only once it is known to
 * belong to a group (the copy was leaked otherwise), and missing list
 * entries are skipped instead of being dereferenced or appended as NULL.
 */
GList **
page_divide_blocks_to_groups(GList *blocks, GList *intersection_blocks, int num_ips)
{
    int i, j;
    GList *node, *delim;
    GList **groups = g_new (GList*, num_ips);

    for (i = 0; i < num_ips; i++)
        groups[i] = NULL;

    if (intersection_blocks) {
        /* Take each block and find the appropriate group. */
        for (node = blocks; node != NULL; node = node->next) {
            PopplerRectangle *pr = (PopplerRectangle *) node->data;

            for (delim = intersection_blocks, j = 0;
                 delim != NULL && j < num_ips;
                 delim = delim->next, j++) {
                if (check_group (pr, (PopplerRectangle *) delim->data)) {
                    groups[j] = g_list_append (groups[j], poppler_rectangle_copy (pr));
                    break;
                }
            }
        }
    }
    else {
        for (node = blocks, i = 0; node != NULL && i < num_ips; node = node->next, i++) {
            PopplerRectangle *pr = (PopplerRectangle *) node->data;

            if (pr != NULL)
                groups[i] = g_list_append (groups[i], poppler_rectangle_copy (pr));
        }
    }
    return groups;
}
/*
 * Chain the NUM_IPS group lists, in index order, into one list and store its
 * head in *BLOCKS.  Empty groups are skipped.  The previous value of *BLOCKS
 * is overwritten without being freed, and the group lists themselves become
 * part of the result (g_list_concat does not copy).
 */
void
page_merge_groups(GList **groups, GList **blocks, int num_ips)
{
    int idx;

    *blocks = NULL;
    for (idx = 0; idx < num_ips; idx++) {
        GList *group = groups[idx];

        if (group == NULL)
            continue;
        if (*blocks != NULL)
            *blocks = g_list_concat (*blocks, group);
        else
            *blocks = group;
    }
}
/*
 * Build a new list without repeated rectangles: an entry is dropped when a
 * later entry has exactly the same coordinates (rect_equal), so of several
 * equal rectangles only the last one survives.  The new list shares the
 * PopplerRectangle pointers with BLOCKS; BLOCKS itself is left untouched.
 */
GList*
page_blocks_remove_redundant_blocks(GList *blocks)
{
    GList *unique = NULL;
    GList *current, *later;

    for (current = blocks; current != NULL; current = current->next) {
        PopplerRectangle *candidate = (PopplerRectangle *) current->data;
        gboolean repeated_later = FALSE;

        for (later = current->next; later != NULL; later = later->next) {
            if (rect_equal (candidate, (PopplerRectangle *) later->data))
                repeated_later = TRUE;
        }

        if (repeated_later)
            continue;
        unique = g_list_append (unique, candidate);
    }
    return unique;
}
/*
 * Return the list positions of all blocks that RECTANGLE overlaps (see
 * rectangle_overlap).  The positions are stored directly in the data
 * pointers of the returned list, in ascending order.
 */
GList*
find_intersection_blocks (PopplerRectangle *rectangle, GList *blocks)
{
    GList *hits = NULL;
    GList *node = blocks;
    int position = 0;

    while (node != NULL) {
        if (rectangle_overlap (rectangle, (PopplerRectangle *) node->data))
            hits = g_list_append (hits, GINT_TO_POINTER (position));
        node = node->next;
        position++;
    }
    return hits;
}
/* Exchange the two doubles that DATA1 and DATA2 point to. */
void swap (double *data1, double *data2)
{
    const double first = *data1;
    const double second = *data2;

    *data1 = second;
    *data2 = first;
}
/*
 * Pick the line rectangles that make up the selection described by the
 * drag rectangle BOUNDARY_RECT.
 *
 * rects         - line rectangles of the page.
 * blocks        - sorted text blocks of the page.
 * boundary_rect - drag rectangle; normalised in place so x1 <= x2, y1 <= y2.
 * cr            - cairo context; the diagonal of the drag rectangle is
 *                 stroked on it.
 *
 * Returns a new list whose entries point at rectangles of RECTS (not
 * copies).  Some of them have x1/x2 clipped in place to the drag rectangle,
 * and check_rectangle_overlap() may additionally shrink y1/y2 of the
 * rectangles passed to it.
 */
GList*
page_get_results(GList *rects, GList *blocks, PopplerRectangle *boundary_rect, cairo_t *cr)
{
GList *tcresult = NULL;
/* Normalise the drag rectangle. */
if (boundary_rect->x2 < boundary_rect->x1)
swap(&(boundary_rect->x2), &(boundary_rect->x1));
if (boundary_rect->y2 < boundary_rect->y1)
swap(&(boundary_rect->y2), &(boundary_rect->y1));
int start, end;
/* Positions (within blocks) of the blocks touched by the drag rectangle;
   start is the first such position and end the last. */
/* NOTE(review): when nothing is touched the list is empty and both lookups
   presumably yield NULL, i.e. start == end == 0 - confirm. The list itself
   is never freed here. */
GList *intersectionblocks = find_intersection_blocks (boundary_rect, blocks);
start = (gint)g_list_nth_data(intersectionblocks, 0);
end = (gint)g_list_nth_data(intersectionblocks, g_list_length(intersectionblocks)-1);
/* Draw the diagonal of the drag rectangle in translucent red. */
cairo_set_source_rgba (cr, 0.8, 0.0, 0.0, 0.3);
cairo_move_to (cr, boundary_rect->x1, boundary_rect->y1);
cairo_line_to (cr, boundary_rect->x2, boundary_rect->y2);
cairo_stroke (cr);
if(start > end) {
int temp = start;
start = end;
end = temp;
}
int i, j;
/* first: 0 until the first line has been accepted, 1 afterwards.
   last:  0 initially; set to 1 once the line two entries ahead falls
          outside the drag rectangle; set to 2 after a line has been
          clipped on the right. */
int first = 0, last = 0;
for(i=0; i<g_list_length(rects); i++) {
PopplerRectangle *prectangle = (PopplerRectangle *) g_list_nth_data (rects, i);
for (j=start; j<=end; j++) {
PopplerRectangle *blk = (PopplerRectangle *) g_list_nth_data (blocks, j);
if (rectangle_overlap(blk, prectangle)) {
/* Lines in blocks strictly between start and end are taken whole. */
if (j!= start && j!= end)
tcresult = g_list_append (tcresult, prectangle);
else if (j == start && j == end) {
/* The selection starts and ends in the same block. */
gboolean tb = true;
if (first > 0) tb = false;
if (check_rectangle_overlap(boundary_rect, prectangle, tb)) {
/* Clip the first accepted line on the left. */
if ((prectangle->x1 < boundary_rect->x1) && first == 0) {
prectangle->x1 = boundary_rect->x1;
}
/* Clip on the right once the end of the selection was announced. */
if ((prectangle->x2 > boundary_rect->x2) && last == 1) {
prectangle->x2 = boundary_rect->x2;
last = 2;
}
/* Look two entries ahead in rects to detect the end of the selection. */
/* NOTE(review): the reason for the offset of 2 is not visible here - confirm. */
PopplerRectangle * pr1 = (PopplerRectangle *)g_list_nth_data (rects, i+2);
if(pr1) {
if (!check_rectangle_overlap (boundary_rect, pr1, false))
last = 1;
}
tcresult = g_list_append (tcresult, prectangle);
first = 1;
}
} else if (j == start) {
/* First block: accept from the first line inside the drag rectangle
   onwards; every later line of this block is accepted too. */
if (check_rectangle_overlap(boundary_rect, prectangle, true) || first == 1) {
if ((prectangle->x1 < boundary_rect->x1) && first == 0) {
prectangle->x1 = boundary_rect->x1;
}
tcresult = g_list_append (tcresult, prectangle);
first = 1;
}
}
else if (j == end) {
/* Last block: accept lines that end above the bottom of the drag
   rectangle. */
if (boundary_rect->y2 > prectangle->y2) {
if ((prectangle->x2 > boundary_rect->x2) && last == 1) {
prectangle->x2 = boundary_rect->x2;
last = 2;
}
tcresult = g_list_append (tcresult, prectangle);
PopplerRectangle * pr1 = (PopplerRectangle *)g_list_nth_data (rects, i+2);
if(pr1) {
if (pr1->y2 > boundary_rect->y2)
last = 1;
}
}
}
}
}
}
return tcresult;
}
/*
 * Highlight every rectangle of RESULT on CR with a translucent blue fill.
 * Nothing is drawn for an empty (NULL) list.
 */
void
show_result (GList *result, cairo_t *cr)
{
    GList *node;

    if (result == NULL)
        return;

    for (node = result; node != NULL; node = node->next) {
        PopplerRectangle *rect = (PopplerRectangle *) node->data;
        double width = rect->x2 - rect->x1;
        double height = rect->y2 - rect->y1;

        cairo_set_source_rgba (cr, 0.0, 0.0, 1.0, 0.1);
        cairo_rectangle (cr, rect->x1, rect->y1, width, height);
        cairo_fill (cr);
    }
    cairo_stroke (cr);
}
/*
 * "expose-event" handler of the drawing area.
 *
 * Renders page page_no of the PDF named by FILENAME, rebuilds the sorted
 * block structure of that page and highlights the lines selected by the
 * current drag rectangle (bx1, by1)-(bx2, by2).  Also refreshes the globals
 * npages, pdf_column and column_rect.  Always returns TRUE.
 *
 * FILENAME is appended to "file://" to form the document URI.
 *
 * Fixes over the original: a missing file name, an unreadable document or a
 * missing page no longer crash the handler (the area is just left white);
 * the URI buffer, the temporary rectangles, the per-line copies, the groups
 * array and the result list are no longer leaked.
 * NOTE: the block lists built here are still not released.
 */
gboolean expose_event_cb (GtkWidget *widget, GdkEventExpose *event, gchar *filename)
{
    cairo_t *cr = gdk_cairo_create (widget->window);
    gchar *uri;
    PopplerDocument *popplerdoc;
    PopplerPage *page;
    PopplerRectangle whole_page;
    PopplerRectangle boundary;
    PopplerRectangle *block = NULL;
    GList *rects;
    GList *blocks = NULL;
    GList *node;
    guint n_rects;
    guint idx;
    int i;
    double w, h;

    cairo_set_source_rgb (cr, 1.0, 1.0, 1.0);
    cairo_paint (cr);

    if (filename == NULL) {
        cairo_destroy (cr);
        return TRUE;
    }

    uri = g_strconcat ("file://", filename, NULL);
    popplerdoc = poppler_document_new_from_file (uri, NULL, NULL);
    g_free (uri);
    if (popplerdoc == NULL) {
        cairo_destroy (cr);
        return TRUE;
    }

    npages = poppler_document_get_n_pages (popplerdoc);
    page = poppler_document_get_page (popplerdoc, page_no);
    if (page == NULL) {
        g_object_unref (popplerdoc);
        cairo_destroy (cr);
        return TRUE;
    }

    poppler_page_render (page, cr);
    poppler_page_get_size (page, &w, &h);

    /* Ask for the line rectangles of the whole page. */
    whole_page.x1 = 0;
    whole_page.y1 = 0;
    whole_page.x2 = w;
    whole_page.y2 = h;
    rects = poppler_page_get_selection_region (page, 1.0, POPPLER_SELECTION_LINE, &whole_page);

    /* FIND THE BLOCKS: the first line seeds the accumulator, every further
       line is fed to find_block, which only reads it. */
    n_rects = g_list_length (rects);
    for (node = rects, idx = 0; node != NULL; node = node->next, idx++) {
        PopplerRectangle *line = (PopplerRectangle *) node->data;

        if (block == NULL)
            block = poppler_rectangle_copy (line);
        else
            find_block (&blocks, block, line, (idx == n_rects - 1) ? TRUE : FALSE);
    }

    /* FIND PAGE BOUNDARY */
    double xmin = 2000, ymin = 2000, xmax = 0, ymax = 0;
    int minx_index = g_list_length (blocks);
    int maxx_index = 0;
    int factor = 0;
    page_boundary (blocks, &xmin, &ymin, &xmax, &ymax, &minx_index, &maxx_index, &factor);

    /* FIND THE COLUMN OF THE PAGE */
    gdouble page_width = (xmax - xmin);
    gdouble page_height = (ymax - ymin);
    blocks = g_list_sort (blocks, (GCompareFunc) compare_func);
    pdf_column = page_find_column (blocks, page_width);

    /* SET THE BOUNDARIES FOR MULTICOLUMNS */
    page_set_columns_boundary (page_width, factor, pdf_column, xmin, ymin, ymax);

    /* UNION SMALL BLOCKS IN MULTICOLUMN PAGE */
    page_merge_small_blocks (blocks, pdf_column);

    /* FIND THE INTERSECTION BLOCKS AND SET THE GROUPS ACCORDING TO THOSE POINTS */
    int num_ips = 0;
    GList *intersection_blocks = page_find_intersection_blocks (blocks, pdf_column, page_width, page_height, &num_ips);

    /* DIVIDE THE BLOCKS INTO GROUPS ACCORDING TO INTERSECTION BLOCKS */
    GList **groups = page_divide_blocks_to_groups (blocks, intersection_blocks, num_ips);

    /* Sort individual blocks within groups */
    for (i = 0; i < num_ips; i++)
        groups[i] = g_list_sort (groups[i], (GCompareFunc) sort_groups);

    /* MERGE ALL THE GROUPS INTO A SINGLE LIST; the array is not needed after. */
    page_merge_groups (groups, &blocks, num_ips);
    g_free (groups);

    /* REMOVE REPETITION BLOCKS */
    GList *merged_blocks = page_blocks_remove_redundant_blocks (blocks);

    /* Select and highlight the lines under the current drag rectangle. */
    boundary.x1 = bx1;
    boundary.y1 = by1;
    boundary.x2 = bx2;
    boundary.y2 = by2;
    GList *result = page_get_results (rects, merged_blocks, &boundary, cr);
    show_result (result, cr);

    /* result only borrows the rectangles owned by rects. */
    g_list_free (result);
    cairo_destroy (cr);
    poppler_page_selection_region_free (rects);
    g_object_unref (page);
    g_object_unref (popplerdoc);
    return TRUE;
}
/*
 * "motion_notify_event" handler, connected while a button is held down.
 * Moves the far corner (bx2, by2) of the drag rectangle to the pointer
 * position and requests a redraw.
 *
 * Returns TRUE like the other event handlers in this file; the original
 * had no return statement, so the handler's return value was undefined.
 */
gboolean
button_move_event_cb (GtkWidget *widget, GdkEventButton *event, gchar *filename)
{
    bx2 = event->x;
    by2 = event->y;
    gtk_widget_queue_draw (widget);
    return TRUE;
}
/*
 * "button-press-event" handler: starts a selection drag.
 * Records the press time in start_time, anchors the drag rectangle at the
 * pointer position and connects button_move_event_cb to the widget, keeping
 * the connection id in handler_id.  Always returns TRUE.
 */
gboolean
button_press_event_cb (GtkWidget *widget, GdkEventButton *event, gchar *filename)
{
    gettimeofday (&start_time, NULL);

    /* Anchor corner of the drag rectangle. */
    by1 = event->y;
    bx1 = event->x;

    handler_id = g_signal_connect (G_OBJECT (widget),
                                   "motion_notify_event",
                                   G_CALLBACK (button_move_event_cb),
                                   filename);
    return TRUE;
}
/*
 * "button-release-event" handler: ends a selection drag.
 * Stores the final pointer position in (bx2, by2), requests a redraw,
 * disconnects the motion handler and prints the press time, the release
 * time and their difference in seconds.  Always returns TRUE.
 */
gboolean
button_release_event_cb (GtkWidget *widget, GdkEventButton *event, gchar *filename)
{
    double pressed_at, released_at;

    bx2 = event->x;
    by2 = event->y;
    gtk_widget_queue_draw (widget);
    g_signal_handler_disconnect (widget, handler_id);

    gettimeofday (&end_time, NULL);
    pressed_at = start_time.tv_sec + (start_time.tv_usec/1000000.0);
    released_at = end_time.tv_sec + (end_time.tv_usec/1000000.0);
    printf("\n start = %lf , end = %lf , diff = %lf \n", pressed_at, released_at, (released_at-pressed_at));
    return TRUE;
}
/*
 * "clicked" handler of the "<<" button: go back one page, clear the drag
 * rectangle and redraw.  Does nothing on the first page.
 */
void button1_clicked (GtkButton *button, GtkWidget *draw)
{
    if (page_no <= 0)
        return;

    page_no--;
    bx1 = 0;
    by1 = 0;
    bx2 = 0;
    by2 = 0;
    gtk_widget_queue_draw (draw);
}
/*
 * "clicked" handler of the ">>" button: advance one page, clear the drag
 * rectangle and redraw.  Does nothing on the last page.
 */
void button2_clicked (GtkButton *button, GtkWidget *draw)
{
    if (page_no >= npages-1)
        return;

    page_no++;
    bx1 = 0;
    by1 = 0;
    bx2 = 0;
    by2 = 0;
    gtk_widget_queue_draw (draw);
}
/*
 * Program entry point: builds a window with a 612x792 drawing area above a
 * row with "<<" and ">>" buttons, wires up the event handlers and runs the
 * GTK main loop.
 *
 * argv[1] is the PDF to show; the expose handler appends it to "file://",
 * so it has to be an absolute path.  The original passed argv[1] on
 * unchecked, which crashed the first redraw when no argument was given;
 * the program now prints a usage message and exits with status 1 instead.
 */
int main(int argc, char **argv) {
    g_type_init();
    gtk_init (&argc, &argv);

    /* Checked after gtk_init, which may remove its own options from argv. */
    if (argc < 2) {
        g_printerr ("Usage: %s /absolute/path/to/file.pdf\n",
                    (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
        return 1;
    }

    GtkWidget *window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
    GtkWidget *draw = gtk_drawing_area_new ();
    GtkWidget *vbox = gtk_vbox_new (0, 0);
    GtkWidget *hbox = gtk_hbox_new (0, 0);
    GtkWidget *button1 = gtk_button_new_with_label ("<<");
    GtkWidget *button2 = gtk_button_new_with_label (">>");

    gtk_box_pack_start (GTK_BOX (vbox), draw, 0, 0, 0);
    gtk_box_pack_start (GTK_BOX (vbox), hbox, 0, 0, 0);
    gtk_box_pack_end (GTK_BOX (hbox), button2, 0, 0, 10);
    gtk_box_pack_end (GTK_BOX (hbox), button1, 0, 0, 10);
    gtk_window_set_default_size (GTK_WINDOW (window), 612, 792);
    gtk_widget_set_size_request (draw, 612, 792);
    gtk_container_add (GTK_CONTAINER (window), vbox);
    page_no = 0;
    gtk_widget_show_all (window);

    g_signal_connect (G_OBJECT (draw), "expose-event", G_CALLBACK (expose_event_cb), argv [1]);
    g_signal_connect (G_OBJECT (draw), "button-press-event", G_CALLBACK (button_press_event_cb), argv [1]);
    g_signal_connect (G_OBJECT (draw), "button-release-event", G_CALLBACK (button_release_event_cb), argv [1]);
    gtk_widget_add_events (draw, GDK_ALL_EVENTS_MASK);
    g_signal_connect (G_OBJECT (window), "destroy", G_CALLBACK (gtk_main_quit), NULL);
    g_signal_connect (G_OBJECT (button1), "clicked", G_CALLBACK (button1_clicked), draw);
    g_signal_connect (G_OBJECT (button2), "clicked", G_CALLBACK (button2_clicked), draw);

    gtk_main ();
    return 0;
}
#include <poppler.h>
#include <poppler-document.h>
#include <poppler-page.h>
#include <string.h>
#include <stdlib.h>
#include <gtk/gtk.h>
/* Zero-based index of the page that the handlers render; changed by the
   "<<" / ">>" button handlers. */
int page_no; // pageno
/* Page count of the document, refreshed by expose_event_cb on each redraw. */
int npages; // num pages
/* Blocks of the current page: reset by expose_event_cb and filled by
   find_block. */
GList *blocks = NULL;
/* Number of text columns detected on the current page. */
int pdf_column = 1; // default single column
/* One rectangle per detected column; allocated in expose_event_cb and read
   by find_column. */
PopplerRectangle **column_rect; // find the boundaries of the columns
/*
 * TRUE when the interiors of the two rectangles intersect; rectangles that
 * only share an edge are reported as not overlapping.
 */
gboolean
block_overlap (PopplerRectangle *block, PopplerRectangle *area)
{
    gboolean apart_in_x = (block->x1 >= area->x2) || (block->x2 <= area->x1);
    gboolean apart_in_y = (block->y1 >= area->y2) || (block->y2 <= area->y1);

    return (apart_in_x || apart_in_y) ? FALSE : TRUE;
}
/*
 * Like block_overlap, but with a 3-unit allowance at the top of AREA: BLOCK
 * still overlaps when its bottom edge lies less than 3 units above AREA's
 * top edge.  The test is not symmetric in its arguments.
 */
gboolean
rectangle_overlap (PopplerRectangle *block, PopplerRectangle *area)
{
    gboolean apart_in_x = (block->x1 >= area->x2) || (block->x2 <= area->x1);
    gboolean apart_in_y = (block->y1 >= area->y2) || (block->y2 <= area->y1-3);

    return (apart_in_x || apart_in_y) ? FALSE : TRUE;
}
/*
 * Feed one more line rectangle into the block currently being accumulated;
 * finished blocks are collected in the global list `blocks`.
 *
 * block   - the block being accumulated; updated in place.
 * newrect - the next line rectangle; only read, never stored.
 * flag    - TRUE when NEWRECT is the last rectangle of the page.
 *
 * If NEWRECT overlaps BLOCK (see rectangle_overlap) BLOCK grows to the union
 * of the two.  Otherwise, or when FLAG is set, the accumulated block is
 * finished: it is merged into every already-finished block it overlaps, or
 * appended as a new list entry when it overlaps none, and BLOCK restarts from
 * NEWRECT.  When NEWRECT is both the last rectangle and a new block, BLOCK
 * itself (the caller's pointer) is appended as well.
 */
void find_block(PopplerRectangle *block,PopplerRectangle *newrect,gboolean flag)
{
    gboolean starts_new_block = FALSE;
    gboolean absorbed = FALSE;
    PopplerRectangle finished;
    GList *node;

    if (rectangle_overlap (block, newrect)) {
        /* Grow the accumulating block to cover the new line. */
        if (block->x1 > newrect->x1) block->x1 = newrect->x1;
        if (block->x2 < newrect->x2) block->x2 = newrect->x2;
        if (block->y1 > newrect->y1) block->y1 = newrect->y1;
        if (block->y2 < newrect->y2) block->y2 = newrect->y2;
    } else {
        starts_new_block = TRUE;
    }

    if (!flag && !starts_new_block)
        return;

    /* Snapshot the finished block on the stack; a heap copy is made only if
       it really has to be appended (the original leaked it otherwise). */
    finished = *block;

    /* The accumulator restarts from the new line. */
    block->x1 = newrect->x1;
    block->y1 = newrect->y1;
    block->x2 = newrect->x2;
    block->y2 = newrect->y2;

    /* Merge into every existing block that overlaps the finished one. */
    for (node = blocks; node != NULL; node = node->next) {
        PopplerRectangle *existing = (PopplerRectangle *) node->data;

        if (!rectangle_overlap (existing, &finished))
            continue;
        if (finished.x1 < existing->x1) existing->x1 = finished.x1;
        if (finished.y1 < existing->y1) existing->y1 = finished.y1;
        if (finished.x2 > existing->x2) existing->x2 = finished.x2;
        if (finished.y2 > existing->y2) existing->y2 = finished.y2;
        absorbed = TRUE;
    }

    if (!absorbed) {
        PopplerRectangle *copy = g_new (PopplerRectangle, 1);

        *copy = finished;
        blocks = g_list_append (blocks, copy);
    }

    /* Last line of the page and it opened a block of its own. */
    if (flag && starts_new_block)
        blocks = g_list_append (blocks, block);
}
/*
 * TRUE when the x-ranges [x1, x2] of the two rectangles intersect, touching
 * included.  The y coordinates are not looked at.
 */
gboolean y_axis_overlap (PopplerRectangle *a, PopplerRectangle *b)
{
    gboolean a_right_of_b = a->x1 > b->x2;
    gboolean a_left_of_b = a->x2 < b->x1;

    return (a_right_of_b || a_left_of_b) ? FALSE : TRUE;
}
/*
 * TRUE when the y-ranges [y1, y2] of the two rectangles intersect, touching
 * included.  The x coordinates are not looked at.
 */
gboolean x_axis_overlap (PopplerRectangle *a, PopplerRectangle *b)
{
    gboolean a_below_b = a->y1 > b->y2;
    gboolean a_above_b = a->y2 < b->y1;

    return (a_below_b || a_above_b) ? FALSE : TRUE;
}
/*
 * Return the index of the first column rectangle in column_rect that
 * overlaps A (see block_overlap), or -1 when A overlaps no column.
 * The original fell off the end without returning a value in that case.
 */
gint find_column(PopplerRectangle *a)
{
    int i;

    for (i = 0; i < pdf_column; i++) {
        if (block_overlap (a, column_rect[i]))
            return i;
    }
    return -1;
}
/*
 * GCompareFunc-style ordering for blocks inside one group: by column index
 * first, then top edge (y1), then left edge (x1).
 * Returns -1, 0 or 1.  Every path now returns a value; coordinates that do
 * not compare either way are treated as equal.
 */
gint sort_groups(PopplerRectangle *a, PopplerRectangle *b)
{
    int c1 = find_column (a);
    int c2 = find_column (b);

    if (c1 != c2)
        return (c1 < c2) ? -1 : 1;

    if (a->y1 < b->y1) return -1;
    if (a->y1 > b->y1) return 1;

    if (a->x1 < b->x1) return -1;
    if (a->x1 > b->x1) return 1;

    return 0;
}
/*
 * GCompareFunc-style ordering by top edge (y1), ascending.
 * Returns -1 when A comes first, 1 when B comes first, 0 otherwise, so
 * there is always a defined return value.
 */
gint sort_yaxis(PopplerRectangle *a, PopplerRectangle *b)
{
    if (a->y1 < b->y1)
        return -1;
    if (a->y1 > b->y1)
        return 1;
    return 0;
}
/*
 * GCompareFunc-style ordering by block height (y2 - y1), tallest first.
 * Returns -1 when A is taller, 1 when B is taller, 0 otherwise, so there is
 * always a defined return value.
 */
gint compare_func (PopplerRectangle *a, PopplerRectangle *b)
{
    gdouble heightA = a->y2 - a->y1;
    gdouble heightB = b->y2 - b->y1;

    if (heightA > heightB)
        return -1;
    if (heightA < heightB)
        return 1;
    return 0;
}
/* TRUE when the two rectangles have exactly the same four coordinates. */
gboolean rect_equal (PopplerRectangle *a, PopplerRectangle *b)
{
    gboolean same_x = (a->x1 == b->x1) && (a->x2 == b->x2);
    gboolean same_y = (a->y1 == b->y1) && (a->y2 == b->y2);

    return (same_x && same_y) ? TRUE : FALSE;
}
/*
 * TRUE when RECT belongs to the group closed by the delimiter GRPRECT:
 * either RECT ends strictly above GRPRECT's top edge, or RECT has exactly
 * the same coordinates as GRPRECT.
 */
gboolean check_group(PopplerRectangle *rect, PopplerRectangle *grprect)
{
    if (rect->y2 < grprect->y1)
        return TRUE;
    if (rect_equal (rect, grprect))
        return TRUE;
    return FALSE;
}
/*
 * "expose-event" handler of the drawing area.
 *
 * Renders page page_no of the PDF named by FILENAME (appended to "file://"),
 * groups the line rectangles of the page into blocks, estimates the number
 * of columns, sorts the blocks and paints every sorted block as a
 * translucent green rectangle with its 1-based position drawn at its centre.
 * Updates the globals blocks, npages, pdf_column and column_rect.
 * Always returns TRUE.
 *
 * NOTE(review): nfilename, prect, the block lists, column_rect and the
 * per-block Pango context/layout are allocated here with no matching
 * release in this function.
 */
gboolean expose_event_cb (GtkWidget *widget, GdkEventExpose *event, gchar *filename)
{
cairo_t *cr = gdk_cairo_create (widget->window);
/* Clear the area to white before rendering. */
cairo_set_source_rgb (cr, 1.0, 1.0, 1.0);
cairo_paint (cr);
GList *rects = NULL;
/* Build the document URI: "file://" (7 chars) + filename + terminating NUL. */
char *nfilename = (char*)malloc(strlen(filename)+8);
strcpy(nfilename,"file://");
strcat(nfilename,filename);
PopplerDocument *popplerdoc;
PopplerPage *page;
/* NOTE(review): neither the document nor the page is checked for NULL. */
popplerdoc = poppler_document_new_from_file(nfilename,NULL,NULL);
npages = poppler_document_get_n_pages(popplerdoc);
int i,j;
page = poppler_document_get_page(popplerdoc,page_no);
//cairo_scale (cr, 0.5, 0.5); //SCALE, SCALE);
poppler_page_render (page, cr);
double w,h;
poppler_page_get_size(page,&w,&h);
/* Selection rectangle covering the whole page. */
PopplerRectangle *prect = g_new(PopplerRectangle,1);
prect->x1 = 0;
prect->y1 = 0;
prect->x2 = w;
prect->y2 = h;
rects = poppler_page_get_selection_region(page,1.0,POPPLER_SELECTION_LINE,prect);
/* FIND THE BLOCKS */
blocks = NULL;
PopplerRectangle *block = NULL;
/* The first line seeds the accumulating block; each further line is passed
   to find_block, with the flag set for the last line of the page. */
for (i = 0; i < g_list_length (rects); i++)
{
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (rects, i);
PopplerRectangle *temp = poppler_rectangle_copy (pr);
if(block == NULL) {
block = temp;
}
else {
if(i != g_list_length(rects)-1)
find_block(block, temp, FALSE);
else
find_block(block, temp, TRUE);
}
} /* END OF FINDING BLOCKS */
/* FIND PAGE BOUNDARY */
/* Bounding box of all blocks, seeded with starting extremes. */
double xmin = 2000,ymin = 2000 , xmax = 0,ymax = 0;
int minx_index = g_list_length (blocks);
int maxx_index = 0;
int factor = 0;
if(blocks != NULL) {
for (i = 0; i < g_list_length (blocks); i++)
{
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (blocks, i);
if(xmin > pr->x1) { xmin = pr->x1; minx_index = i; }
if(ymin > pr->y1) { ymin = pr->y1; }
if(xmax < pr->x2) { xmax = pr->x2; maxx_index = i; }
if(ymax < pr->y2) { ymax = pr->y2; }
}
/* If the left-most block is less than 50 units tall, widen a probe strip
   from the left edge one unit at a time until its x-range meets two
   blocks; the width reached becomes factor. */
PopplerRectangle *minxblock = (PopplerRectangle *) g_list_nth_data (blocks, minx_index);
if ((minxblock->y2 - minxblock->y1) < 50) {
PopplerRectangle *temp = g_new (PopplerRectangle, 1);
for(i=xmin; i<(xmin+(minxblock->x2 - minxblock->x1)); i++) {
temp->x1 = xmin;
temp->x2 = i;
temp->y1 = minxblock->y1;
temp->y2 = minxblock->y2;
int overlap = 0;
for(j=0; j<g_list_length(blocks); j++) {
PopplerRectangle *pr = (PopplerRectangle *) g_list_nth_data (blocks, j);
if (y_axis_overlap(temp, pr)) {
overlap++;
}
if (overlap > 1)
break;
}
if (overlap > 1) {
factor = i - xmin;
break;
}
}
}
/* Same probe from the right edge for the right-most block. */
PopplerRectangle *maxxblock = (PopplerRectangle *) g_list_nth_data (blocks, maxx_index);
if ((maxxblock->y2 - maxxblock->y1) < 50) {
PopplerRectangle *temp = g_new (PopplerRectangle, 1);
for(i=xmax; i>maxxblock->x1; i--) {
temp->x1 = i;
temp->x2 = xmax;
/* The y-range is taken from minxblock here; y_axis_overlap only reads
   the x coordinates, so this does not change the result. */
temp->y1 = minxblock->y1;
temp->y2 = minxblock->y2;
int overlap = 0;
for(j=0; j<g_list_length(blocks); j++) {
PopplerRectangle *pr = (PopplerRectangle *) g_list_nth_data (blocks, j);
if (y_axis_overlap(temp, pr)) {
overlap++;
}
if (overlap > 1)
break;
}
if (overlap > 1) {
factor = factor + xmax - i;
break;
}
}
}
} /* END OF FINDING PAGE BOUNDARY */
/* FIND THE COLUMN OF THE PAGE*/
gdouble page_width = (xmax - xmin);
gdouble page_height= (ymax - ymin);
/* Tallest block first. */
blocks = g_list_sort (blocks, (GCompareFunc) compare_func);
int k = 0;
int n = 2;
/* The first block wider than 50 units is taken as one column; n-1 is the
   number of such widths that fit into the page width. */
for(k=0; k<g_list_length(blocks); k++) {
PopplerRectangle *tmp = (PopplerRectangle *)g_list_nth_data (blocks, k);
gdouble width = (tmp->x2 - tmp->x1);
if ( width <= 50) {
continue;
}
n = 2;
while (n*width < page_width)
n++;
break;
} /* END OF FINDING COLUMN OF THE BLOCK */
pdf_column = n-1; // set the num columns in the document
printf("\n pdf column = %d \n", pdf_column);
/* UNION THOSE SMALL BLOCKS IN SINGLE COLUMN*/
if (n-1 == 1) { //single column
for(i=0; i<g_list_length(blocks); i++) {
PopplerRectangle *pr1 = (PopplerRectangle *)g_list_nth_data (blocks, i);
for(j=0; j<g_list_length(blocks); j++) {
PopplerRectangle *pr2 = (PopplerRectangle *)g_list_nth_data (blocks, j);
if (x_axis_overlap(pr1,pr2)) { // UNION THOSE BLOCKS
pr1->x1 = pr2->x1 = pr1->x1 < pr2->x1 ? pr1->x1 : pr2->x1;
pr1->y1 = pr2->y1 = pr1->y1 < pr2->y1 ? pr1->y1 : pr2->y1;
pr1->x2 = pr2->x2 = pr1->x2 > pr2->x2 ? pr1->x2 : pr2->x2;
pr1->y2 = pr2->y2 = pr1->y2 > pr2->y2 ? pr1->y2 : pr2->y2;
}
}
}
} /*END OF UNION SMALL BLOCKS IN SINGLE COLUMN*/
/* The factor computed above is discarded here; columns are laid out from
   the unadjusted xmin. */
factor = 0;
printf("\n factor = %d \n", factor);
/* SET THE BOUNDARIES FOR MULTICOLUMNS*/
gdouble col_width = (page_width - factor)/(n-1);
column_rect = g_new (PopplerRectangle*, n-1);
xmin += factor;
for (i=0; i<n-1; i++) {
column_rect[i] = g_new (PopplerRectangle, 1);
column_rect[i]->x1 = xmin;
column_rect[i]->y1 = ymin;
column_rect[i]->x2 = xmin+col_width;
column_rect[i]->y2 = ymax;
xmin = xmin+col_width;
/* Colour selection for the column overlay, whose drawing is commented out
   below. The second test is not chained to the first, so the red chosen
   for remainder 0 is immediately replaced by green. */
if (i % pdf_column == 0)
cairo_set_source_rgba (cr, 1.0, 0.0, 0.0, 0.3);
if (i % pdf_column == 1)
cairo_set_source_rgba (cr, 0.0, 0.0, 1.0, 0.3);
else
cairo_set_source_rgba (cr, 0.0, 1.0, 0.0, 0.3);
/*cairo_rectangle (cr,
column_rect[i]->x1,
column_rect[i]->y1,
column_rect[i]->x2 - column_rect[i]->x1,
column_rect[i]->y2 - column_rect[i]->y1);
cairo_fill (cr);*/
} /*END OF SETTING BOUNDARIES FOR MULTICOLUMN*/
/* UNION SMALL BLOCKS IN MULTICOLUMN PAGE */
/* column1/column2 are not reset per block: a block contained in no column
   keeps the index found for the previously examined block. */
int column1 = 0, column2 = 0;
for (i = 0; i < g_list_length (blocks); i++)
{
PopplerRectangle * pr1 = (PopplerRectangle *)g_list_nth_data (blocks, i);
for (j=0; j<n-1; j++) {
if (pr1->x1 >= column_rect[j]->x1 && pr1->x2 <= column_rect[j]->x2) {
column1 = j; break;
}
}
for (k = 0; k < g_list_length (blocks); k++)
{
PopplerRectangle * pr2 = (PopplerRectangle *)g_list_nth_data (blocks, k);
if (i != k) {
for (j=0; j<n-1; j++) {
if (pr2->x1 >= column_rect[j]->x1 && pr2->x2 <= column_rect[j]->x2) {
column2 = j; break;
}
}
if (column1 == column2) {
if (x_axis_overlap(pr1,pr2)) {
pr1->x1 = pr2->x1 = pr1->x1 < pr2->x1 ? pr1->x1 : pr2->x1;
pr1->y1 = pr2->y1 = pr1->y1 < pr2->y1 ? pr1->y1 : pr2->y1;
pr1->x2 = pr2->x2 = pr1->x2 > pr2->x2 ? pr1->x2 : pr2->x2;
pr1->y2 = pr2->y2 = pr1->y2 > pr2->y2 ? pr1->y2 : pr2->y2;
}
}
}
}
} /* END OF UNION SMALL BLOCKS IN MULTICOLUMN PAGE */
/* FIND THE INTERSECTION BLOCKS AND SET THE GROUPS ACCORDING TO THOSE POINTS*/
/* Find the intersection Points */
/* A block that fits inside no single column rectangle is an intersection
   block; num_ips counts them plus one for the sentinel appended below. */
int num_ips = 1;
GList *intersection_blocks = NULL;
for (i=0; i<g_list_length(blocks); i++) {
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (blocks, i);
column1 = -1;
for (j=0; j<n-1; j++) {
if (pr->x1 >= column_rect[j]->x1 && pr->x2 <= column_rect[j]->x2) {
column1 = j; break;
}
}
if (column1 == -1) {
intersection_blocks = g_list_append (intersection_blocks, pr);
num_ips++;
}
}
/* Sentinel delimiter placed at y = 2 * page_height, where page_height is
   the height of the text area (ymax - ymin). */
/* NOTE(review): a block whose y2 is not above that value matches no group
   and is left out - confirm this is intended. */
PopplerRectangle *lastgroup = g_new (PopplerRectangle, 1);
lastgroup->x1 = 0;
lastgroup->y1 = 2*page_height;
lastgroup->x2 = page_width;
lastgroup->y2 = 2*page_height;
intersection_blocks = g_list_append (intersection_blocks, lastgroup);
if (intersection_blocks)
intersection_blocks = g_list_sort (intersection_blocks, (GCompareFunc) sort_yaxis);
GList **groups = g_new (GList*, num_ips);
for (i=0; i<num_ips; i++) {
groups[i] = NULL;
/* pr is not used in this loop. */
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (intersection_blocks, i);
}
/* Take the block find the appropriate group */
if (intersection_blocks) {
for (i=0; i<g_list_length(blocks); i++) {
int group = -1;
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (blocks, i);
/* The copy is made before the group is known and is not released when no
   group matches. */
PopplerRectangle *temp = poppler_rectangle_copy (pr);
for (j=0; j<num_ips; j++) {
PopplerRectangle *grp_rect = (PopplerRectangle *)g_list_nth_data (intersection_blocks, j);
if (check_group(pr, grp_rect)) {
group = j;
break;
}
}
if (group != -1)
groups[group] = g_list_append (groups[group], temp);
}
}
else {
for (i=0; i<num_ips; i++) {
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (blocks, i);
PopplerRectangle *temp = poppler_rectangle_copy (pr);
groups[i] = g_list_append (groups[i], temp);
}
}
/*END OF FINDING INTERSECTION BLOCKS AND ARRANGE THE BLOCKS ACCORDING TO GROUPS*/
/* Sort individual groups within */
for (i=0; i<num_ips; i++) {
groups[i] = g_list_sort (groups[i], (GCompareFunc) sort_groups);
}
/* MERGE THE ALL GROUPS IN TO SINGLE*/
/* The previous contents of blocks are dropped without being freed. */
blocks = NULL;
for (i=0; i<num_ips; i++) {
if (groups[i]) {
if (blocks == NULL) {
blocks = groups[i];
}
else {
blocks = g_list_concat (blocks, groups[i]);
}
}
} /*END OF MERGING BLOCKS*/
/* REMOVE REPETITION BLOCKS*/
/* Of several rectangles with equal coordinates only the last is kept. */
GList *merged_blocks = NULL;
for (i=0; i<g_list_length(blocks); i++) {
PopplerRectangle * pr1 = (PopplerRectangle *)g_list_nth_data (blocks, i);
gboolean ind = false;
for(j=i+1; j<g_list_length(blocks); j++) {
PopplerRectangle * pr2 = (PopplerRectangle *)g_list_nth_data (blocks, j);
if (rect_equal (pr1, pr2) && i!=j) {
ind = true;
}
}
if(!ind)
merged_blocks = g_list_append(merged_blocks, pr1);
} /* END OF REMOVING REPETITION BLOCKS*/
/* DISPLAY THE BOUNDARIES OF SORTED BLOCKS*/
/* Buffer for the block number: room for 5 characters plus the NUL. */
char num [6];
PangoLayout *layout;
PangoFontDescription *desc;
PangoContext *context;
for (i = 0; i < g_list_length (merged_blocks); i++)
{
PopplerRectangle * pr = (PopplerRectangle *)g_list_nth_data (merged_blocks, i);
cairo_set_source_rgba (cr, 0.0, 1.0, 0.0, 0.3);
cairo_rectangle (cr,
pr->x1,
pr->y1,
pr->x2 - pr->x1,
pr->y2 - pr->y1);
cairo_fill (cr);
/* Draw the 1-based position of the block at its centre in "sans 15". */
sprintf (num, "%d", i+1);
cairo_set_source_rgba (cr, 0.0, 0.0, 0.0, 1.0);
context = pango_cairo_create_context (cr);
layout = pango_layout_new (context);
pango_layout_set_text (layout, num, -1);
desc = pango_font_description_from_string ("sans 15");
pango_layout_set_font_description (layout, desc);
pango_font_description_free (desc);
cairo_move_to (cr, pr->x1 +(pr->x2 - pr->x1)/2.0, pr->y1 + (pr->y2 - pr->y1)/2.0);
pango_cairo_layout_path (cr, layout);
cairo_fill (cr);
} /* END OF DISPLAY*/
cairo_destroy (cr);
poppler_page_selection_region_free(rects);
g_object_unref (page);
g_object_unref (popplerdoc);
return TRUE;
}
/*
 * "button-press-event" handler for the drawing area.
 *
 * Paints the widget's window white and then renders page `page_no` of the
 * PDF named by `filename` on top of it, which wipes anything previously drawn
 * over the page.
 *
 * widget   - the drawing area; its GdkWindow is the render target.
 * event    - unused.
 * filename - path of the PDF; it is turned into a URI by prefixing "file://".
 *            NOTE(review): that only yields a valid URI for absolute paths
 *            without characters needing escaping - confirm against callers.
 *
 * If the file name is missing, memory cannot be allocated, the document
 * cannot be opened or the page does not exist, the window is simply left
 * white.  Always returns TRUE.
 */
gboolean
button_press_event_cb (GtkWidget *widget, GdkEventButton *event, gchar *filename)
{
    static const char uri_scheme[] = "file://";
    PopplerDocument *popplerdoc = NULL;
    PopplerPage *page = NULL;
    char *nfilename = NULL;
    cairo_t *cr = gdk_cairo_create (widget->window);

    cairo_set_source_rgb (cr, 1.0, 1.0, 1.0);
    cairo_paint (cr);

    if (filename == NULL)
        goto out;

    /* sizeof uri_scheme already counts the terminating NUL. */
    nfilename = malloc (sizeof uri_scheme + strlen (filename));
    if (nfilename == NULL)
        goto out;
    strcpy (nfilename, uri_scheme);
    strcat (nfilename, filename);

    popplerdoc = poppler_document_new_from_file (nfilename, NULL, NULL);
    if (popplerdoc == NULL)
        goto out;

    page = poppler_document_get_page (popplerdoc, page_no);
    if (page != NULL)
        poppler_page_render (page, cr);

out:
    /* Release everything acquired above, on success and failure alike. */
    if (page != NULL)
        g_object_unref (page);
    if (popplerdoc != NULL)
        g_object_unref (popplerdoc);
    free (nfilename);
    cairo_destroy (cr);
    return TRUE;
}
/*
 * "<<" button handler: move to the previous page.
 *
 * Decrements the global `page_no` and queues a redraw of `draw`.  Does
 * nothing when `page_no` is already 0 (or lower).  `button` is unused.
 */
void button1_clicked (GtkButton *button, GtkWidget *draw)
{
    if (page_no <= 0)
        return;

    page_no--;
    gtk_widget_queue_draw (draw);
}
/*
 * ">>" button handler: move to the next page.
 *
 * Increments the global `page_no` and queues a redraw of `draw`.  Does
 * nothing once `page_no` has reached `npages - 1`.  `button` is unused.
 */
void button2_clicked (GtkButton *button, GtkWidget *draw)
{
    if (page_no >= npages-1)
        return;

    page_no++;
    gtk_widget_queue_draw (draw);
}
/*
 * "Find Block" button handler.
 *
 * Reads four numbers from standard input - x, y, width, height - and fills
 * that rectangle on the drawing area's window with translucent blue.  The
 * rectangle is painted directly; no redraw of the widget is queued.
 *
 * The call blocks in scanf() until input is available.  If fewer than four
 * numbers can be read (EOF or malformed input) nothing is drawn, because the
 * coordinates would otherwise be used uninitialized.
 *
 * button - unused.
 * draw   - the drawing area whose GdkWindow is painted on.
 */
void button3_clicked (GtkButton *button, GtkWidget *draw)
{
    double x, y, xw, yw;

    if (scanf ("%lf%lf%lf%lf", &x, &y, &xw, &yw) != 4)
        return;

    cairo_t *cr = gdk_cairo_create (draw->window);
    cairo_set_source_rgba (cr, 0.0, 0.0, 1.0, 0.2);
    cairo_rectangle (cr, x, y, xw, yw);
    cairo_fill (cr);
    cairo_destroy (cr);
}
/*
 * Program entry point.
 *
 * Builds a window holding a 612x792 drawing area above a row of three
 * buttons ("Find Block", "<<", ">>"), wires the drawing area's expose and
 * button-press events and the buttons' click handlers, and runs the GTK main
 * loop until the window is destroyed.
 *
 * argv[1] is the PDF file to show; it is passed as user data to the
 * expose/button-press callbacks.  The program prints a usage message and
 * exits with status 1 when it is missing, since the callbacks would
 * otherwise receive a NULL file name.
 */
int main(int argc, char **argv) {
    g_type_init();
    gtk_init (&argc, &argv);

    /* Checked after gtk_init() so that GTK's own options are already removed. */
    if (argc < 2) {
        fprintf (stderr, "Usage: %s <pdf-file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    GtkWidget *window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
    GtkWidget *draw = gtk_drawing_area_new ();
    GtkWidget *vbox = gtk_vbox_new (0, 0);
    GtkWidget *hbox = gtk_hbox_new (0, 0);
    GtkWidget *button1 = gtk_button_new_with_label ("<<");
    GtkWidget *button2 = gtk_button_new_with_label (">>");
    GtkWidget *button3 = gtk_button_new_with_label ("Find Block");

    /* Layout: drawing area on top, button row underneath. */
    gtk_box_pack_start (GTK_BOX (vbox), draw, 0, 0, 0);
    gtk_box_pack_start (GTK_BOX (vbox), hbox, 0, 0, 0);
    gtk_box_pack_end (GTK_BOX (hbox), button2, 0, 0, 10);
    gtk_box_pack_end (GTK_BOX (hbox), button1, 0, 0, 10);
    gtk_box_pack_end (GTK_BOX (hbox), button3, 0, 0, 10);
    gtk_window_set_default_size (GTK_WINDOW (window), 612, 792);
    gtk_widget_set_size_request (draw, 612, 792);
    gtk_container_add (GTK_CONTAINER (window), vbox);

    page_no = 0;
    gtk_widget_show_all (window);

    g_signal_connect (G_OBJECT (draw), "expose-event", G_CALLBACK (expose_event_cb), argv [1]);
    g_signal_connect (G_OBJECT (draw), "button-press-event", G_CALLBACK (button_press_event_cb), argv [1]);
    gtk_widget_add_events (draw, GDK_ALL_EVENTS_MASK);
    g_signal_connect (G_OBJECT (window), "destroy", G_CALLBACK (gtk_main_quit), NULL);
    g_signal_connect (G_OBJECT (button1), "clicked", G_CALLBACK (button1_clicked), draw);
    g_signal_connect (G_OBJECT (button2), "clicked", G_CALLBACK (button2_clicked), draw);
    g_signal_connect (G_OBJECT (button3), "clicked", G_CALLBACK (button3_clicked), draw);

    gtk_main ();
    return 0;
}
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler