Bug#695798: webalizer: Matching referrers and sites for grouping is very primitive

Nikos Mavrogiannopoulos Wed, 12 Dec 2012 11:42:18 -0800

Package: webalizer
Version: 2.23.05-1
Severity: wishlist
Tags: upstream

Hello,
 Trying to group the Google sites with webalizer is painful because its
matching algorithm is primitive.  For example:
www.google.co.uk, www.google.com and www.google.ie cannot all be caught by
using something like www.google.*


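The attached (quick) patch adds support for Perl-compatible regular
expressions (via libpcre) for advanced matching: a pattern that starts with
'@' is treated as a regular expression, which allows rules like: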
GroupReferrer   @www\.google\..*        Google
GroupReferrer   @[a-zA-Z\.]*freecode\.com/.*    Freecode

regards,
Nikos

diff -ur webalizer-2.23-05/configure.in webalizer-2.23-05.new/configure.in
--- webalizer-2.23-05/configure.in	2011-01-03 05:53:44.000000000 +0100
+++ webalizer-2.23-05.new/configure.in	2012-12-12 19:50:55.604735560 +0100
@@ -119,6 +119,11 @@
   AC_MSG_ERROR(z library not found.. please install libz)
 fi
 
+AC_CHECK_LIB(pcre, main, PCRE="yes"; LIBS="-lpcre ${LIBS}", PCRE="no")
+if test "${PCRE}" = "no"; then
+  AC_MSG_ERROR(libpcre was not found.. please install it!)
+fi
+
 AC_CHECK_HEADER(zlib.h, HDR="yes", HDR="no")
 if test "${HDR}" = "no"; then
   AC_MSG_ERROR(zlib.h header not found.. please install)
diff -ur webalizer-2.23-05/linklist.c webalizer-2.23-05.new/linklist.c
--- webalizer-2.23-05/linklist.c	2011-01-03 05:51:20.000000000 +0100
+++ webalizer-2.23-05.new/linklist.c	2012-12-12 20:22:42.652680042 +0100
@@ -32,6 +32,7 @@
 #include <unistd.h>                           /* normal stuff             */
 #include <ctype.h>
 #include <sys/utsname.h>
+#include <pcre.h>
 
 /* ensure sys/types */
 #ifndef _SYS_TYPES_H
@@ -52,6 +53,8 @@
 #include "lang.h"
 #include "linklist.h"
 
+#define OVECCOUNT 30
+
 /* internal function prototypes */
 
 NLISTPTR new_nlist(char *);                         /* new list node       */
@@ -274,9 +277,38 @@
 int isinstr(char *str, char *cp)
 {
    char *cp1,*cp2;
+   char *pattern;
+   const char* error;
+   int erroffset, rc;
+   int ovector[OVECCOUNT];
+   
+   pcre *re;
 
    cp1=(cp+strlen(cp))-1;
-   if (*cp=='*')
+
+   if (*cp=='@') /* perl regex */
+   {
+     pattern = ++cp;
+     
+     re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
+     if (re == NULL)
+     {
+       fprintf(stderr, "Error in perl regex at offset %d: %s\n", erroffset, error);
+       return 0;
+     }
+     
+     rc = pcre_exec(re, NULL, str, strlen(str), 0, 0, ovector, OVECCOUNT);
+     if (rc < 0)
+     {
+       pcre_free(re);
+       return 0;
+     }
+
+     /* fprintf(stderr, "\nMatch succeeded (%s l: %s) at offset %d\n", pattern, str, ovector[0]); */
+     pcre_free(re);
+     return 1;
+   }
+   else if (*cp=='*')
    {
       /* if leading wildcard, start from end */
       cp2=str+strlen(str)-1;
diff -ur webalizer-2.23-05/sample.conf webalizer-2.23-05.new/sample.conf
--- webalizer-2.23-05/sample.conf	2011-01-03 05:53:33.000000000 +0100
+++ webalizer-2.23-05.new/sample.conf	2012-12-12 20:28:57.340669134 +0100
@@ -572,6 +572,9 @@
 #GroupSite	*.compuserve.com
 
 #GroupReferrer	yahoo.com/	Yahoo!
+# note that patterns starting with '@' are perl regular expressions
+#GroupReferrer	@www\.google\..* Google
+#GroupReferrer   www.bing.com/   Bing
 #GroupReferrer	excite.com/     Excite
 #GroupReferrer	infoseek.com/   InfoSeek
 #GroupReferrer	webcrawler.com/ WebCrawler

Bug#695798: webalizer: Matching referrers and sites for grouping is very primitive

Reply via email to