Package: webalizer Version: 2.23.05-1 Severity: wishlist Tags: upstream Hello, Trying to group the Google sites with webalizer is hell, as its matching algorithm is primitive. For example: www.google.co.uk, www.google.com and www.google.ie cannot be caught by using something like www.google.*
The attached (quick) patch allows Perl regular expressions for advanced matching and allows rules like: GroupReferrer @www\.google\..* Google GroupReferrer @[a-zA-Z\.]*freecode\.com/.* Freecode Regards, Nikos
diff -ur webalizer-2.23-05/configure.in webalizer-2.23-05.new/configure.in --- webalizer-2.23-05/configure.in 2011-01-03 05:53:44.000000000 +0100 +++ webalizer-2.23-05.new/configure.in 2012-12-12 19:50:55.604735560 +0100 @@ -119,6 +119,11 @@ AC_MSG_ERROR(z library not found.. please install libz) fi +AC_CHECK_LIB(pcre, main, PCRE="yes"; LIBS="-lpcre ${LIBS}", PCRE="no") +if test "${PCRE}" = "no"; then + AC_MSG_ERROR(libpcre was not found.. please install it!) +fi + AC_CHECK_HEADER(zlib.h, HDR="yes", HDR="no") if test "${HDR}" = "no"; then AC_MSG_ERROR(zlib.h header not found.. please install) diff -ur webalizer-2.23-05/linklist.c webalizer-2.23-05.new/linklist.c --- webalizer-2.23-05/linklist.c 2011-01-03 05:51:20.000000000 +0100 +++ webalizer-2.23-05.new/linklist.c 2012-12-12 20:22:42.652680042 +0100 @@ -32,6 +32,7 @@ #include <unistd.h> /* normal stuff */ #include <ctype.h> #include <sys/utsname.h> +#include <pcre.h> /* ensure sys/types */ #ifndef _SYS_TYPES_H @@ -52,6 +53,8 @@ #include "lang.h" #include "linklist.h" +#define OVECCOUNT 30 + /* internal function prototypes */ NLISTPTR new_nlist(char *); /* new list node */ @@ -274,9 +277,38 @@ int isinstr(char *str, char *cp) { char *cp1,*cp2; + char *pattern; + const char* error; + int erroffset, rc; + int ovector[OVECCOUNT]; + + pcre *re; cp1=(cp+strlen(cp))-1; - if (*cp=='*') + + if (*cp=='@') /* perl regex */ + { + pattern = ++cp; + + re = pcre_compile(pattern, 0, &error, &erroffset, NULL); + if (re == NULL) + { + fprintf(stderr, "Error in perl regex at offset %d: %s\n", erroffset, error); + return 0; + } + + rc = pcre_exec(re, NULL, str, strlen(str), 0, 0, ovector, OVECCOUNT); + if (rc < 0) + { + pcre_free(re); + return 0; + } + + /* fprintf(stderr, "\nMatch succeeded (%s l: %s) at offset %d\n", pattern, str, ovector[0]); */ + pcre_free(re); + return 1; + } + else if (*cp=='*') { /* if leading wildcard, start from end */ cp2=str+strlen(str)-1; diff -ur webalizer-2.23-05/sample.conf 
webalizer-2.23-05.new/sample.conf --- webalizer-2.23-05/sample.conf 2011-01-03 05:53:33.000000000 +0100 +++ webalizer-2.23-05.new/sample.conf 2012-12-12 20:28:57.340669134 +0100 @@ -572,6 +572,9 @@ #GroupSite *.compuserve.com #GroupReferrer yahoo.com/ Yahoo! +# note that patterns starting with '@' are perl regular expressions +#GroupReferrer @www\.google\..* Google +#GroupReferrer www.bing.com/ Bing #GroupReferrer excite.com/ Excite #GroupReferrer infoseek.com/ InfoSeek #GroupReferrer webcrawler.com/ WebCrawler