Package: visitors
Version: 0.4a-1
Severity: normal

The parser in visitors doesn't correctly extract the User Agent
string from the normal log file format.

In particular it fails on some common cases, including one produced
by IE (contype). As a result browser stats will be incomplete and
less accurate.

The attached patch is a hack that avoids rewriting too much code. It
seems to do a better job, but it needs proper inspection and
possibly rewriting.

The patch also includes fixes for two other bugs reported against visitors:
 - Overly long lines in the generated HTML output
 - Outdated list of user agent strings

-- System Information:
Debian Release: 3.1
  APT prefers testing
  APT policy: (500, 'testing')
Architecture: i386 (i686)
Kernel: Linux 2.4.27-2-686-smp
Locale: LANG=C, LC_CTYPE=C (charmap=ANSI_X3.4-1968)

Versions of packages visitors depends on:
ii  libc6                       2.3.2.ds1-22 GNU C Library: Shared libraries an

-- no debconf information
--- visitors.c.orig	Mon Jan 17 14:49:58 2005
+++ visitors.c	Tue Mar 29 14:48:00 2005
@@ -763,7 +763,7 @@
 int vi_parse_line(struct logline *ll, char *l)
 {
 	char *date, *hour, *timezone, *host, *agent, *req, *ref, *p;
-	char *agent_start = NULL, *req_end = NULL, *ref_end = NULL;
+	char *req_end = NULL, *ref_end = NULL;
 
 	/* Seek the start of the different components */
 
@@ -773,18 +773,26 @@
 	if ((date = strchr(l, '[')) == NULL) return 1;
 	date++;
 	/* agent */
-	if ((agent = strchr(l, '(')) == NULL) {
+
+	/* SRW original code sets agent="" if no "(" found.
+	 * else sets "p" to the location of the agent opening "
+	 * modified to find last " instead */
+	 
+	if ((agent = strrchr(l, '"')) == NULL) {
 		agent = "";
 	} else {
-		p = agent;
+		*agent = '\0';
+		p = agent - 1;
 		while (p >= l) {
 			if (*p == '"') {
-				agent_start = p;
+				agent = p+1;
 				break;
 			}
 			p--;
 		}
-	}
+	}   
+
+
 	/* req */
 	if ((req = strstr(l, "\"GET")) != NULL ||
 	    (req = strstr(l, "\"POST")) != NULL ||
@@ -843,24 +851,6 @@
 		ref_end = p;
 		*p = '\0';
 	}
-	/* agent */
-	if ((p = strchr(agent, ')')) == NULL) {
-		agent = "";
-	} else {
-		char *aux;
-
-		aux = strchr(p, '"');
-		if (aux)
-			*aux = '\0';
-		else
-			*(p+1) = '\0';
-		if (agent_start) {
-			if ((!req_end || (req_end != agent_start)) &&
-			    (!ref_end || (ref_end != agent_start))) {
-				agent = agent_start+1;
-			}
-		}
-	}
 
 	/* Fill the struture */
 	ll->host = host;
@@ -1040,13 +1030,23 @@
 	/* Note that the order matters. For example Safari
 	 * send an user agent where there is the string "Gecko"
 	 * so it must be before Gecko. */
+
+	/* Note that the parser looks for first "(" so we can't easily count
+	 * Useragents that report without brackets like;
+	 *  "asteria",
+	 *  "contype",
+	 *  "Windows-MP",
+	 *  "Gigabot",
+	 *  etc - we should modify to find fifth '"' perhaps? */
+ 
 	char *browserslist[] = {
 		"Opera", NULL,
 		"MSIE", "Explorer",
 		"Safari", NULL,
 		"Konqueror", NULL,
 		"Galeon", NULL,
-		"Gecko", "Netscape/Mozilla",
+		"Firefox", NULL,
+		"Gecko", "Other Gecko",
 		"Wget", NULL,
 		"Lynx", NULL,
 		"Links ", "Links",
@@ -1056,6 +1056,15 @@
 		"NATSU-MICAN", NULL,
 		"www.googlebot.com", "GoogleBot",
 		"www.google.com/bot", "GoogleBot",
+		"msnbot", "MSNbot",
+		"Slurp", "Yahoo Slurp",
+		"Jeeves", "Ask Jeeves",
+		"ZyBorg", NULL,
+		"asteria", NULL,
+		"contype", "Explorer",
+		"Gigabot", NULL,
+		"Windows-Media-Player", "Windows-MP",
+		"NSPlayer", NULL,
 		"", "Unknown",
 		NULL, NULL,
 	};
@@ -1743,7 +1752,7 @@
 			r = (0xAA*val)/max;
 			g = (0xBB*val)/max;
 			b = (0xFF*val)/max;
-			fprintf(fp, "<td style=\"background-color: #%02X%02X%02X;\">&nbsp;</td>", r, g, b);
+			fprintf(fp, "<td style=\"background-color: #%02X%02X%02X;\">&nbsp;</td>\n", r, g, b);
 		}
 		fprintf(fp, "</tr>\n");
 	}

Reply via email to