Package: visitors Version: 0.4a-1 Severity: normal
The parser in visitors does not correctly extract the User-Agent string from the normal log file format. In particular, it fails on some common cases, including one produced by IE ("contype"). As a result, browser statistics will be incomplete and less accurate. The attached patch is a hack that avoids rewriting too much code and seems to do a better job, but it needs proper inspection and possibly rewriting. The patch also includes fixes for two other bugs reported for visitors: long line lengths, and old lists of user agent strings. -- System Information: Debian Release: 3.1 APT prefers testing APT policy: (500, 'testing') Architecture: i386 (i686) Kernel: Linux 2.4.27-2-686-smp Locale: LANG=C, LC_CTYPE=C (charmap=ANSI_X3.4-1968) Versions of packages visitors depends on: ii libc6 2.3.2.ds1-22 GNU C Library: Shared libraries an -- no debconf information
--- visitors.c.orig Mon Jan 17 14:49:58 2005 +++ visitors.c Tue Mar 29 14:48:00 2005 @@ -763,7 +763,7 @@ int vi_parse_line(struct logline *ll, char *l) { char *date, *hour, *timezone, *host, *agent, *req, *ref, *p; - char *agent_start = NULL, *req_end = NULL, *ref_end = NULL; + char *req_end = NULL, *ref_end = NULL; /* Seek the start of the different components */ @@ -773,18 +773,26 @@ if ((date = strchr(l, '[')) == NULL) return 1; date++; /* agent */ - if ((agent = strchr(l, '(')) == NULL) { + + /* SRW original code sets agent="" if no "(" found. + * else sets "p" to the location of the agent opening " + * modified to find last " instead */ + + if ((agent = strrchr(l, '"')) == NULL) { agent = ""; } else { - p = agent; + *agent = '\0'; + p = agent - 1; while (p >= l) { if (*p == '"') { - agent_start = p; + agent = p+1; break; } p--; } - } + } + + /* req */ if ((req = strstr(l, "\"GET")) != NULL || (req = strstr(l, "\"POST")) != NULL || @@ -843,24 +851,6 @@ ref_end = p; *p = '\0'; } - /* agent */ - if ((p = strchr(agent, ')')) == NULL) { - agent = ""; - } else { - char *aux; - - aux = strchr(p, '"'); - if (aux) - *aux = '\0'; - else - *(p+1) = '\0'; - if (agent_start) { - if ((!req_end || (req_end != agent_start)) && - (!ref_end || (ref_end != agent_start))) { - agent = agent_start+1; - } - } - } /* Fill the struture */ ll->host = host; @@ -1040,13 +1030,23 @@ /* Note that the order matters. For example Safari * send an user agent where there is the string "Gecko" * so it must be before Gecko. */ + + /* Note that the parser looks for first "(" so we can't easily count + * Useragents that report without brackets like; + * "asteria", + * "contype", + * "Windows-MP", + * "Gigabot", + * etc - we should modify to find fifth '"' perhaps? 
*/ + char *browserslist[] = { "Opera", NULL, "MSIE", "Explorer", "Safari", NULL, "Konqueror", NULL, "Galeon", NULL, - "Gecko", "Netscape/Mozilla", + "Firefox", NULL, + "Gecko", "Other Gecko", "Wget", NULL, "Lynx", NULL, "Links ", "Links", @@ -1056,6 +1056,15 @@ "NATSU-MICAN", NULL, "www.googlebot.com", "GoogleBot", "www.google.com/bot", "GoogleBot", + "msnbot", "MSNbot", + "Slurp", "Yahoo Slurp", + "Jeeves", "Ask Jeeves", + "ZyBorg", NULL, + "asteria", NULL, + "contype", "Explorer", + "Gigabot", NULL, + "Windows-Media-Player", "Windows-MP", + "NSPlayer", NULL, "", "Unknown", NULL, NULL, }; @@ -1743,7 +1752,7 @@ r = (0xAA*val)/max; g = (0xBB*val)/max; b = (0xFF*val)/max; - fprintf(fp, "<td style=\"background-color: #%02X%02X%02X;\"> </td>", r, g, b); + fprintf(fp, "<td style=\"background-color: #%02X%02X%02X;\"> </td>\n", r, g, b); } fprintf(fp, "</tr>\n"); }