update-stats-geoip/.gitignore | 3 update-stats-geoip/README | 25 +++++ update-stats-geoip/analyser.py | 50 ++++++++++ update-stats-geoip/compiler.py | 203 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 281 insertions(+)
New commits: commit b81dbf61684484263690043909334f17f85709c5 Author: Andrzej Hunt <[email protected]> Date: Thu Jan 23 15:43:30 2014 +0000 Add update-stats-geoip for tracking of updates pings per country. Change-Id: Id0740a4ae484c96ad7456c1747c24ac5591966fa Reviewed-on: https://gerrit.libreoffice.org/7613 Reviewed-by: Michael Meeks <[email protected]> Tested-by: Michael Meeks <[email protected]> diff --git a/update-stats-geoip/.gitignore b/update-stats-geoip/.gitignore new file mode 100644 index 0000000..70ca242 --- /dev/null +++ b/update-stats-geoip/.gitignore @@ -0,0 +1,3 @@ +data +GeoIP.dat +storage-country-months diff --git a/update-stats-geoip/README b/update-stats-geoip/README new file mode 100644 index 0000000..163893f --- /dev/null +++ b/update-stats-geoip/README @@ -0,0 +1,25 @@ +Produces a csv list of pings per country from the Update pings. + + +1. Ensure you have the GeoLite Country Database, "GeoIP.dat", and place + it in the script folder: + + http://dev.maxmind.com/geoip/legacy/geolite/ + +2. Place the stats data in data/ (relative to the script folder) + +3. Run compiler.py -- this can take a few hours, and places its temporary data + in storage-country-months/ + +4. Run analyser.py (this is fast) + +If you have updated the ping data in data/, re-running compiler.py should take +much less time as it reuses the previously parsed data (although this isn't +fully tested yet). 
+ +To count unique IPs per month rather than update pings, for a quick hack solution +it would probably be sufficient to replace line 117: +ipHits[sReadableVersion][sIP] = ipHits.get(sReadableVersion, {}).get(sIP,0) + 1 +with +ipHits[sReadableVersion][sIP] = 1 + diff --git a/update-stats-geoip/analyser.py b/update-stats-geoip/analyser.py new file mode 100755 index 0000000..14ca245 --- /dev/null +++ b/update-stats-geoip/analyser.py @@ -0,0 +1,50 @@ +#!/usr/bin/python + +# Introduces Python 3 style print +from __future__ import print_function + +import argparse +import collections +import datetime +import cPickle as pickle +import os + +VERSIONLIST = set({"3.3", "3.4", "3.5", "3.6", "4.0", "4.1", "4.2"}) + +parser = argparse.ArgumentParser(description='Process LO Update Ping data') +#parser.addArgument('versions', metavar='V', type=int, nargs='+', + #help='The LO versions you would like to analyse (3.5, 3.6, 4.0, 4.1.)') + +args = parser.parse_args() + + +sPrefix = "country-months" + +#sSep = "\t" # Easier on the eyes +sSep = ";" # Easier on the software + +# Year/Month to start with. 
+year = 2011 +month = 1 + +sHeaderLine="Country" + sSep + "Version" + sSep + "YearMonth" + sSep + "hits" + +print(sHeaderLine) + +while (datetime.date(year, month, 1) + datetime.timedelta(days=20)) < datetime.date.today(): + month += 1 + if (month > 12): + year += 1 + month = 1 + aData = collections.defaultdict(dict) + for version in VERSIONLIST: + sFile = "storage-" + sPrefix + "/" + version + "/" + str(year) + "/" + str(month).zfill(2) + "/countryhits.dat" + if not os.path.exists(sFile): + continue; + with open(sFile, 'r') as aFile: + aData[version] = pickle.load(aFile) + + for sVersion in VERSIONLIST: + for sCountry,nHits in aData.get(sVersion, {}).iteritems(): + if (nHits > 300) and len(sCountry) > 0: + print(sCountry + sSep + sVersion + sSep + str(year)+str(month).zfill(2) + sSep + str(nHits)) diff --git a/update-stats-geoip/compiler.py b/update-stats-geoip/compiler.py new file mode 100755 index 0000000..7192b3e --- /dev/null +++ b/update-stats-geoip/compiler.py @@ -0,0 +1,203 @@ +#!/usr/bin/python +# +# You will need to get the GeoLite ASN database from: +# http://dev.maxmind.com/geoip/legacy/geolite/ +# +# No proper support for ipV6 addresses yet (i.e. geolocation/ISP data) -- they +# are in a separate db, and only available in newer version of python-geoip (i.e. +# would have to be built separately) so possibly not worth the bother yet? 
+ +# Introduces Python 3 style print +from __future__ import print_function + +import bz2 +import collections +import ConfigParser +import datetime +import GeoIP +import cPickle as pickle +import re +import sys +import time +import os.path + +#sResolution = "%W" # Split by week +sResolution = "%m" # Split by month + +sPrefix = "country-months" + + +sGEOIPFile = "GeoIP.dat" + +gi = GeoIP.open(sGEOIPFile,GeoIP.GEOIP_STANDARD) + +def getCountryForIP(sIP): + return str(gi.country_name_by_addr(sIP)) + +dataLessDates = { + datetime.date(2013,1,3), + datetime.date(2013,2,28), + datetime.date(2013,3,6), + datetime.date(2013,3,7), + datetime.date(2013,4,11), + datetime.date(2013,4,12), + datetime.date(2013,4,13), + datetime.date(2013,4,14), + datetime.date(2013,8,28), + datetime.date(2013,8,29), + datetime.date(2013,8,30), + datetime.date(2013,8,31), + datetime.date(2013,9,1),datetime.date(2013,9,2),datetime.date(2013,9,3),datetime.date(2013,9,4),datetime.date(2013,9,5),datetime.date(2013,9,6),datetime.date(2013,9,7), + datetime.date(2013,9,8),datetime.date(2013,9,9),datetime.date(2013,9,10) + } + +VERSIONLIST = set({"3.3", "3.4", "3.5", "3.6", "4.0", "4.1", "4.2"}) + +linePattern = re.compile('^([^ ]+) - - \[([^\/]+)\/([^\/]+)\/([^:]+):([0-9][0-9])[^\]]*\] "GET [^"]*" [^ ]+ [^ ]+ "[^"]*" "[^ ]* ([0-9]\.[0-9])[^(]*\(([^-;]+)[^;]*; ([^;]*);') + +print("*Analysing IPs...") + +config = ConfigParser.RawConfigParser() +config.read('storage-' + sPrefix + '/compiler.cfg') + + +if config.has_option('Main', 'last_year'): + currentFileDate = datetime.date(int(config.get('Main', 'last_year')), int(config.get('Main', 'last_month')), int(config.get('Main', 'last_day'))) +else: + currentFileDate = datetime.date(2012,04,27) + +def getCurrentFileName(): + return "data/" + "update.libreoffice.org-access_log-" + currentFileDate.strftime("%Y%m%d") + ".bz2" + +sKnownIPsLocation = "storage-" + sPrefix + "/knownIPs.dat" + +knownIPs = set() +if os.path.isfile(sKnownIPsLocation): + f = 
open(sKnownIPsLocation, 'r') + knownIPs = set(f.readlines()) + +ipHits = collections.defaultdict(dict) +currentWeek = "" + +newIPsOverall = set() # We keep a track of new IPs overall +newIPs = collections.defaultdict(set) # But also new IPs associated with what version they downloaded + +lastDate = datetime.date(1980,1,1) + +while os.path.isfile(getCurrentFileName()) or currentFileDate in dataLessDates: + print(getCurrentFileName()) + + if (currentFileDate in dataLessDates): + currentFileDate += datetime.timedelta(days=1) + continue + + with bz2.BZ2File(getCurrentFileName(), 'r') as aFile: + for line in aFile: + m = linePattern.split(line) + if len(m) > 1: + sIP = m[1] + sDay = m[2] + sMonth = m[3] + sYear = m[4] + + currentDate = datetime.date(int(sYear), time.strptime(sMonth,'%b').tm_mon, int(sDay)) + + # Store the week, reinitialise counts -- should be refactored out + if currentWeek != currentDate.strftime(sResolution): + # We need the week before, i.e the week we just parsed + # The exact date we hit doesn't matter, but we need to shift + # an entire week since e.g. 6 days of data could be missing + # and currentDate *could* be the last day of the week. 
+ storageDate = lastDate + if currentWeek != "": + for version in VERSIONLIST: + sDirectory = "storage-" + sPrefix + "/" + version + storageDate.strftime("/%Y/" + sResolution) + if not os.path.exists(sDirectory): + os.makedirs(sDirectory) + pickle.dump( ipHits.get(version, {}), open(sDirectory + "/iphits.dat", 'w' )) + + # Create country mapping here: + for sVersion in VERSIONLIST: + countryHits = {} # Reuse for every version to save memory + for sIP,nHits in ipHits.get(sVersion, {}).iteritems(): + sCountry = getCountryForIP(sIP) + if len(sCountry) > 0: + countryHits[sCountry] = countryHits.get(sCountry,0) + nHits + + sDirectory = "storage-" + sPrefix + "/" + sVersion + storageDate.strftime("/%Y/" + sResolution) + if not os.path.exists(sDirectory): + os.makedirs(sDirectory) + pickle.dump( countryHits, open(sDirectory + "/countryhits.dat", 'w' )) + print("storing "+ sDirectory + "/countryhits.dat") + + # Deal with new IPSortedByHits + # per version + for sVersion in VERSIONLIST: + sDirectory = "storage-" + sPrefix + "/" + version + storageDate.strftime("/%Y/" + sResolution) + file = open(sDirectory + "/newips.dat", 'w') + file.writelines( "%s\n" % item for item in newIPs[sVersion] ) + # And total + sDirectory = "storage-" + sPrefix + "/" + "overall" + storageDate.strftime("/%Y/" + sResolution) + if not os.path.exists(sDirectory): + os.makedirs(sDirectory) + file = open(sDirectory + "/newips.dat", 'w') + file.writelines( "%s\n" % item for item in newIPs ) + knownIPs.add(sIP) + + # Cleanup + currentWeek = currentDate.strftime(sResolution) + print("Now on week " + currentDate.strftime(sResolution + " of %Y")) + ipHits = collections.defaultdict(dict) + newIPsOverall = set() # We keep a track of new IPs overall + newIPs = collections.defaultdict(set) # But also new IPs associated with what version they downloaded + + # And read in existing data for this week + for version in VERSIONLIST: + sFile = "storage-" + sPrefix + "/" + version + currentDate.strftime("/%Y/" + 
sResolution) + "/iphits.dat" + if os.path.exists(sFile): + print("*********************************************************************************") + print("WARNING: data mis-ordered, we are reloading the following file, could be very wrong") + print(sFile) + #raw_input("Press Enter to continue: ") + ipHits[version] = pickle.load(open(sFile)) + #pickle.dump( ipHits.get(sVersion, {}), open(sDirectory + "/iphits.dat", 'w' )) + print("reading "+ sFile) + # Countries are reprocessed on every write, so we can ignore them + # Unique IPs will be a mess if data isn't ordered, so ignore for now... + # TODO: deal with ^^^ + + lastDate = currentDate + + #sHour = m[5] + #sOS = m[7] #Unused + sVersion = m[6] # Hash of version... + + if sVersion in VERSIONLIST: # Some people hit the url with a browser, so various UAs here.... + ipHits[sVersion][sIP] = ipHits.get(sVersion, {}).get(sIP,0) + 1 + + if sIP not in knownIPs: + newIPsOverall.add(sIP) + newIPs[sVersion].add(sIP) + # We don't add to the knownIPs list yet, as the data + # could get lost as we only write completed weeks + # of data, hence we update knownIPS in the week storage + # mechanism above + else: + print("Unknown version: " + line) + + currentFileDate += datetime.timedelta(days=1) + + +if not config.has_section('Main'): + config.add_section('Main') + +config.set('Main', 'last_year', currentFileDate.strftime("%Y")) +config.set('Main', 'last_month', currentFileDate.strftime("%m")) +config.set('Main', 'last_day', currentFileDate.strftime("%d")) + +config.write(open("storage-" + sPrefix + "/compiler.cfg", 'w')) + +file = open(sKnownIPsLocation, 'w') +file.writelines( "%s\n" % item for item in knownIPs ) + +print("*Completed successfully") \ No newline at end of file _______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
