#!/home/antrix/local/bin/python

"""peastat - simple live web stats

http://www.throwingbeans.org/peastat/

instructions:
1. configure the 'logfile' and 'rooturl' values below
2. upload peastat.py somewhere it can be executed on your web server
   (e.g. your cgi-bin)
3. make peastat.py executable (set its permissions to 755)
"""

__version__ = "0.2.3"
__author__ = "Tom Dyson (tomdyson at spamcop dot net)\n" \
             "Deepak Sarda (deepak at antrix.net)"
__copyright__ = "(C) 2005 Tom Dyson. GNU GPL 2."
__url__ = 'http://www.throwingbeans.org/peastat/'

import cgitb
cgitb.enable()

import cgi, os, re, time, urllib

try:
    import dbm  # anydbm is unreliable...
except ImportError:
    import dumbdbm as dbm

# start configuring:
logfile = "/home/antrix/logs/antrix.net/http/access.log"  # full path to log file
if not os.path.isfile(logfile):
    logfile = logfile + '.0'  # fall back to the most recently rotated log
rooturl = "http://antrix.net"  # root url of the site whose logs we're analysing

# configure if you want to:
minresults = 5    # minimum hits a page needs to appear in the overview
lastlines = 2000  # number of most recent requests to analyse
# requests matching this regex count as pages
ispage = re.compile(r'(/|\.html|\.htm|\.php|\.comments)$').search
# special resources for which even a single request should be shown
isspecial = re.compile(r'(\.avi|\.apk|\.ogg|\.gz|\.pdf|\.ps|\.zip|dumble/|dsarda_resume\.html)$').search
# ignore bots & crawlers (matched against the user agent)
ignorebots = re.compile(r'([Bb]ot)|([Rr]obot)|([Ss]urvey)|([Ss]pider)|([Cc]rawler)|([Ff]etch)|([Ss]cout)|(@)|(Java)|(Perl)').search
# ignore log lines matching this regex
ignorelines = re.compile(r'pea\.py').search
recentreferrers = 10  # show this many recent referrers
recentsearches = 10   # show this many recent search terms
database = "/home/antrix/.antrix-peastat.db"  # store DNS lookups here
# stop configuring

url = None; ip = None; atom = False
cgiloc = os.environ.get('SCRIPT_NAME', '')
request_uri = os.environ.get('REQUEST_URI', '')
server_name = os.environ.get('SERVER_NAME', '')
apachetoday = time.strftime('%d/%b/%Y')  # today's date, Apache log style

form = cgi.FieldStorage()
if form.has_key("url"): url = form["url"].value
if form.has_key("ip"): ip = form["ip"].value
if form.has_key("atom"): atom = True

def justdomain(url):
    """Return only the domain of a URL"""
    try:
        return url.split('//')[1].split('/')[0]
    except IndexError:  # catch evil referrers
        return 'bad referrer'

thisdomain = justdomain(rooturl)

def sortByValue(d):
    """Return the keys of dictionary d sorted by their values, largest first"""
    backitems = [[v[1], v[0]] for v in d.items()]
    backitems.sort()
    backitems.reverse()
    return [item[1] for item in backitems]

def tailLines(filename, linesback):
    """python tail - modified from the recipe at
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/157035
    returns a list of the last [linesback] lines of [filename]"""
    avgcharsperline = 150
    file = open(filename, 'r')
    while 1:
        try:
            file.seek(-1 * avgcharsperline * linesback, 2)
        except IOError:
            file.seek(0)
        if file.tell() == 0:
            atstart = 1
        else:
            atstart = 0
        lines = file.read().split("\n")
        if (len(lines) > (linesback + 1)) or atstart:
            break
        # the lines are bigger than we thought: raise the average and retry
        avgcharsperline = int(avgcharsperline * 1.3)
    file.close()
    if len(lines) > linesback:
        start = len(lines) - linesback - 1
    else:
        start = 0
    return lines[start:len(lines) - 1]

def simpleEncodeForGoogleChart(values):
    """Encode a list of numbers using the Google Chart API's 'simple' encoding"""
    simpleEncoding = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    scale = len(simpleEncoding) - 1.0
    m = max(values)
    if m <= 0:
        m = 1  # avoid dividing by zero when there are no hits yet
    return ''.join([simpleEncoding[int(round(scale * v / m))] if (v >= 0) else '_'
                    for v in values])
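# A quick sketch of the two helpers above, shown doctest-style with
# hypothetical inputs (illustration only, not executed):
#
#   >>> sortByValue({'/a.html': 3, '/b.html': 7})
#   ['/b.html', '/a.html']
#   >>> simpleEncodeForGoogleChart([0, 30, 61])
#   'Ae9'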
def timeSinceApacheDate(apacheDate):
    """Return (hours, minutes) elapsed since an Apache-format timestamp"""
    then = time.strptime(apacheDate, '%d/%b/%Y:%H:%M:%S')
    then = time.mktime(then)
    now = time.mktime(time.localtime())
    minutesSince = (now - then) / 60
    hours, minutes = divmod(minutesSince, 60)
    return int(hours), int(minutes)

def getDNS(ip):
    """Return the cached hostname for ip, if we've looked it up before"""
    try:
        db = dbm.open(database, "c")
        if db.has_key(ip):
            addr = db[ip]
        else:
            addr = ip
        db.close()
    except:
        addr = ip
    return addr

def getLogLines(logfile):
    try:
        logLines = tailLines(logfile, lastlines)
    except:  # or try the system's tail
        logLines = os.popen('/usr/bin/tail -n ' + str(lastlines) + ' ' + logfile).readlines()
        if len(logLines) == 0:  # can't handle popen exceptions properly
            raise Exception('No lines found')
    return logLines

loglines = getLogLines(logfile)

def getOverview():
    t0 = time.time()
    overview = {'cgiloc': cgiloc}
    hits = {}
    pagecount = 0
    lastres = ''  # last page requested, if any
    overview["totalhits"] = len(loglines)
    referrers = []
    queries = {}
    timeoffirsthit = loglines[0].split(' ')[3].replace('[', '')
    hourssince, minutessince = timeSinceApacheDate(timeoffirsthit)
    hitscountbyhour = [0 for i in range(hourssince + 1)]
    for line in loglines:
        chunks = line.split(' ')
        if 'POST' in chunks[5]:
            continue  # skip POST requests, which are mostly from spambots
        if chunks[8] == '301' or chunks[8] == '403':
            continue  # ignore redirects and forbidden requests
        resource = chunks[6]
        if (ispage(resource) or isspecial(resource)) and not ignorelines(line) \
                and not ignorebots(line.split('"')[-2]):
            pagecount = pagecount + 1
            hits[resource] = hits.get(resource, 0) + 1
            hr, mins = timeSinceApacheDate(line.split(' ')[3].replace('[', ''))
            hitscountbyhour[hr] += 1
            lastres = resource
            line = line.replace('\\"', '"')  # some agents include escaped quotes
            referrer = line.split('"')[-4]
            if len(referrer) > 1 and referrer.find(thisdomain) == -1:
                # count search engine queries
                querydict = cgi.parse_qs(referrer.split("?")[-1])
                if referrer.count(".yahoo."):
                    q = querydict.get("p")
                else:
                    q = querydict.get("q")
                if not q:
                    q = querydict.get("query")
                if q:
                    q = q[0].lower()
                    queries[q] = queries.get(q, 0) + 1
                referrers.append([referrer, q])
    t1 = time.time()
    overview["timing"] = int((t1 - t0) * 1000)
    overview["logfile"] = logfile
    overview["timeoffirsthit"] = timeoffirsthit
    overview["hits"] = hits
    overview["lastrequest"] = lastres
    overview["pagecount"] = pagecount
    overview["referrers"] = referrers
    overview["queries"] = queries
    hourselapsed = hourssince + (float(minutessince) / 60)
    if hourselapsed == 0:
        hourselapsed = 1.0 / 60  # avoid dividing by zero on a brand-new log
    overview["hourssince"], overview["minutessince"] = hourssince, minutessince
    overview["pagehitsperhour"] = int(round(pagecount / hourselapsed))
    overview["hitscountbyhour"] = hitscountbyhour
    return overview
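# getOverview() above assumes Apache's "combined" log format. A made-up
# example of the line shape the parser expects, and the fields it indexes:
#
#   1.2.3.4 - - [01/Jan/2005:10:20:30 +0000] "GET /index.html HTTP/1.1" 200 1234 "http://www.google.com/search?q=peas" "Mozilla/5.0 ..."
#
#   chunks[0]           -> client IP address
#   chunks[3]           -> timestamp (with a leading '[')
#   chunks[5]           -> request method (with a leading '"')
#   chunks[6]           -> requested resource
#   chunks[8]           -> HTTP status code
#   line.split('"')[-4] -> referrer; line.split('"')[-2] -> user agent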
def displayOverviewHTML(overview):
    print """<h2>Summary</h2>
First hit counted %(hourssince)s hours, %(minutessince)s minutes ago<br />
Total hits: %(totalhits)s<br />
Page hits: %(pagecount)s (%(pagehitsperhour)s per hour)<br />
Last page request: %(lastrequest)s <a href="%(cgiloc)s?url=%(lastrequest)s">details</a><br />
Processing time: %(timing)s ms<br />""" % overview
    hitsStr = simpleEncodeForGoogleChart(overview['hitscountbyhour'])
    if hitsStr and len(hitsStr) > 1:
        # sparkline of page hits per hour, drawn by the Google Chart API
        print """<h2>Hits per hour</h2>
<img src="http://chart.apis.google.com/chart?cht=ls&amp;chs=400x100&amp;chd=s:%s" />""" % (hitsStr,)
    print """<h2>Recent popular pages (%s or more requests)</h2>""" % minresults
    hits = overview["hits"]
    for res in sortByValue(hits):
        score = hits[res]
        if score >= minresults:
            print """%s: <a href="%s?url=%s">%s</a><br />""" % (
                res, overview["cgiloc"], urllib.quote(res), score)
    print """<h2>Recent special requests</h2>"""
    for res in sortByValue(hits):
        if isspecial(res):
            print """%s: <a href="%s?url=%s">%s</a><br />""" % (
                res, overview["cgiloc"], urllib.quote(res), hits[res])
    print """<h2>%s recent referrers</h2>""" % recentreferrers
    referrers = overview["referrers"]
    referrers.reverse()
    for referrer, query in referrers[0:recentreferrers]:
        referrer = referrer.replace("&", "&amp;")
        print """<a href="%s" title="%s">%s</a>""" % (referrer, referrer, justdomain(referrer))
        if query:
            print " - %s" % query
        print "<br />"
    print """<h2>%s recent popular search terms</h2>""" % recentsearches
    queries = overview["queries"]
    for query in sortByValue(queries)[0:recentsearches]:
        query_score = queries[query]
        quoted_query = query.replace('"', '%22')
        print """<a href="http://www.google.com/search?q=%(quoted_query)s">%(query)s</a>: %(query_score)s<br />""" % vars()
" def urldetails(url, cgiloc): print """

Requests for %s

""" % (url, url) counter = 1 referrers = [] for line in loglines: resource = line.split(' ')[6] if resource == url and not ignorelines(line) and not ignorebots(line.split('"')[-2]): time = line.split(' ')[3].replace('[','') if time.startswith(apachetoday): time = time.replace(apachetoday +':','today, ') ip = line.split(' ')[0] addr = getDNS(ip) print """%(counter)s: %(time)s: %(addr)s
""" % vars() counter = counter + 1 line = line.replace('\\"','"') # some agents include escaped quotes referrer = line.split('"')[-4] if len(referrer) > 1 and referrer.find(thisdomain) == -1: # count queries querydict = cgi.parse_qs(referrer.split("?")[-1]) if referrer.count(".yahoo."): q = querydict.get("p") else: q = querydict.get("q") if not q: q = querydict.get("query") if q: q = q[0].lower() referrers.append([referrer, q]) print "

" print """

%s recent referrers
""" % recentreferrers referrers.reverse() for referrer, query in referrers[0:recentreferrers]: referrer = referrer.replace("&","&") print """%s""" % (referrer, referrer, justdomain(referrer)) if query: print " - %s" % query print "
" print "

" def ipdetails(ip, cgiloc): import socket try: addr = socket.gethostbyaddr(ip)[0] except: addr = 'unknown host' if addr != 'unknown host': # add successful lookups to the DNS cache try: db = dbm.open(database, "c") db[ip] = addr db.close() except: pass # fail silently - lots of things could have gone wrong... print """

Visit details for %s
hostname: %s
""" % (ip, addr) counter = 1; pagecounter = 1 for line in loglines: address = line.split(' ')[0] if address == ip: time = line.split(' ')[3].replace('[','') if time.startswith(apachetoday): time = time.replace(apachetoday +':','today, ') resource = line.split(' ')[6] if counter == 1: referrer = line.split('"')[-4] user_agent = line.split('"')[-2] if len(user_agent) > 50: user_agent = user_agent[0:50].strip() + "..." if len(referrer) > 1: print """referrer: %(referrer)s
""" % vars() print """browser: %(user_agent)s

""" % vars() if ispage(resource) or isspecial(resource): quotedresource = urllib.quote(resource) print """%(pagecounter)s: %(time)s: %(resource)s [details]
""" % vars() pagecounter += 1 counter += 1 print "

" def header(): print "Content-type: text/html\n\n" print """ peastat """ % (server_name, request_uri) print """
""" print """

peastat

""" % (cgiloc) def footer(): print """

peastat %s © tom dyson 2005 // updates, bugs, suggestions

""" % (__version__) def atomHeader(): basehref = server_name + request_uri cleanurl = rooturl.replace("http://","") timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ") print """Content-type: application/atom+xml\n\n peastat for %(cleanurl)s %(timestamp)s peastat http://%(basehref)s/""" % vars() def atomSummary(overview): basehref = server_name + request_uri cleanurl = rooturl.replace("http://","") timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ") print """ Summary for %(cleanurl)s http://%(basehref)s %(timestamp)s
""" % vars() displayOverviewHTML(overview) print """
""" def atomFooter(): print "
" if __name__ == "__main__": if atom: atomHeader() overview = getOverview() atomSummary(overview) atomFooter() else: header() if url: urldetails(url, cgiloc) elif ip: ipdetails(ip, cgiloc) else: overview = getOverview() displayOverviewHTML(overview) footer()