#!/home/antrix/local/bin/python
"""peastat - simple live web stats
http://www.throwingbeans.org/peastat/

instructions:
1. configure the 'logfile' and 'rooturl' values below
2. upload peastat.py somewhere it can be executed on your web server
   (e.g. your cgi-bin)
3. make peastat.py executable (set its permissions to 755)
"""

__version__ = "0.2.3"
__author__ = "Tom Dyson (tomdyson at spamcop dot net)\n Deepak Sarda (deepak at antrix.net)"
__copyright__ = "(C) 2005 Tom Dyson. GNU GPL 2."
__url__ = 'http://www.throwingbeans.org/peastat/'

import cgitb
cgitb.enable()
import cgi, os, re, time, urllib
try:
    import dbm # anydbm is unreliable...
except ImportError:
    import dumbdbm as dbm

# start configuring:
logfile = "/home/antrix/logs/antrix.net/http/access.log" # full path to log file
if not os.path.isfile(logfile): # fall back to the first rotated log
    logfile = logfile + '.0'
rooturl = "http://antrix.net" # root url of site whose logs we're analysing

# configure if you want to:
minresults = 5 # minimum results to include in overview
lastlines = 2000 # number of most recent requests to analyse
ispage = re.compile(r'(/|\.html|\.htm|\.php|\.comments)$').search # requests matching this regex count as pages
isspecial = re.compile(r'(\.avi|\.apk|\.ogg|\.gz|\.pdf|\.ps|\.zip|dumble/|dsarda_resume\.html)$').search # special resources where a single request should also be shown
ignorebots = re.compile('([Bb]ot)|([Rr]obot)|([Ss]urvey)|([Ss]pider)|([Cc]rawler)|([Ff]etch)|([Ss]cout)|(@)|(Java)|(Perl)').search # ignore bots & crawlers
ignorelines = re.compile(r'pea\.py').search # ignore lines matching this regex
recentreferrers = 10 # show this many recent referrers
recentsearches = 10 # show this many recent search terms
database = "/home/antrix/.antrix-peastat.db" # store DNS lookups here
# stop configuring

url = None; ip = None; atom = False
cgiloc = os.environ.get('SCRIPT_NAME', '')
request_uri = os.environ.get('REQUEST_URI', '')
server_name = os.environ.get('SERVER_NAME', '')
apachetoday = time.strftime('%d/%b/%Y') # todo ooh

form = cgi.FieldStorage()
if form.has_key("url"): url = form["url"].value
if form.has_key("ip"): ip = form["ip"].value
if form.has_key("atom"): atom = True

def justdomain(url):
    # Return only the domain of a URL
    try:
        return url.split('//')[1].split('/')[0]
    except IndexError: # catch evil referrers
        return 'bad referrer'

thisdomain = justdomain(rooturl)

def sortByValue(d):
    """Returns the keys of dictionary d sorted by their values, descending"""
    items = d.items()
    backitems = [[v[1], v[0]] for v in items]
    backitems.sort(); backitems.reverse()
    return [backitems[i][1] for i in range(0, len(backitems))]

def tailLines(filename, linesback):
    """python tail - modified from the recipe at
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/157035
    returns a list of the last [linesback] lines of [filename]"""
    avgcharsperline = 150
    file = open(filename, 'r')
    while 1:
        try:
            file.seek(-1 * avgcharsperline * linesback, 2)
        except IOError:
            file.seek(0)
        if file.tell() == 0:
            atstart = 1
        else:
            atstart = 0
        lines = file.read().split("\n")
        if (len(lines) > (linesback + 1)) or atstart:
            break
        # The lines are bigger than we thought
        avgcharsperline = avgcharsperline * 1.3 # Inc avg for retry
    file.close()
    if len(lines) > linesback:
        start = len(lines) - linesback - 1
    else:
        start = 0
    return lines[start:len(lines) - 1]

def simpleEncodeForGoogleChart(values):
    simpleEncoding = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    scale = len(simpleEncoding) - 1.0
    m = max(values)
    if m <= 0: return '' # nothing to plot; avoid dividing by zero
    return ''.join([simpleEncoding[int(round(scale * v / m))] if (v >= 0) else '_' for v in values])
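# A quick illustration of the encoder (hypothetical values, not from a real log):
# simpleEncodeForGoogleChart([0, 30, 123]) returns 'AP9' - each value is scaled
# against the maximum and mapped onto the 62-character simple-encoding alphabet
# used by the chart URL's chd=s: parameter; negative values encode as '_'.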
def timeSinceApacheDate(apacheDate):
    """Hours and minutes elapsed since an Apache timestamp,
    e.g. '21/Nov/2005:13:02:44' -> (3, 25) if it's now 16:27 that day."""
    then = time.strptime(apacheDate, '%d/%b/%Y:%H:%M:%S')
    then = time.mktime(then)
    now = time.mktime(time.localtime())
    minutesSince = (now - then) / 60
    hours, minutes = divmod(minutesSince, 60)
    return int(hours), int(minutes)

def getDNS(ip):
    # get the domain name, if we've seen it before
    try:
        db = dbm.open(database, "c")
        if db.has_key(ip):
            addr = db[ip]
        else:
            addr = ip
        db.close()
    except:
        addr = ip
    return addr

def getLogLines(logfile):
    try:
        logLines = tailLines(logfile, lastlines)
    except: # or try the system's tail
        logLines = os.popen('/usr/bin/tail -n ' + str(lastlines) + ' ' + logfile).readlines()
        if len(logLines) == 0: # can't handle popen exceptions properly
            raise Exception('No lines found')
    return logLines

loglines = getLogLines(logfile)

# The field positions below assume Apache's combined log format, e.g.
# 1.2.3.4 - - [21/Nov/2005:13:02:44 +0000] "GET /index.html HTTP/1.1" 200 5120 "http://example.com/" "Mozilla/5.0"
# so splitting on spaces puts the method at index 5, the resource at 6 and the
# status code at 8; the referrer and user agent sit between double quotes.

def getOverview():
    t0 = time.time()
    overview = {'cgiloc': cgiloc}
    hits = {}
    pagecount = 0
    overview["totalhits"] = len(loglines)
    referrers = []
    queries = {}
    lastres = None
    timeoffirsthit = loglines[0].split(' ')[3].replace('[', '')
    hourssince, minutessince = timeSinceApacheDate(timeoffirsthit)
    hitscountbyhour = [0 for i in range(hourssince + 1)]
    for line in loglines:
        chunks = line.split(' ')
        if 'POST' in chunks[5]: continue # don't want to see POST requests, which are mostly from spambots
        if chunks[8] == '301' or chunks[8] == '403': continue # ignore these HTTP codes
        resource = chunks[6]
        if (ispage(resource) or isspecial(resource)) and not ignorelines(line) and not ignorebots(line.split('"')[-2]):
            pagecount = pagecount + 1
            hits[resource] = hits.get(resource, 0) + 1
            hoursago, minutesago = timeSinceApacheDate(chunks[3].replace('[', ''))
            hitscountbyhour[hoursago] += 1
            lastres = resource
            line = line.replace('\\"', '"') # some agents include escaped quotes
            referrer = line.split('"')[-4]
            if len(referrer) > 1 and referrer.find(thisdomain) == -1:
                # count search queries: Yahoo uses 'p', most engines 'q', some 'query'
                querydict = cgi.parse_qs(referrer.split("?")[-1])
                if referrer.count(".yahoo."):
                    q = querydict.get("p")
                else:
                    q = querydict.get("q")
                if not q: q = querydict.get("query")
                if q:
                    q = q[0].lower()
                    queries[q] = queries.get(q, 0) + 1
                referrers.append([referrer, q])
    t1 = time.time()
    overview["timing"] = int((t1 - t0) * 1000)
    overview["logfile"] = logfile
    overview["timeoffirsthit"] = timeoffirsthit
    overview["hits"] = hits
    overview["lastrequest"] = lastres
    overview["pagecount"] = pagecount
    overview["referrers"] = referrers
    overview["queries"] = queries
    elapsedhours = hourssince + (float(minutessince) / 60)
    if elapsedhours == 0: elapsedhours = 1.0 / 60 # avoid dividing by zero on a brand-new log
    pagehitsperhour = pagecount / elapsedhours
    overview["hourssince"], overview["minutessince"] = hourssince, minutessince
    overview["pagehitsperhour"] = int(round(pagehitsperhour))
    overview["hitscountbyhour"] = hitscountbyhour
    return overview

def displayOverviewHTML(overview):
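    # Renders the front page: summary figures, an hourly hits sparkline,
    # popular pages, special requests, recent referrers and search terms.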
    print """<h2>Summary</h2>
First hit counted %(hourssince)s hours, %(minutessince)s minutes ago<br>
Total hits: %(totalhits)s<br>
Page hits: %(pagecount)s (%(pagehitsperhour)s per hour)<br>
Last page request: %(lastrequest)s <a href="%(cgiloc)s?url=%(lastrequest)s">details</a><br>
Processing time: %(timing)s ms<br>
""" % overview
    hitsStr = simpleEncodeForGoogleChart(overview['hitscountbyhour'])
    if hitsStr and len(hitsStr) > 1:
        print """<img src="http://chart.apis.google.com/chart?chs=400x100&amp;cht=ls&amp;chd=s:%s" alt="page hits by hour" />""" % (hitsStr,)
    print """<h2>Recent popular pages (%s or more requests)</h2>""" % minresults
    hits = overview["hits"]
    for res in sortByValue(hits):
        score = hits[res]
        if score >= minresults:
            print """%s: <a href="%s?url=%s">%s</a><br>""" % (res, overview["cgiloc"], urllib.quote(res), score)
print """
Recent special requests
"""
    for res in sortByValue(hits):
        if isspecial(res):
            print """%s: <a href="%s?url=%s">%s</a><br>""" % (res, overview["cgiloc"], urllib.quote(res), hits[res])
print """
%s recent referrers
""" % recentreferrers
    referrers = overview["referrers"]
    referrers.reverse() # newest first
    for referrer, query in referrers[0:recentreferrers]:
        referrer = referrer.replace("&", "&amp;")
        print """<a href="%s" title="%s">%s</a>""" % (referrer, referrer, justdomain(referrer))
        if query: print " - %s" % query
        print "<br>"
    print """<h2>%s recent popular search terms</h2>""" % recentsearches
    queries = overview["queries"]
    for query in sortByValue(queries)[0:recentsearches]:
        query_score = queries[query]
        quoted_query = query.replace('"', '%22') # make the term safe inside the href
        print """<a href="http://www.google.com/search?q=%(quoted_query)s">%(query)s</a>: %(query_score)s<br>""" % vars()
if url:
    print """<h2>Requests for <a href="%s">%s</a></h2>""" % (url, url)
    counter = 1
    referrers = []
    for line in loglines:
        resource = line.split(' ')[6]
        if resource == url and not ignorelines(line) and not ignorebots(line.split('"')[-2]):
            hittime = line.split(' ')[3].replace('[', '')
            if hittime.startswith(apachetoday): hittime = hittime.replace(apachetoday + ':', 'today, ')
            hitip = line.split(' ')[0]
            addr = getDNS(hitip)
            print """%(counter)s: %(hittime)s: <a href="%(cgiloc)s?ip=%(hitip)s">%(addr)s</a><br>""" % vars()
            counter = counter + 1
            line = line.replace('\\"', '"') # some agents include escaped quotes
            referrer = line.split('"')[-4]
            if len(referrer) > 1 and referrer.find(thisdomain) == -1:
                # count queries
                querydict = cgi.parse_qs(referrer.split("?")[-1])
                if referrer.count(".yahoo."):
                    q = querydict.get("p")
                else:
                    q = querydict.get("q")
                if not q: q = querydict.get("query")
                if q:
                    q = q[0].lower()
                referrers.append([referrer, q])
    print """<h2>%s recent referrers</h2>""" % recentreferrers
    referrers.reverse() # newest first
    for referrer, query in referrers[0:recentreferrers]:
        referrer = referrer.replace("&", "&amp;")
        print """<a href="%s" title="%s">%s</a>""" % (referrer, referrer, justdomain(referrer))
        if query: print " - %s" % query
        print "<br>"
if ip:
    addr = getDNS(ip)
    print """<h2>Visit details for %s</h2>
hostname: %s<br>
""" % (ip, addr)
    counter = 1; pagecounter = 1
    for line in loglines:
        address = line.split(' ')[0]
        if address == ip:
            hittime = line.split(' ')[3].replace('[', '')
            if hittime.startswith(apachetoday): hittime = hittime.replace(apachetoday + ':', 'today, ')
            resource = line.split(' ')[6]
            if counter == 1:
                referrer = line.split('"')[-4]
                user_agent = line.split('"')[-2]
                if len(user_agent) > 50: user_agent = user_agent[0:50].strip() + "..."
                if len(referrer) > 1:
                    print """referrer: <a href="%(referrer)s">%(referrer)s</a><br>""" % vars()
                print """browser: %(user_agent)s<br>""" % vars()
            if ispage(resource) or isspecial(resource):
                quotedresource = urllib.quote(resource)
                print """%(pagecounter)s: %(hittime)s: %(resource)s [<a href="%(cgiloc)s?url=%(quotedresource)s">details</a>]<br>""" % vars()
                pagecounter += 1
            counter += 1
print """<p>peastat %s &copy; tom dyson 2005 // <a href="%s">updates, bugs, suggestions</a></p>""" % (__version__, __url__)