This is a little script for extracting basic stats from Apache access logs that demonstrates a series of basic Python techniques: text parsing, boolean conditional expressions, objects as dictionary elements and computing a running average.
The Counter class maintains a series of stats (minimum, maximum, average and total values) for whatever you feed into it, and the LogAnalyzer class builds upon that to manage a dictionary of Counter objects, one for each User-Agent.
Since LogAnalyzer inherits from Counter, it can easily keep track of overall values as well.
#!/usr/bin/env python
"""Minimalist HTTP Log Analyzer for Python"""
__version__ = "0.2"
__author__ = "Rui Carmo (http://the.taoofmac.com)"
__copyright__ = "(C) 2005 Rui Carmo. Code under BSD License."
import sys, re, fileinput, string
class Counter:
    """Running statistics over a stream of numeric values.

    Every attribute below is updated by push():

    total   -- sum of all values pushed so far
    count   -- number of values pushed so far
    average -- running mean, updated incrementally on each push
    last    -- the most recently pushed value
    max     -- largest value pushed (0 until the first push)
    min     -- smallest value pushed (a large sentinel until the first push)
    """
    total = 0
    count = 0
    average = 0
    last = 0
    max = 0
    # sys.maxint only exists on Python 2; fall back to sys.maxsize so the
    # class can still be defined on interpreters that lack it.
    min = getattr(sys, "maxint", None) or sys.maxsize

    def push(self, value):
        """Fold value into the running statistics.

        value -- a number; it becomes self.last and is added to self.total.
        """
        if self.count == 0:
            # The first sample is both extremes. Comparing it against the
            # class-level defaults would report max == 0 for all-negative
            # input and ignore a first value larger than the min sentinel.
            self.max = value
            self.min = value
        else:
            self.max = max(self.max, value)
            self.min = min(self.min, value)
        # Incremental mean: shift the old average towards the new value by
        # 1/n of the difference, so no list of samples has to be kept.
        self.average = self.average + ((value - self.average) / (self.count + 1.0))
        self.last = value
        self.total += value
        self.count += 1
class LogAnalyzer(Counter):
    """Per-User-Agent response-size statistics for an access log.

    The analyzer is itself a Counter holding the overall figures, and keeps
    one Counter per User-Agent in self.agents. dump() summarises them into
    self.table and prints the result.
    """
    # Class-level defaults are kept for backward compatibility; __init__ gives
    # every instance its own dictionaries so analyzers never share results.
    agents = {}
    table = {}

    def __init__(self, filename, regexp):
        """Read filename and feed every line to match().

        filename -- passed unchanged to fileinput.FileInput()
        regexp   -- pattern applied with re.match() to each request URI;
                    only matching requests are counted
        """
        self.agents = {}
        self.table = {}
        self.regexp = re.compile(regexp)
        reader = fileinput.FileInput(filename)
        try:
            for line in reader:
                self.match(line)
        finally:
            # Close the input even when a line makes match() raise.
            reader.close()

    def match(self, line):
        """Count one log line if it is a successful hit on a matching URI.

        Fields are taken by whitespace position: index 6 is the URI, 8 the
        status code, 9 the response size and 11 onwards the quoted
        User-Agent. Lines with too few fields, a status other than "200",
        or a size that is "-" or not an integer are ignored.
        """
        fields = line.split()
        if len(fields) < 10:
            return  # truncated or malformed line: nothing to count
        uri = fields[6]
        code = fields[8]
        size = fields[9]
        # Drop the surrounding quotes, then keep only the text before the
        # first "/" (e.g. "Mozilla/5.0 ..." becomes "Mozilla").
        agent = " ".join(fields[11:])[1:-1]
        if "/" in agent:
            agent = agent.split("/")[0]
        if not self.regexp.match(uri):
            return
        if code != "200" or size == "-":
            return
        try:
            size = int(size)
        except ValueError:
            return  # size field is not a number: treat the line as malformed
        if agent not in self.agents:
            self.agents[agent] = Counter()
        self.agents[agent].push(size)
        self.push(size)

    def dump(self):
        """Print one "agent: count share" line per row of the summary table.

        Agents responsible for less than 1 percent of the counted requests
        are folded into a single 'Others' row. The rows are stored in
        self.table, which is rebuilt on every call so that rows from an
        earlier dump() cannot linger.
        """
        total = self.count
        table = {'Others': {'count': 0, 'percent': 0}}
        others = table['Others']
        for agent in self.agents:
            stats = self.agents[agent]
            percent = 0
            if total:
                percent = stats.count * 1.0 / total
            if percent < 0.01:  # group all agents below 1 percent
                others['count'] += stats.count
            else:
                row = table.setdefault(agent, {})
                row['count'] = stats.count
                row['percent'] = percent
        if total:
            table['Others']['percent'] = table['Others']['count'] * 1.0 / total
        self.table = table
        for agent in table:
            # A single parenthesised argument prints identically whether
            # print is a statement or a function.
            print("%s: %5d %0.2f" % (agent, table[agent]['count'], table[agent]['percent']))
if __name__ == '__main__':
    # Count successful requests whose URI ends in "=rss" or "=atom".
    # The log path may be given as the first command-line argument and
    # defaults to "access_log" in the current directory.
    if len(sys.argv) > 1:
        log_path = sys.argv[1]
    else:
        log_path = "access_log"
    # Raw string: "\=" is not a valid string escape, so the backslash must
    # reach the regex engine untouched.
    o = LogAnalyzer(log_path, r".*\=(rss|atom)$")
    o.dump()
As is, the code will output something like this (columns aligned for readability):
NewsPipe:      94 0.01
Straw:         96 0.01
Mozilla:     2952 0.35
NetNewsWire: 3160 0.37
Others:       999 0.12
NewsFire:     764 0.09
kinjabot:     118 0.01
FeedOnFeeds:  138 0.02
libwww-perl:  184 0.02
I use this mostly to figure out RSS readership and (using the min and max member variables) to detect poorly implemented aggregators (if the average is too near the maximum value, then it's pretty much a given that that aggregator doesn't support gzip encoding or If-Modified-Since:).
Sorting the table and improving formatting is left as an exercise to the reader.