PicoStats

This is a little script for extracting basic stats from Apache access logs that demonstrates a series of basic techniques: text parsing, boolean conditional expressions, objects as dictionary elements and computing a running average.

The Counter class maintains a set of running statistics (minimum, maximum, average and total) for whatever values you feed into it, and the LogAnalyzer class builds upon that to manage a dictionary of Counter objects, one for each User-Agent.

Since LogAnalyzer inherits from Counter, it can easily keep track of overall values as well.

#!/usr/bin/env python

"""Minimalist HTTP Log Analyzer for Python"""
__version__ = "0.2"
__author__ = "Rui Carmo (http://the.taoofmac.com)"
__copyright__ = "(C) 2005 Rui Carmo. Code under BSD License."

import sys, re, fileinput, string

class Counter:
  """Running statistics over a stream of numeric values.

  The class-level values below are the defaults seen before anything has
  been pushed; push() replaces them with per-instance values.

    total   -- sum of all pushed values
    count   -- number of values pushed
    average -- running mean of the pushed values
    last    -- most recently pushed value
    max     -- largest value pushed (0 until the first push)
    min     -- smallest value pushed (a large sentinel until the first push)
  """
  total = 0
  count = 0
  average = 0
  last = 0
  max = 0
  # sys.maxint only exists on Python 2; fall back to sys.maxsize elsewhere.
  # The "or" keeps sys.maxsize from being looked up when maxint is present.
  min = getattr(sys, "maxint", None) or sys.maxsize

  def push(self, value):
    """Fold one more value into the statistics.

    The average is updated incrementally, so no list of past values is
    kept. The very first value pushed becomes both min and max, which
    keeps the extremes correct even when every value is negative (or
    larger than the initial min sentinel).
    """
    first = self.count == 0
    self.average = self.average + ((value - self.average) / (self.count + 1.0))
    self.last = value
    self.total += value
    self.count += 1
    self.max = value if first or value > self.max else self.max
    self.min = value if first or value < self.min else self.min
  # end def
# end class

class LogAnalyzer(Counter):
  """Per-User-Agent byte statistics for matching requests in an access log.

  Each agent gets its own Counter in self.agents; because this class is
  itself a Counter, the same values are also accumulated as overall
  totals on the analyzer. self.table holds the summary built by the most
  recent dump() call.
  """

  def __init__(self, filename, regexp):
    """Parse every line of filename, counting requests whose URI matches regexp.

    filename -- passed straight to fileinput.input()
    regexp   -- pattern matched (anchored at the start) against the URI
    """
    # Per-instance dictionaries: as class attributes they would be shared
    # (and mutated) by every LogAnalyzer ever created.
    self.agents = {}
    self.table = {}
    self.regexp = re.compile(regexp)
    lines = fileinput.input(filename)
    try:
      for line in lines:
        self.match(line)
    finally:
      # Release the fileinput stream even if parsing fails; otherwise the
      # next fileinput.input() call raises "input() already active".
      lines.close()
  # end def

  def match(self, line):
    """Account for one log line if it is a successful, matching request.

    Fields are taken by position after whitespace splitting: index 6 is
    the URI, 8 the status code, 9 the byte count, and everything from
    index 11 onwards is the quoted User-Agent. Lines that are too short,
    or whose byte count is not a plain number (including "-"), are
    skipped instead of raising.
    """
    fields = line.split()
    if len(fields) < 10:
      return # truncated or malformed line
    uri = fields[6]
    code = fields[8]
    size = fields[9]
    # Re-join the agent (it may contain spaces) and drop the enclosing quotes.
    agent = " ".join(fields[11:])[1:-1]
    # Keep only the product name ahead of any "/version" suffix; splitting
    # is a no-op when the agent contains no slash.
    agent = agent.split("/")[0]
    if not self.regexp.match(uri):
      return
    if code != "200" or not size.isdigit():
      return
    size = int(size)
    if agent not in self.agents:
      self.agents[agent] = Counter()
    self.agents[agent].push(size)
    self.push(size)
  # end def

  def dump(self):
    """Print one "agent: count share" line per agent and store the rows in self.table.

    Agents responsible for less than 1 percent of the counted requests
    are folded into a single 'Others' row. Rows are printed in dictionary
    order.
    """
    # Rebuild from scratch so repeated calls never keep stale agent rows.
    table = {'Others': {'count': 0}}
    for agent in self.agents:
      seen = self.agents[agent].count
      share = seen * 1.0 / self.count
      if share < 0.01: # group all agents below 1 percent
        table['Others']['count'] += seen
      else:
        table[agent] = {'count': seen, 'percent': share}
    if self.count == 0:
      table['Others']['percent'] = 0
    else:
      table['Others']['percent'] = table['Others']['count'] * 1.0 / self.count
    self.table = table
    for agent in table:
      print("%s: %5d %0.2f" % (agent, table[agent]['count'], table[agent]['percent']))
  # end def
# end class

if __name__ == '__main__':
  # Count successful requests in ./access_log whose URI ends in "=rss" or
  # "=atom". The pattern is a raw string because "\=" is not a valid
  # string escape; the resulting pattern text is unchanged.
  analyzer = LogAnalyzer("access_log", r".*\=(rss|atom)$")
  analyzer.dump()

As is, the code will output something like this (columns aligned for readability):

NewsPipe:          94 0.01
Straw:             96 0.01
Mozilla:         2952 0.35
NetNewsWire:     3160 0.37
Others:           999 0.12
NewsFire:         764 0.09
kinjabot:         118 0.01
FeedOnFeeds:      138 0.02
libwww-perl:      184 0.02

I use this mostly to figure out RSS readership and (using the min and max member variables) to detect poorly implemented aggregators (if the average is too near the maximum value, then it's pretty much a given that that aggregator doesn't support gzip encoding or If-Modified-Since:).

Sorting the table and improving formatting is left as an exercise to the reader.

This page is referenced in: