MailArchive.py

Following a foray into long-term archival of web pages, this is alpha code that will download, re-format and archive a web page as e-mail, cobbled together from newspipe source code.

At the moment, this sends the result via SMTP to an arbitrary mailbox (it's easier to test with several MUAs that way, and there's no need to log in to an IMAP server). The end result will append to a given IMAP folder.
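
For the record, the IMAP half of that plan boils down to a few lines of imaplib (a minimal sketch, assuming a plaintext IMAP connection and a pre-existing folder; the server, credentials and folder name below are placeholders):

import imaplib, time

def append_to_imap(buffer, server='myserver', username='me', password='secret', folder='Archive'):
  # Append the raw RFC 2822 message to the folder, stamped with the current time
  imap = imaplib.IMAP4(server)
  imap.login(username, password)
  imap.append(folder, '', imaplib.Time2Internaldate(time.time()), buffer)
  imap.logout()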

Please note that this requires you to have Beautiful Soup, which is the clever bit that lets me parse whatever kind of HTML I come across into submission.
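
To illustrate (a throwaway snippet, not part of the script itself): Beautiful Soup will happily swallow tag soup that would choke a strict parser.

import BeautifulSoup

# Unclosed tags and unquoted attributes get repaired into a sane tree
soup = BeautifulSoup.BeautifulSoup('<p>Unclosed <b>tags and <img src=foo.png> stray markup')
print soup.prettify()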

Stuff to do:

  • Handle errors and redirects more gracefully
  • Fix UTF-8 issues on pages cobbled together from multiple charsets (like some planets)
  • Replace SMTP delivery with imaplib
  • Accept command-line parameters for URL, IMAP folder, username, password, etc. (see the sketch after this list)
  • Optionally prompt the user for authentication, tags, Subject:, etc. (most likely using Python's Tcl/Tk bindings).
  • Create an action script to invoke this (probably by querying the browser for the current page)
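
For the command-line bit, I'll probably end up with something like this (a rough optparse sketch; the flag names are provisional, not final):

from optparse import OptionParser

def parse_options():
  # Provisional flags - names and defaults are subject to change
  parser = OptionParser(usage='%prog [options] url')
  parser.add_option('-f', '--folder', default='Archive', help='IMAP folder to append to')
  parser.add_option('-u', '--username', help='IMAP username')
  parser.add_option('-p', '--password', help='IMAP password')
  (options, args) = parser.parse_args()
  if len(args) != 1:
    parser.error('expected exactly one URL')
  return (options, args[0])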

Changelog:

0.2

  • Appended linked CSS into a single inline style tag, since some MUAs are unable to deal with linked CSS delivered as CID references to MIME multipart data. The script now removes all non-screen CSS, since MUAs get awfully confused by it.
  • Script now parses CSS and re-formats url() calls to point to MIME multipart data (works fine so far, but must check with other MUAs)
  • Able to completely parse and download Slashdot's home page, its inline images, and images referenced from CSS! (was pretty tough going for a while).
  • Added X-Keywords header support.

0.1

  • First stab at the problem.

Source Code:

Yes, yes, I know I should just link to this. Bear with me.

#!/usr/bin/env python

"""Prototype Mail Archiver"""
__version__ = "0.2"
__author__ = "Rui Carmo (http://the.taoofmac.com)"
__copyright__ = "(C) 2006 Rui Carmo. Distributed under BSD license."
__contributors__ = "Based on newspipe source code."

import urllib2, urlparse, cStringIO, BeautifulSoup, sha, gzip, re
import base64, mimetools, MimeWriter
import smtplib, Queue

USER_AGENT='Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

class Gatherer:
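  """Fetches a web page plus its inline assets and reassembles them as a single MIME message."""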
  def __init__(self, url):
    self.url = url
    self.html = ''
    self.parts = Queue.Queue()

  def fetch(self, url, referrer=None):
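    # Fetch a single URL (inflating gzip responses) and return a dict with the
    # body and the headers we care about, or None on error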
    result = {'url':url}
    headers = {'User-Agent':USER_AGENT,'Accept-encoding':'gzip'}
    if referrer:
      headers['Referer'] = referrer
    req = urllib2.Request(url,None,headers)
    try:
      f = urllib2.urlopen(req)
    except urllib2.URLError, e:
      # Check for an HTTP error code first, since HTTPError may carry both attributes
      if hasattr(e, 'code'):
        print 'The server couldn\'t fulfill the request for %s' % url
        print 'Error code:', e.code
      elif hasattr(e, 'reason'):
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
      return None
    result['data'] = f.read()
    if hasattr(f,'headers'):
      if f.headers.get('content-encoding') == 'gzip':
        result['data'] = gzip.GzipFile(fileobj=cStringIO.StringIO(result['data'])).read()
      result['last-modified'] = f.headers.get('Last-Modified')
      result['content-type'] = f.headers.get('Content-Type')
    if hasattr(f,'url'):
      result['url'] = f.url
    if hasattr(f,'status'):
      result['status'] = f.status
    if hasattr(f,'code'):
      result['code'] = f.code
    f.close()
    return result

  def buildURL(self, url, referrer='', base=''):
    if base == '':
      result = urlparse.urljoin(referrer,url)
    else:
      result = urlparse.urljoin(base,url)
    return result

  def run(self,headers):
    self.spider()
    return self.assemble(headers)

  def filename(self,response):
    parts = urlparse.urlsplit(response['url'])
    try:
      # rsplit so that 'archive.tar.gz' yields 'gz' instead of 'tar.gz'
      (path,extension) = parts.path.rsplit('.',1)
    except ValueError:
      # No dot in the path - fall back to the MIME subtype, minus any parameters
      (dummy,extension) = response['content-type'].split('/',1)
      extension = extension.split(';')[0].strip()
    if extension == 'jpeg':
      extension = 'jpg'
    buffer = sha.sha(response['url'])
    result = buffer.hexdigest() + '.' + extension
    return result

  def spider(self):
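    # Fetch the page, strip active content, inline all screen CSS and queue
    # every referenced image as a future MIME part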
    parts = urlparse.urlsplit(self.url)
    if parts[0].lower() != 'http':
      return
    # Kick off fetching by getting the base URL
    response = self.fetch(self.url)
    if response is None:
      # fail silently
      return
    soup = BeautifulSoup.BeautifulSoup(response['data'])
    # Remove all scripting and other nuisances
    for script in soup('script'):
      script.extract()
    for embed in soup('embed'):
      embed.extract()
    for obj in soup('object'):
      obj.extract()
    for iframe in soup('iframe'):
      iframe.extract()
    # grab any base href
    base = ''
    try:
      node = soup('base')[0]
      base = node['href']
      node.extract()
    except (IndexError, KeyError):
      pass
    total_css = ''
    # Grab only screen CSS - which is what a browser would do
    for style in soup('link', rel='stylesheet', media=re.compile('screen')):
      url = self.buildURL(style['href'],self.url,base)
      css = self.fetch(url,self.url)
      if css is not None:
        name = self.filename(css)
        style['href'] = name
        # try grabbing images referenced in CSS (match one url() at a time, not greedily)
        for m in re.finditer(r"url\(([^)]+)\)",css['data']):
          rel = m.group(1)
          if rel.startswith("'") or rel.startswith('"'):
            rel = rel[1:-1]
          url = self.buildURL(rel,self.url,base)
          response = self.fetch(url,self.url)
          if response is not None:
            name = self.filename(response)
            css['data'] = css['data'].replace(rel, name)
            self.parts.put((name, {'data':response['data'],'content-type':response['content-type']}))
        # Accrete all stylesheets into a text buffer
        total_css = total_css + "\n" + css['data']
    # remove all CSS link tags from the document (they will only confuse the MUA)
    # (done once, after the loop, rather than on every iteration)
    for link in soup('link', rel='stylesheet'):
      link.extract()
    # Get the head tag
    head = soup('head')[0]
    # Assemble a style tag with the accreted CSS and insert it
    style = BeautifulSoup.Tag(soup,'style')
    css = BeautifulSoup.NavigableString(total_css)
    style.insert(0,css)
    head.insert(0,style)
    for img in soup('img'):
      src = img.get('src')
      if not src:
        continue
      url = self.buildURL(src,self.url,base)
      response = self.fetch(url,self.url)
      if response is not None:
        name = self.filename(response)
        img['src'] = name
        self.parts.put((name,{'data':response['data'],'content-type':response['content-type']}))
    self.html = soup.prettify()

  def assemble(self,headers):
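    # Message layout: multipart/alternative wrapping a multipart/related part that
    # holds the HTML plus each downloaded asset (referenced via Content-ID/Content-Location)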
    buffer = cStringIO.StringIO(self.html)
    out = cStringIO.StringIO() # output buffer for our message
    writer = MimeWriter.MimeWriter(out)
    for key in headers.keys():
      writer.addheader(key, headers[key])
    writer.addheader("MIME-Version", "1.0")
    writer.startmultipartbody("alternative", boundary="F7A30D4E-ED0B-4BBE-8A45-D4E88DBC2FBF")
    writer.flushheaders()
    # A Queue instance is always truthy, so test for emptiness explicitly
    has_parts = not self.parts.empty()
    if has_parts:
      htmlpart = writer.nextpart()
      htmlpart.startmultipartbody("related", boundary="F7A30D4E-ED0B-4BBE-8A45-3244235533221")
      subpart = htmlpart.nextpart()
    else:
      subpart = writer.nextpart()
    subpart.addheader("Content-Transfer-Encoding", "quoted-printable")
    pout = subpart.startbody("text/html", [("charset", 'utf-8')])
    mimetools.encode(buffer, pout, 'quoted-printable')
    if has_parts:
      while not self.parts.empty():
        (key,data) = self.parts.get()
        subpart = htmlpart.nextpart()
        subpart.addheader("Content-Transfer-Encoding", "base64")
        subpart.addheader("Content-ID", "<" + key + ">")
        subpart.addheader("Content-Location", key)
        subpart.addheader("Content-Disposition", "inline; filename=\"" + key + "\"" )
        f = subpart.startbody(data['content-type'], [["name", key]])
        b64 = base64.encodestring(data['data'])
        f.write(b64)
    if has_parts:
      htmlpart.lastpart()
    writer.lastpart()
    buffer = out.getvalue()
    out.close()
    return buffer

if __name__ == '__main__':
  url = 'http://slashdot.org'
  sender = recipient = 'me@myaccount'
  g = Gatherer(url)
  buffer = g.run({'Subject':'Archive of %s' % url, 'X-Keywords':'archive'})
  smtp = smtplib.SMTP('myserver')
  smtp.sendmail(sender,recipient,buffer)
  smtp.quit()
