Following an investigation into long-term archival of web pages, this is alpha Python code, cobbled together from newspipe source code, that downloads, re-formats and archives a web page as e-mail.
At the moment, this sends the result via SMTP to an arbitrary mailbox (it's easier to test with several MUAs that way, and there's no need to log in to an IMAP server). The end result will append to a given IMAP folder.
Please note that this requires you to have Beautiful Soup, which is the clever bit that lets me parse whatever kind of HTML I come across into submission.
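If you've never used it, here's a trivial, made-up example of the kind of taming it does (and of the tag stripping the script performs further down):

from BeautifulSoup import BeautifulSoup

# Beautiful Soup copes with unclosed tags and other broken markup
soup = BeautifulSoup('<p>unclosed <b>tags<script>nuisance()</script>')
for script in soup('script'):
    script.extract()   # same trick the archiver uses to drop scripting
print soup.prettify()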
Stuff to do:
- Handle errors and redirects more gracefully
- Fix UTF-8 issues on pages cobbled together from multiple charsets (like some planets)
- Replace SMTP delivery with imaplib (there's a rough sketch of that after this list)
- Accept command-line parameters for URL, IMAP folder, username, password, etc.
- Optionally prompt user for authentication, tags, Subject:, etc. (most likely using Mac OS X's Tcl/Tk bindings for Python).
- Create a Quicksilver action script to invoke this (probably using AppleScript to query Safari for the current page)
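As a rough idea of the imaplib delivery mentioned above, here's a minimal sketch. The host, folder, credentials and flags are all placeholder assumptions, not tested code:

import imaplib, time

def append_to_imap(message, folder='Archive'):
    # Placeholder host and credentials, substitute your own
    imap = imaplib.IMAP4_SSL('myserver')
    imap.login('username', 'password')
    # APPEND the assembled RFC 2822 message straight into the target folder
    imap.append(folder, '(\\Seen)', imaplib.Time2Internaldate(time.time()), message)
    imap.logout()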
Changelog:
0.2
- Linked CSS is now appended to a single inline style tag, since some MUAs are unable to deal with linked CSS as CIDs to MIME multipart data. The script also removes all non-screen CSS, since MUAs get awfully confused otherwise.
- Script now parses CSS and re-formats url() calls to point to MIME multipart data (works fine with Mail.app, must check with other MUAs; a stripped-down illustration follows this changelog)
- Able to completely parse and download Slashdot's home page, inline images, and CSS inline images! (was pretty tough going for a while).
- Added X-Keywords header for MailTags support.
0.1
- First stab at the problem.
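To make the url() rewriting in 0.2 concrete, here's a stripped-down illustration. The stylesheet and image path are invented, and the real script derives the name from the asset's absolute URL and content type rather than the relative path:

import re, sha

css = 'body { background: url("/images/bg.png"); }'
# Same greedy regex the script uses to find url() references
for m in re.finditer(r"url\((.+)\)", css):
    rel = m.group(1).strip('\'"')
    # Rename the asset to a digest-based filename, which later becomes
    # the Content-Location of its MIME part
    name = sha.sha(rel).hexdigest() + '.png'
    css = css.replace(rel, name)
print css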
Source Code:
Yes, yes, I know I should just link to this. Bear with me.
#!/usr/bin/env python
"""Prototype Mail Archiver"""

__version__ = "0.2"
__author__ = "Rui Carmo (http://the.taoofmac.com)"
__copyright__ = "(C) 2006 Rui Carmo. Distributed under BSD license."
__contributors__ = "Based on newspipe source code."

import urllib2, urlparse, cStringIO, BeautifulSoup, sha, gzip, re
import base64, mimetools, MimeWriter
import smtplib, Queue

USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'


class Gatherer:
    def __init__(self, url):
        self.url = url
        self.html = ''
        self.parts = Queue.Queue()

    def fetch(self, url, referrer=None):
        result = {'url': url}
        headers = {'User-Agent': USER_AGENT, 'Accept-encoding': 'gzip'}
        if referrer:
            headers['Referer'] = referrer
        req = urllib2.Request(url, None, headers)
        try:
            f = urllib2.urlopen(req)
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server couldn\'t fulfill the request for %s' % url
                print 'Error code:', e.code
            return None
        result['data'] = f.read()
        if hasattr(f, 'headers'):
            # Transparently decompress gzipped responses
            if f.headers.get('content-encoding') == 'gzip':
                result['data'] = gzip.GzipFile(fileobj=cStringIO.StringIO(result['data'])).read()
            result['last-modified'] = f.headers.get('Last-Modified')
            result['content-type'] = f.headers.get('Content-Type')
        if hasattr(f, 'url'):
            result['url'] = f.url
        if hasattr(f, 'status'):
            result['status'] = f.status
        if hasattr(f, 'code'):
            result['code'] = f.code
        f.close()
        return result

    def buildURL(self, url, referrer='', base=''):
        # Resolve a relative URL against the base href, if any, else the referrer
        if base == '':
            result = urlparse.urljoin(referrer, url)
        else:
            result = urlparse.urljoin(base, url)
        return result

    def run(self, headers):
        self.spider()
        return self.assemble(headers)

    def filename(self, response):
        # Name each part after the SHA-1 digest of its URL, keeping the extension
        parts = urlparse.urlsplit(response['url'])
        try:
            (path, extension) = parts.path.rsplit('.', 1)
        except ValueError:
            # No extension in the path, so fall back to the MIME subtype
            (dummy, extension) = response['content-type'].split('/', 1)
        if extension == 'jpeg':
            extension = 'jpg'
        buffer = sha.sha(response['url'])
        result = buffer.hexdigest() + '.' + extension
        return result

    def spider(self):
        parts = urlparse.urlsplit(self.url)
        if parts[0].lower() != 'http':
            return
        # Kick off fetching by getting the base URL
        response = self.fetch(self.url)
        if response == None:
            # fail silently
            return
        soup = BeautifulSoup.BeautifulSoup(response['data'])
        # Remove all scripting and other nuisances
        for script in soup('script'):
            script.extract()
        for embed in soup('embed'):
            embed.extract()
        for obj in soup('object'):
            obj.extract()
        for iframe in soup('iframe'):
            iframe.extract()
        # grab any base href
        base = ''
        try:
            node = soup('base')[0]
            base = node['href']
            node.extract()
        except (IndexError, KeyError):
            pass
        total_css = ''
        # Grab only screen CSS - which is what a browser would do
        for style in soup('link', rel='stylesheet', media=re.compile('screen')):
            url = self.buildURL(style['href'], self.url, base)
            css = self.fetch(url, self.url)
            if css != None:
                name = self.filename(css)
                style['href'] = name
                # try grabbing images referenced in CSS
                for m in re.finditer("url\((.+)\)", css['data']):
                    rel = m.group(1)
                    if rel.startswith("'") or rel.startswith('"'):
                        rel = rel[1:-1]
                    url = self.buildURL(rel, self.url, base)
                    response = self.fetch(url, self.url)
                    if response != None:
                        name = self.filename(response)
                        css['data'] = css['data'].replace(rel, name)
                        self.parts.put((name, {'data': response['data'], 'content-type': response['content-type']}))
                # self.parts[name] = {'data':css['data'],'content-type':css['content-type']}
                # Accrete all stylesheets into a text buffer
                total_css = total_css + "\n" + css['data']
        # remove all CSS link tags from the document (they will only confuse the MUA)
        for style in soup('link', {'rel': 'stylesheet'}):
            style.extract()
        # Get the head tag
        head = soup('head')[0]
        # Assemble a style tag with the accreted CSS and insert it
        style = BeautifulSoup.Tag(soup, 'style')
        css = BeautifulSoup.NavigableString(total_css)
        style.insert(0, css)
        head.insert(0, style)
        for img in soup('img'):
            url = self.buildURL(img['src'], self.url, base)
            response = self.fetch(url, self.url)
            if response != None:
                name = self.filename(response)
                img['src'] = name
                self.parts.put((name, {'data': response['data'], 'content-type': response['content-type']}))
        self.html = soup.prettify()

    def assemble(self, headers):
        buffer = cStringIO.StringIO(self.html)
        out = cStringIO.StringIO()  # output buffer for our message
        writer = MimeWriter.MimeWriter(out)
        for key in headers.keys():
            writer.addheader(key, headers[key])
        writer.addheader("MIME-Version", "1.0")
        writer.startmultipartbody("alternative", boundary="F7A30D4E-ED0B-4BBE-8A45-D4E88DBC2FBF")
        writer.flushheaders()
        # A Queue is always truthy, so test empty() to see if we have inline parts
        attachments = not self.parts.empty()
        if attachments:
            # Wrap the HTML and its assets in a multipart/related part
            htmlpart = writer.nextpart()
            htmlpart.startmultipartbody("related", boundary="F7A30D4E-ED0B-4BBE-8A45-3244235533221")
            subpart = htmlpart.nextpart()
        else:
            subpart = writer.nextpart()
        subpart.addheader("Content-Transfer-Encoding", "quoted-printable")
        pout = subpart.startbody("text/html", [("charset", 'utf-8')])
        mimetools.encode(buffer, pout, 'quoted-printable')
        while not self.parts.empty():
            (key, data) = self.parts.get()
            subpart = htmlpart.nextpart()
            subpart.addheader("Content-Transfer-Encoding", "base64")
            subpart.addheader("Content-ID", "<" + key + ">")
            subpart.addheader("Content-Location", key)
            subpart.addheader("Content-Disposition", "inline; filename=\"" + key + "\"")
            f = subpart.startbody(data['content-type'], [["name", key]])
            b64 = base64.encodestring(data['data'])
            f.write(b64)
        if attachments:
            htmlpart.lastpart()
        writer.lastpart()
        buffer = out.getvalue()
        out.close()
        return buffer


if __name__ == '__main__':
    url = 'http://slashdot.org'
    sender = 'me@myaccount'  # placeholder: the original sender address was elided
    recipient = 'me@myaccount'
    g = Gatherer(url)
    buffer = g.run({'Subject': 'Archive of %s' % url, 'X-Keywords': 'archive'})
    smtp = smtplib.SMTP('myserver')
    smtp.sendmail(sender, recipient, buffer)
    smtp.quit()