#!/usr/bin/env python
"""Convert snippets of HTML into Textile markup.

The parser is built on the standard library ``HTMLParser`` class and
dispatches every tag to a ``start_<tag>`` / ``end_<tag>`` method when the
class defines one.  Tags without such methods are dropped unless they are
listed in ``valid_tags``.

The original implementation was built on ``sgmllib``, which no longer exists
in Python 3; its start/end dispatch and tag balancing are reproduced here.
"""

import re
import string  # unused by this module; retained from the original import list
import sys

try:  # Python 3
    from html.entities import entitydefs as _entitydefs
    from html.parser import HTMLParser
except ImportError:  # Python 2 fallback -- NOTE(review): not exercised, confirm
    from htmlentitydefs import entitydefs as _entitydefs
    from HTMLParser import HTMLParser

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# Both text types on Python 2; collapses to (str, str) on Python 3.
_string_types = (str, type(u""))

whitespace_re = re.compile(r"\s+")


def normalise_space(s):
    """Normalise space in the same manner as HTML.

    Any substring of multiple whitespace characters will be replaced with a
    single space char.  Non-string arguments are converted with ``str``.
    """
    if not isinstance(s, _string_types):
        s = str(s)
    return whitespace_re.sub(" ", s)


def _escape_attr(value):
    """Escape *value* so it can sit inside a double-quoted HTML attribute."""
    return (
        value.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )


def make_block_start_end_pair(tag):
    """Return ``(start, end)`` handler methods for a Textile block tag.

    The start handler writes the ``"<tag>. "`` prefix and begins a capture;
    the end handler flushes the capture and writes a blank line.
    """

    def start_t(self, attrs):
        self._write("%s. " % tag)
        self._start_capture(tag)

    def end_t(self):
        self._stop_capture_and_write()
        self._write("\n\n")

    return start_t, end_t


def make_quicktag_start_end_pair(tag, wrapchar):
    """Return ``(start, end)`` handler methods for an inline "quicktag".

    The captured text is wrapped in *wrapchar*, with a space on the outside
    of each marker because ``handle_data`` strips the surrounding text.
    """

    def start_t(self, attrs):
        self._write([" ", wrapchar])
        self._start_capture(tag)

    def end_t(self):
        self._stop_capture_and_write()
        self._write([wrapchar, " "])

    return start_t, end_t


class HtmlToTextileConvertingParser(HTMLParser):
    """An HTML parser which converts HTML tags into Textile markup.

    Feed markup with ``feed()``, call ``close()``, then read ``result``.

    Subclasses may set ``valid_tags`` / ``valid_attrs`` to let selected
    tags and attributes pass through as literal HTML.

    Note: ``_in_block`` records the tag of the most recent capture but is
    not consulted anywhere in this class, so nested block tags are not
    actually suppressed.
    """

    valid_tags = ()
    valid_attrs = ()
    block_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "bq")
    entitydefs = _entitydefs

    # Elements that never have an end tag; they are emitted but not tracked
    # as open, so they cannot swallow a later end tag.
    _void_tags = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self):
        HTMLParser.__init__(self)
        # Keep character/entity references flowing through handle_charref
        # and handle_entityref instead of being folded into handle_data.
        self.convert_charrefs = False
        self._result = []
        self._data_stack = []
        self._open_tags = []
        self._in_block = self._in_ul = self._in_ol = False
        self.a_href = False

    # -- parser callbacks --------------------------------------------------

    def handle_data(self, data):
        """Write text with whitespace collapsed and the ends stripped."""
        if data:
            self._write(normalise_space(data).strip())

    def handle_charref(self, tag):
        """Write the character for a numeric reference (decimal or hex).

        A reference that cannot be converted is written back verbatim.
        """
        try:
            if tag[:1] in ("x", "X"):
                codepoint = int(tag[1:], 16)
            else:
                codepoint = int(tag)
            char = _unichr(codepoint)
        except (ValueError, OverflowError):
            self._write("&#%s;" % tag)
        else:
            self._write(char)

    def handle_entityref(self, tag):
        """Write the replacement text of a known entity; drop unknown ones."""
        if tag in self.entitydefs:
            self._write(self.entitydefs[tag])

    def handle_starttag(self, tag, attrs):
        """Dispatch to ``start_<tag>`` or fall back to ``unknown_starttag``."""
        method = getattr(self, "start_" + tag, None)
        if method is None:
            self.unknown_starttag(tag, attrs)
            return
        method(dict(attrs))
        if tag not in self._void_tags:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        """Close *tag*, first closing any elements still open inside it."""
        if tag not in self._open_tags:
            # A stray end tag for a handled element is ignored; one for an
            # unhandled element is offered to unknown_endtag.
            if getattr(self, "end_" + tag, None) is None:
                self.unknown_endtag(tag)
            return
        while self._open_tags:
            open_tag = self._open_tags.pop()
            self._close_tag(open_tag)
            if open_tag == tag:
                break

    def close(self):
        """Finish parsing and close every element that is still open.

        Without this, text captured inside an unclosed element would never
        reach ``result``.
        """
        HTMLParser.close(self)
        while self._open_tags:
            self._close_tag(self._open_tags.pop())

    def _close_tag(self, tag):
        """Run the ``end_<tag>`` handler, or ``unknown_endtag`` if absent."""
        method = getattr(self, "end_" + tag, None)
        if method is None:
            self.unknown_endtag(tag)
        else:
            method()

    # -- output buffering --------------------------------------------------

    def _write(self, d):
        """Append *d* (a string, or a list/tuple of strings) to the output.

        With fewer than two captures active the text goes straight to the
        result; otherwise it goes to the innermost capture buffer.
        """
        if len(self._data_stack) < 2:
            target = self._result
        else:
            target = self._data_stack[-1]
        if isinstance(d, (list, tuple)):
            target.extend(d)
        elif isinstance(d, _string_types):
            target.append(d)
        else:
            target.append(str(d))

    def _start_capture(self, tag):
        """Begin a new capture buffer for *tag*."""
        self._in_block = tag
        self._data_stack.append([])

    def _stop_capture_and_write(self):
        """End the innermost capture and write out what it collected."""
        self._in_block = False
        # Tolerate an unbalanced call instead of raising IndexError.
        captured = self._data_stack.pop() if self._data_stack else []
        self._write(captured)

    # -- tag handlers ------------------------------------------------------

    start_h1, end_h1 = make_block_start_end_pair("h1")
    start_h2, end_h2 = make_block_start_end_pair("h2")
    start_h3, end_h3 = make_block_start_end_pair("h3")
    start_h4, end_h4 = make_block_start_end_pair("h4")
    start_h5, end_h5 = make_block_start_end_pair("h5")
    start_h6, end_h6 = make_block_start_end_pair("h6")
    start_h7, end_h7 = make_block_start_end_pair("h7")
    start_blockquote, end_blockquote = make_block_start_end_pair("bq")

    start_b, end_b = make_quicktag_start_end_pair("b", "*")
    start_strong, end_strong = make_quicktag_start_end_pair("strong", "*")
    start_i, end_i = make_quicktag_start_end_pair("i", "_")
    start_em, end_em = make_quicktag_start_end_pair("em", "_")
    start_cite, end_cite = make_quicktag_start_end_pair("cite", "??")
    start_s, end_s = make_quicktag_start_end_pair("s", "-")
    start_sup, end_sup = make_quicktag_start_end_pair("sup", "^")
    start_sub, end_sub = make_quicktag_start_end_pair("sub", "~")

    # Paragraphs are written without a "p. " prefix.  The original class
    # also created a factory-made start_p/end_p pair, but these definitions
    # overrode it, so the effective behaviour is unchanged.
    def start_p(self, attrs):
        self._start_capture("p")

    def end_p(self):
        self._stop_capture_and_write()
        self._write("\n\n")

    def start_ol(self, attrs):
        self._in_ol = True

    def end_ol(self):
        self._in_ol = False
        self._write("\n")

    def start_ul(self, attrs):
        self._in_ul = True

    def end_ul(self):
        self._in_ul = False
        self._write("\n")

    def start_li(self, attrs):
        if self._in_ol:
            self._write("# ")
        else:
            self._write("* ")
        self._start_capture("li")

    def end_li(self):
        self._stop_capture_and_write()
        self._write("\n")

    def start_a(self, attrs):
        # Anchors without an href are ignored; their text still passes through.
        self.a_href = attrs.get("href")
        if self.a_href:
            self._write(" \"")
            self._start_capture("a")

    def end_a(self):
        if self.a_href:
            self._stop_capture_and_write()
            self._write(["\":", self.a_href, " "])
            self.a_href = False

    def start_img(self, attrs):
        if attrs.get("src"):
            self._write([" !", attrs["src"], "! "])

    def end_img(self):
        pass

    def start_tr(self, attrs):
        pass

    def end_tr(self):
        self._write("|\n")

    def start_td(self, attrs):
        self._write("|")
        self._start_capture("td")

    def end_td(self):
        self._stop_capture_and_write()
        self._write("|")

    def start_br(self, attrs):
        self._write("\n")

    def unknown_starttag(self, tag, attrs):
        """Delete all other tags except for those specified in valid_tags.

        *attrs* is a sequence of ``(name, value)`` pairs; only attributes
        named in ``valid_attrs`` are written.
        """
        if tag in self.valid_tags:
            self._write(["<", tag])
            for k, v in attrs:
                if k in self.valid_attrs:
                    if v is None:
                        # Valueless attribute, e.g. <input disabled>.
                        v = k
                    self._write([" ", k, "=\"", _escape_attr(v), "\""])
            self._write(">")

    def unknown_endtag(self, tag):
        """Write the end tag for elements listed in valid_tags."""
        if tag in self.valid_tags:
            self._write(["</", tag, ">"])

    def _get_result(self):
        return "".join(self._result).strip()

    result = property(_get_result)


def html2textile(s):
    """Convert a snippet of HTML to Textile, a simple markup language.

    See http://www.textism.com/tools/textile/ for Textile's rules.

    >>> html2textile('<h1>Hello world!</h1><p>Welcome to my home page.</p>')
    'h1. Hello world!\\n\\nWelcome to my home page.'
    """
    parser = HtmlToTextileConvertingParser()
    parser.feed(s)
    parser.close()
    return parser.result


if __name__ == "__main__":
    sys.stdout.write(html2textile(sys.stdin.read()) + "\n")