# Tidy Atom feed documents in place:
#   * type="html" text constructs become plain text or type="xhtml" when the
#     escaped markup turns out to be well-formed XML;
#   * a common xml:base is factored out of the absolute http URIs of each
#     entry, and the matching cite/href/src attributes are made relative.

import re
from xml.dom import minidom, Node
from xml.parsers.expat import ExpatError

try:  # Python 2 module names
    from urlparse import urlparse, urlunparse
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 module names
    from urllib.parse import urlparse, urlunparse
    from html.entities import name2codepoint

try:  # Python 2 builtin
    unichr
except NameError:  # Python 3: chr() already returns text
    unichr = chr

# attributes whose values are treated as URIs
_URI_ATTRIBUTES = ('cite', 'href', 'src')

# entity names XML itself defines; they must stay escaped to keep the text
# parseable as XML
_XML_PREDEFINED = frozenset(['amp', 'lt', 'gt', 'apos', 'quot'])

_ENTITY_REFERENCE = re.compile(r'&(\w+);')

# wrapper used to test whether a text is a well-formed XML fragment; it also
# becomes the single child of a type="xhtml" construct
_XHTML_DIV = u'<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'

# markup is only converted to xhtml when the text is longer than this
_XHTML_THRESHOLD = 80


def _relative_reference(remainder):
    """Return `remainder` (a URI with the base prefix removed) as a safe
    relative reference.

    An empty remainder becomes '.', i.e. the base itself.  A remainder that
    starts with '/' or whose first path segment contains ':' is prefixed with
    './', otherwise it would be read as an absolute path or as a scheme.
    """
    if not remainder:
        return '.'
    first_segment = re.split(r'[/?#]', remainder, maxsplit=1)[0]
    if remainder.startswith('/') or ':' in first_segment:
        return './' + remainder
    return remainder


# select and apply an xml:base for this entry
class relativize:
    """Pick a common base URI for `parent` and its descendants and apply it.

    Instantiating the class does all the work: the cite, href and src
    attributes below `parent` are tallied, a base is selected, and, when one
    is found and `parent` has no xml:base attribute yet, the attributes are
    rewritten relative to that base and xml:base is set on `parent`.

    Attributes:
      score -- dict mapping a tuple (host, dir, dir, ...) to its score
      links -- list of the deepest directory tuple seen for each URI
      base  -- the selected base URI (ends with '/'), or None
    """

    def __init__(self, parent):
        self.score = {}
        self.links = []
        self.collect_and_tally(parent)
        self.base = self.select_optimal_base()
        if self.base and not parent.hasAttribute('xml:base'):
            self.rebase(parent)
            parent.setAttribute('xml:base', self.base)

    # collect and tally cite, href and src attributes
    def collect_and_tally(self, parent):
        """Tally every cite, href and src attribute of `parent` and,
        recursively, of its descendant elements."""
        for name in _URI_ATTRIBUTES:
            if parent.hasAttribute(name):
                self._tally(parent.getAttribute(name))

        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.collect_and_tally(node)

    def _tally(self, uri):
        """Credit every directory prefix of one http URI.

        Each prefix (host, host/dir, ...) gains its own length in points, so
        deeper prefixes earn more per URI.  URIs with another scheme are
        ignored.
        """
        parts = urlparse(uri)
        if parts[0].lower() != 'http':
            return
        segments = (parts[1] + parts[2]).split('/')
        base = None
        # the last segment (the document itself) is never part of a prefix
        for depth in range(1, len(segments)):
            base = tuple(segments[:depth])
            self.score[base] = self.score.get(base, 0) + len(base)
        # remember the deepest prefix of this URI
        if base and base not in self.links:
            self.links.append(base)

    # select the xml:base with the highest score
    def select_optimal_base(self):
        """Return the highest scoring base URI, or None if there is none.

        Prefixes recorded in self.links (the complete directory of at least
        one URI) have their score reset to zero first, so they cannot win.
        None is also returned when the winning score equals the length of the
        winning prefix, i.e. when only a single URI contributed to it.
        """
        if not self.score:
            return None
        for link in self.links:
            self.score[link] = 0
        winner = max(self.score.values())
        if not winner:
            return None
        for key in self.score:
            if self.score[key] == winner:
                if winner == len(key):
                    return None
                path = '/'.join(key[1:])
                return urlunparse(('http', key[0], path, '', '', '')) + '/'
        return None

    # rewrite cite, href and src attributes using this base
    def rebase(self, parent):
        """Make the cite, href and src attributes of `parent` and of its
        descendant elements relative to self.base.

        Every attribute is examined and rewritten on its own; values that do
        not start with self.base are left untouched.
        """
        for name in _URI_ATTRIBUTES:
            if not parent.hasAttribute(name):
                continue
            uri = parent.getAttribute(name)
            if uri.startswith(self.base):
                remainder = uri[len(self.base):]
                parent.setAttribute(name, _relative_reference(remainder))

        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.rebase(node)


def _expand_html_entities(text):
    """Replace named HTML entity references in `text` by the characters they
    stand for.  The five entities predefined by XML and unknown names are
    left as they are."""
    def expand(match):
        name = match.group(1)
        if name not in _XML_PREDEFINED and name in name2codepoint:
            return unichr(name2codepoint[name])
        return match.group(0)
    return _ENTITY_REFERENCE.sub(expand, text)


def _retype_html(node):
    """Convert one type="html" element to plain text or xhtml if possible.

    * no children: the type attribute is simply removed;
    * a single text child that parses as XML and contains no '<': the type
      attribute is removed and the text is unescaped;
    * a single text child that parses as XML, contains markup and is longer
      than _XHTML_THRESHOLD characters: the element becomes type="xhtml"
      with the parsed XHTML div as its only child;
    * anything else is left as html.
    """
    children = node.childNodes
    if not children:
        node.removeAttribute('type')
        return
    if len(children) != 1:
        return
    child = children[0]
    if child.nodeType not in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
        # nothing to unescape; avoid feeding a None nodeValue to the regex
        return

    # replace html entity defs with the characters themselves
    text = _expand_html_entities(child.nodeValue)

    try:
        # see if the resulting text is a well-formed XML fragment; format as
        # text first and encode afterwards so this works on Python 2 and 3
        data = minidom.parseString((_XHTML_DIV % text).encode('utf-8'))
    except ExpatError:
        # leave as html
        return

    if u'<' not in text:
        # plain text: without markup the div holds text nodes only
        node.removeAttribute('type')
        parts = [part.nodeValue for part in data.documentElement.childNodes]
        child.replaceWholeText(u''.join(parts))
    elif len(text) > _XHTML_THRESHOLD:
        # xhtml
        node.setAttribute('type', 'xhtml')
        node.removeChild(child)
        node.appendChild(data.documentElement)


# convert type="html" to type="plain" or type="xhtml" as appropriate
def retype(parent):
    """Walk the element tree below `parent`, converting type="html" elements
    where possible and recursing into all other elements.  When `parent` is
    an 'entry' element it is relativized once its children are processed."""
    for node in parent.childNodes:
        if node.nodeType != Node.ELEMENT_NODE:
            continue
        if node.hasAttribute('type') and node.getAttribute('type') == 'html':
            _retype_html(node)
        else:
            # recurse
            retype(node)

    if parent.nodeName == 'entry':
        relativize(parent)


if __name__ == '__main__':
    # run styler on each file mentioned on the command line
    import sys
    for feed in sys.argv[1:]:
        doc = minidom.parse(feed)
        doc.normalize()
        retype(doc.documentElement)
        # serialize before opening so a failure cannot truncate the feed;
        # toxml('utf-8') yields bytes, hence binary mode
        xml = doc.toxml('utf-8')
        with open(feed, 'wb') as output:
            output.write(xml)