# Viewer metadata: 125 lines, 4.2 KiB, Python
import re
from xml.dom import minidom, Node
from xml.parsers.expat import ExpatError

try:  # Python 2 module names
    from urlparse import urlparse, urlunparse
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 moved both modules
    from urllib.parse import urlparse, urlunparse
    from html.entities import name2codepoint

class relativize:
    """Select and apply an xml:base for this entry.

    Every http ``cite``, ``href`` and ``src`` attribute on *parent* and its
    descendant elements is tallied by its leading path prefixes.  When a
    shared prefix wins, each of those attributes that starts with the
    resulting base URI is rewritten relative to it and the base is stored
    as ``xml:base`` on *parent*.  A *parent* that already has an
    ``xml:base`` attribute is left untouched.

    After construction, ``self.base`` holds the chosen base URI or None.
    """

    # attributes whose values are treated as URIs
    uri_attributes = ('cite', 'href', 'src')

    def __init__(self, parent):
        self.score = {}  # prefix tuple (host, segment, ...) -> accumulated score
        self.links = []  # for each tallied URI: everything but its last segment
        self.collect_and_tally(parent)
        self.base = self.select_optimal_base()
        if self.base and not parent.hasAttribute('xml:base'):
            self.rebase(parent)
            parent.setAttribute('xml:base', self.base)

    # collect and tally cite, href and src attributes
    def collect_and_tally(self, parent):
        """Tally the URI attributes of *parent*, then recurse into child elements.

        Each attribute is tallied on its own, so an element that carries
        more than one of them contributes every URI rather than just one.
        """
        for name in self.uri_attributes:
            if parent.hasAttribute(name):
                self._tally(parent.getAttribute(name))

        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.collect_and_tally(node)

    def _tally(self, uri):
        """Credit every proper leading prefix of one http URI."""
        if not uri:
            return
        parts = urlparse(uri)
        if parts[0].lower() != 'http':
            return

        # host followed by the path segments
        segments = (parts[1] + parts[2]).split('/')
        base = None
        for i in range(1, len(segments)):
            base = tuple(segments[0:i])
            # a prefix earns its own length each time a URI shares it
            self.score[base] = self.score.get(base, 0) + len(base)
        if base and base not in self.links:
            self.links.append(base)

    # select the xml:base with the highest score
    def select_optimal_base(self):
        """Return the best scoring prefix as an http URI ending in '/', or None.

        None is returned when nothing was tallied, when no prefix has a
        positive score, or when the winning score equals the length of the
        winning prefix (only a single URI contributed to it).
        """
        if not self.score:
            return None

        # prefixes recorded in self.links are taken out of the running
        for link in self.links:
            self.score[link] = 0

        winner = max(self.score.values())
        if not winner:
            return None

        for key, value in self.score.items():
            if value == winner:
                if winner == len(key):
                    return None
                path = '/'.join(key[1:])
                return urlunparse(('http', key[0], path, '', '', '')) + '/'

    # rewrite cite, href and src attributes using this base
    def rebase(self, parent):
        """Make the URI attributes of *parent* and its descendants relative.

        Each attribute is rewritten from its own value; a value equal to
        the base itself becomes '.'.  Values that do not start with the
        base are left alone.
        """
        for name in self.uri_attributes:
            if parent.hasAttribute(name):
                uri = parent.getAttribute(name)
                if uri.startswith(self.base):
                    parent.setAttribute(name, uri[len(self.base):] or '.')

        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.rebase(node)

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr already returns a text character

_ENTITY_REF = re.compile(r'&(\w+);')
_XML_ENTITIES = ('amp', 'lt', 'gt', 'apos', 'quot')
_XHTML_DIV = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'


def _expand_html_entities(text):
    """Replace HTML named entities in *text* with the characters they name.

    The five predefined XML entities and any unrecognised names are kept
    as entity references, so the result can still be handed to an XML
    parser.
    """
    chunks = _ENTITY_REF.split(text)
    # the split keeps the captured entity names at the odd indexes
    for i in range(1, len(chunks), 2):
        name = chunks[i]
        if name not in _XML_ENTITIES and name in name2codepoint:
            chunks[i] = _unichr(name2codepoint[name])
        else:
            chunks[i] = '&' + name + ';'
    return u"".join(chunks)


# convert type="html" to type="plain" or type="xhtml" as appropriate
def retype(parent):
    """Downgrade or upgrade ``type="html"`` children of *parent* in place.

    For each child element whose ``type`` attribute is ``html``:

    * no children: the ``type`` attribute is removed;
    * exactly one text (or CDATA) child: HTML entities are expanded and the
      text is parsed as the content of an XHTML ``div``.  If it is
      well-formed and contains no ``<``, the ``type`` attribute is removed
      and the text is replaced by its parsed value.  If it is well-formed,
      contains markup and is longer than 80 characters, ``type`` becomes
      ``xhtml`` and the text is replaced by the parsed ``div``.  Otherwise
      the element is left as html;
    * anything else (several children, or a single child that is not text)
      is left as html.

    Child elements that are not ``type="html"`` are processed recursively.
    Finally, when *parent* is named ``entry``, ``relativize`` is applied
    to it.
    """
    for node in parent.childNodes:
        if node.nodeType != Node.ELEMENT_NODE:
            continue

        if not (node.hasAttribute('type') and node.getAttribute('type') == 'html'):
            # recurse
            retype(node)
            continue

        if not node.childNodes:
            node.removeAttribute('type')
            continue
        if len(node.childNodes) != 1:
            continue

        child = node.childNodes[0]
        if child.nodeType not in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            # an element or comment has no text to convert; leave as html
            continue

        # replace html entity defs with utf-8
        text = _expand_html_entities(child.nodeValue)

        try:
            # see if the resulting text is a well-formed XML fragment
            data = minidom.parseString((_XHTML_DIV % text).encode('utf-8'))
        except ExpatError:
            # leave as html
            continue

        if text.find('<') < 0:
            # plain text
            node.removeAttribute('type')
            child.replaceWholeText(data.documentElement.childNodes[0].nodeValue)
        elif len(text) > 80:
            # xhtml
            node.setAttribute('type', 'xhtml')
            node.removeChild(child)
            node.appendChild(data.documentElement)

    if parent.nodeName == 'entry':
        relativize(parent)

if __name__ == '__main__':

    # run styler on each file mentioned on the command line
    import sys

    for feed in sys.argv[1:]:
        doc = minidom.parse(feed)
        doc.normalize()
        retype(doc.documentElement)

        # serialize before reopening the file, so that a failure here
        # cannot leave the input truncated
        xml = doc.toxml('utf-8')

        # toxml('utf-8') returns encoded bytes: write them in binary mode
        # and let the with-block close the handle
        with open(feed, 'wb') as output:
            output.write(xml)