lkd-planet/DJAGEN/branches/oguz/djagen/gezegen/planet/atomstyler.py

from xml.dom import minidom, Node
from urlparse import urlparse, urlunparse
from xml.parsers.expat import ExpatError
from htmlentitydefs import name2codepoint
import re

# select and apply an xml:base for this entry
class relativize:
  """Choose a common http base for an element tree and relativize its links.

  Instantiating the class does all the work: the cite/href/src attributes
  below `parent` are tallied, a base URI is selected and, if `parent` does
  not already carry an xml:base attribute, matching attribute values are
  rewritten relative to that base and xml:base is set on `parent`.

  Attributes left on the instance:
    score -- dict mapping a tuple of (host, path segment, ...) to its tally
    links -- list of the longest prefix tuple recorded for each tallied URI
    base  -- the selected base URI (ends with '/'), or None
  """

  # attributes that may hold a URI; when an element has several, the last
  # one present in this order is the one that gets tallied
  _uri_attributes = ('cite', 'href', 'src')

  def __init__(self, parent):
    self.score = {}
    self.links = []
    self.collect_and_tally(parent)
    self.base = self.select_optimal_base()
    # an existing xml:base is never overridden, and nothing is rewritten
    if self.base and not parent.hasAttribute('xml:base'):
      self.rebase(parent)
      parent.setAttribute('xml:base', self.base)

  # collect and tally cite, href and src attributes
  def collect_and_tally(self, parent):
    """Recursively score every host/path prefix of the http URIs found."""
    uri = None
    for name in self._uri_attributes:
      if parent.hasAttribute(name):
        uri = parent.getAttribute(name)

    if uri:
      parts = urlparse(uri)
      if parts[0].lower() == 'http':
        # host followed by the path segments; the final segment is never
        # part of a prefix because the range stops one short
        segments = (parts[1] + parts[2]).split('/')
        base = None
        for i in range(1, len(segments)):
          base = tuple(segments[:i])
          # each occurrence adds the prefix length to that prefix's score
          self.score[base] = self.score.get(base, 0) + len(base)
        # remember the longest prefix seen for this URI
        if base and base not in self.links:
          self.links.append(base)

    for node in parent.childNodes:
      if node.nodeType == Node.ELEMENT_NODE:
        self.collect_and_tally(node)

  # select the xml:base with the highest score
  def select_optimal_base(self):
    """Return the best scoring base URI (with trailing '/'), or None."""
    if not self.score:
      return None
    # the longest prefix of each URI is taken out of the running
    for link in self.links:
      self.score[link] = 0
    winner = max(self.score.values())
    if not winner:
      return None
    for key in self.score.keys():
      if self.score[key] == winner:
        # a score equal to the prefix length means it was seen only once
        if winner == len(key):
          return None
        return urlunparse(('http', key[0], '/'.join(key[1:]), '', '', '')) + '/'
    return None

  # rewrite cite, href and src attributes using this base
  def rebase(self, parent):
    """Recursively strip self.base from cite, href and src attribute values.

    Every attribute is examined and rewritten on its own, so an element
    carrying several of them keeps each value distinct.  A value equal to
    the base itself becomes '.'.
    """
    for name in self._uri_attributes:
      if parent.hasAttribute(name):
        uri = parent.getAttribute(name)
        if uri.startswith(self.base):
          parent.setAttribute(name, uri[len(self.base):] or '.')

    for node in parent.childNodes:
      if node.nodeType == Node.ELEMENT_NODE:
        self.rebase(node)

# convert type="html" to type="plain" or type="xhtml" as appropriate
def retype(parent):
  """Simplify type="html" elements below `parent`, in place.

  For each child element of `parent`:
    * without type="html": recurse into it;
    * type="html" and no children: the type attribute is removed;
    * type="html" and exactly one text/CDATA child: named HTML entities are
      replaced by the characters they stand for and the result is parsed as
      an XML fragment.  If it is well formed and contains no '<', the type
      attribute is removed and the text is stored decoded; if it contains
      markup and is longer than 80 characters, the element becomes
      type="xhtml" wrapping the parsed <div>.  Anything else is left as
      html.

  Finally, when `parent` itself is named 'entry', relativize() is applied
  to it.
  """
  for node in parent.childNodes:
    if node.nodeType != Node.ELEMENT_NODE:
      continue

    if not (node.hasAttribute('type') and node.getAttribute('type') == 'html'):
      # recurse
      retype(node)
      continue

    if len(node.childNodes) == 0:
      node.removeAttribute('type')
      continue
    if len(node.childNodes) != 1:
      continue

    child = node.childNodes[0]
    if child.nodeType not in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
      # only character data can be converted (other node types have no
      # usable nodeValue / replaceWholeText); leave as html
      continue

    # replace html entity defs with the characters they name; the odd
    # indexes of the split result are the captured entity names
    chunks = re.split(r'&(\w+);', child.nodeValue)
    for i in range(1, len(chunks), 2):
      name = chunks[i]
      if name not in ('amp', 'lt', 'gt', 'apos', 'quot') and name in name2codepoint:
        chunks[i] = unichr(name2codepoint[name])
      else:
        # XML's own entities and unknown names are kept as written
        chunks[i] = '&' + name + ';'
    text = u"".join(chunks)

    try:
      # see if the resulting text is a well-formed XML fragment
      div = u'<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
      data = minidom.parseString((div % text).encode('utf-8'))
    except ExpatError:
      # leave as html
      continue

    if text.find('<') < 0:
      # plain text
      node.removeAttribute('type')
      child.replaceWholeText(data.documentElement.childNodes[0].nodeValue)

    elif len(text) > 80:
      # xhtml
      node.setAttribute('type', 'xhtml')
      node.removeChild(child)
      node.appendChild(data.documentElement)

  if parent.nodeName == 'entry':
    relativize(parent)

if __name__ == '__main__':

  # run styler on each file mentioned on the command line; every file is
  # parsed, restyled and then overwritten in place
  import sys
  for feed in sys.argv[1:]:
    doc = minidom.parse(feed)
    doc.normalize()
    retype(doc.documentElement)
    # serialize before opening the file for writing, so that a failure
    # while generating the XML cannot leave the input truncated
    xml = doc.toxml('utf-8')
    output = open(feed, 'w')
    try:
      output.write(xml)
    finally:
      output.close()
DJAGEN project migrates to LKD's svn. 2010-07-06 19:25:42 +03:00			`from xml.dom import minidom, Node`
			`from urlparse import urlparse, urlunparse`
			`from xml.parsers.expat import ExpatError`
			`from htmlentitydefs import name2codepoint`
			`import re`

			`# select and apply an xml:base for this entry`
			`class relativize:`
			`def __init__(self, parent):`
			`self.score = {}`
			`self.links = []`
			`self.collect_and_tally(parent)`
			`self.base = self.select_optimal_base()`
			`if self.base:`
			`if not parent.hasAttribute('xml:base'):`
			`self.rebase(parent)`
			`parent.setAttribute('xml:base', self.base)`

			`# collect and tally cite, href and src attributes`
			`def collect_and_tally(self,parent):`
			`uri = None`
			`if parent.hasAttribute('cite'): uri=parent.getAttribute('cite')`
			`if parent.hasAttribute('href'): uri=parent.getAttribute('href')`
			`if parent.hasAttribute('src'): uri=parent.getAttribute('src')`

			`if uri:`
			`parts=urlparse(uri)`
			`if parts[0].lower() == 'http':`
			`parts = (parts[1]+parts[2]).split('/')`
			`base = None`
			`for i in range(1,len(parts)):`
			`base = tuple(parts[0:i])`
			`self.score[base] = self.score.get(base,0) + len(base)`
			`if base and base not in self.links: self.links.append(base)`

			`for node in parent.childNodes:`
			`if node.nodeType == Node.ELEMENT_NODE:`
			`self.collect_and_tally(node)`

			`# select the xml:base with the highest score`
			`def select_optimal_base(self):`
			`if not self.score: return None`
			`for link in self.links:`
			`self.score[link] = 0`
			`winner = max(self.score.values())`
			`if not winner: return None`
			`for key in self.score.keys():`
			`if self.score[key] == winner:`
			`if winner == len(key): return None`
			`return urlunparse(('http', key[0], '/'.join(key[1:]), '', '', '')) + '/'`

			`# rewrite cite, href and src attributes using this base`
			`def rebase(self,parent):`
			`uri = None`
			`if parent.hasAttribute('cite'): uri=parent.getAttribute('cite')`
			`if parent.hasAttribute('href'): uri=parent.getAttribute('href')`
			`if parent.hasAttribute('src'): uri=parent.getAttribute('src')`
			`if uri and uri.startswith(self.base):`
			`uri = uri[len(self.base):] or '.'`
			`if parent.hasAttribute('href'): uri=parent.setAttribute('href', uri)`
			`if parent.hasAttribute('src'): uri=parent.setAttribute('src', uri)`

			`for node in parent.childNodes:`
			`if node.nodeType == Node.ELEMENT_NODE:`
			`self.rebase(node)`

			`# convert type="html" to type="plain" or type="xhtml" as appropriate`
			`def retype(parent):`
			`for node in parent.childNodes:`
			`if node.nodeType == Node.ELEMENT_NODE:`

			`if node.hasAttribute('type') and node.getAttribute('type') == 'html':`
			`if len(node.childNodes)==0:`
			`node.removeAttribute('type')`
			`elif len(node.childNodes)==1:`

			`# replace html entity defs with utf-8`
			`chunks=re.split('&(\w+);', node.childNodes[0].nodeValue)`
			`for i in range(1,len(chunks),2):`
			`if chunks[i] in ['amp', 'lt', 'gt', 'apos', 'quot']:`
			`chunks[i] ='&' + chunks[i] +';'`
			`elif chunks[i] in name2codepoint:`
			`chunks[i]=unichr(name2codepoint[chunks[i]])`
			`else:`
			`chunks[i]='&' + chunks[i] + ';'`
			`text = u"".join(chunks)`

			`try:`
			`# see if the resulting text is a well-formed XML fragment`
			`div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'`
			`data = minidom.parseString((div % text.encode('utf-8')))`

			`if text.find('<') < 0:`
			`# plain text`
			`node.removeAttribute('type')`
			`text = data.documentElement.childNodes[0].nodeValue`
			`node.childNodes[0].replaceWholeText(text)`

			`elif len(text) > 80:`
			`# xhtml`
			`node.setAttribute('type', 'xhtml')`
			`node.removeChild(node.childNodes[0])`
			`node.appendChild(data.documentElement)`

			`except ExpatError:`
			`# leave as html`
			`pass`

			`else:`
			`# recurse`
			`retype(node)`

			`if parent.nodeName == 'entry':`
			`relativize(parent)`

			`if __name__ == '__main__':`

			`# run styler on each file mention on the command line`
			`import sys`
			`for feed in sys.argv[1:]:`
			`doc = minidom.parse(feed)`
			`doc.normalize()`
			`retype(doc.documentElement)`
			`open(feed,'w').write(doc.toxml('utf-8'))`