planet/planet/shell/tmpl.py

from xml.sax.saxutils import escape
import sgmllib, time, os, sys, new, urlparse, re
from planet import config, feedparser
import htmltmpl

# Element names that feedparser's HTML processor lists as having no end tag.
voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
# Matches an opening tag whose name starts with one of those elements and that
# is immediately followed by its closing tag, e.g. '<br class="x"></br>'.
# Group 1 captures the tag name plus attributes; tmpl_mapper uses it to
# rewrite the pair as a self-closing '<br class="x" />'.
empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))

class stripHtml(sgmllib.SGMLParser):
    """Remove all tags from the data, keeping only the text content.

    The markup is parsed once in the constructor; text, entity references
    and character references are accumulated in ``self.result``.  Use
    ``str(instance)`` to obtain the stripped text (UTF-8 encoded when the
    accumulated result is unicode).
    """
    def __init__(self, data):
        sgmllib.SGMLParser.__init__(self)
        self.result=''
        if isinstance(data, str):
            # Byte strings are tried as UTF-8 first; bytes that do not decode
            # are parsed as-is.  Decoding is kept separate from feeding so a
            # failure while parsing can never cause the data to be fed twice.
            try:
                data = data.decode('utf-8')
            except UnicodeError:
                pass
        self.feed(data)
        self.close()
    def __str__(self):
        if isinstance(self.result, unicode):
            return self.result.encode('utf-8')
        return self.result
    def handle_entityref(self, ref):
        "resolve a named entity; unknown names are emitted back as '&name;'"
        import htmlentitydefs
        definition = htmlentitydefs.entitydefs.get(ref)
        if definition is not None:
            if len(definition)==1:
                # single character definition: convert via its ordinal
                self.result+=unichr(ord(definition))
                return
            if definition.startswith('&#') and definition.endswith(';'):
                # definition is itself a numeric character reference
                self.handle_charref(definition[2:-1])
                return
        # unknown or unusable definition: keep the reference as written
        self.result+='&%s;' % ref
    def handle_charref(self, ref):
        "resolve a numeric reference; undecodable ones are emitted as '&#ref;'"
        try:
            if ref[:1] in ('x', 'X'):
                # hexadecimal form, with either case of the prefix
                self.result+=unichr(int(ref[1:],16))
            else:
                self.result+=unichr(int(ref))
        except (ValueError, OverflowError):
            # not a number, or outside the range unichr accepts
            self.result+='&#%s;' % ref
    def handle_data(self, data):
        if data: self.result+=data

# Data format mappers

def String(value):
    """Return unicode values encoded as UTF-8; pass anything else through."""
    if not isinstance(value, unicode):
        return value
    return value.encode('utf-8')

def Plain(value):
    """Return value with its markup removed by stripHtml, as a string."""
    stripped = stripHtml(value)
    return str(stripped)

def PlanetDate(value):
    """Format a time tuple with the format from config.date_format()."""
    date_format = config.date_format()
    return time.strftime(date_format, value)

def NewDate(value):
    """Format a time tuple with the format from config.new_date_format()."""
    new_date_format = config.new_date_format()
    return time.strftime(new_date_format, value)

def Rfc822(value):
    """Format a time tuple as an RFC 822 style date with a fixed +0000 zone.

    The value is formatted as given; no time zone conversion is performed.
    """
    rfc822_format = "%a, %d %b %Y %H:%M:%S +0000"
    return time.strftime(rfc822_format, value)

def Rfc3399(value):
    """Format a time tuple as an ISO style timestamp with a fixed +00:00 offset.

    The value is formatted as given; no time zone conversion is performed.
    """
    iso_format = "%Y-%m-%dT%H:%M:%S+00:00"
    return time.strftime(iso_format, value)

# Map from FeedParser path to Planet tmpl names
#
# Each rule is a list: [template name, formatter, path element, ...].
# tmpl_mapper walks the path elements through the parsed feed data:
#   - a string is looked up as a key,
#   - an integer indexes into a sequence,
#   - a dict selects the first list entry whose keys all match its values.
# When the whole path resolves to a non-empty value, the formatter is applied
# and the result stored under the template name.  Rules are applied in order,
# so when two rules share a name the later one wins if it also resolves
# (e.g. 'url' prefers planet_http_location over the rel="self" link).
# Entries must stay lists: the channel_ rules are built by list concatenation.
Base = [
    ['author', String, 'author'],
    ['author_name', String, 'author_detail', 'name'],
    ['generator', String, 'generator'],
    ['id', String, 'id'],
    ['icon', String, 'icon'],
    ['last_updated_822', Rfc822, 'updated_parsed'],
    ['last_updated_iso', Rfc3399, 'updated_parsed'],
    ['last_updated', PlanetDate, 'updated_parsed'],
    ['link', String, 'link'],
    ['logo', String, 'logo'],
    ['rights', String, 'rights_detail', 'value'],
    ['subtitle', String, 'subtitle_detail', 'value'],
    ['title', String, 'title_detail', 'value'],
    ['title_plain', Plain, 'title_detail', 'value'],
    ['url', String, 'links', {'rel':'self'}, 'href'],
    ['url', String, 'planet_http_location'],
]

# Rules applied to each entry, in the form
# [template name, formatter, path element, ...].
# Rules are applied in order and a later rule with the same name overwrites an
# earlier one when both resolve: 'content' prefers content[0] over the
# summary, and the date* / new_date names prefer updated_parsed over
# published_parsed.
Items = [
    ['author', String, 'author'],
    ['author_email', String, 'author_detail', 'email'],
    ['author_name', String, 'author_detail', 'name'],
    ['author_uri', String, 'author_detail', 'href'],
    ['content_language', String, 'content', 0, 'language'],
    ['content', String, 'summary_detail', 'value'],
    ['content', String, 'content', 0, 'value'],
    ['date', PlanetDate, 'published_parsed'],
    ['date', PlanetDate, 'updated_parsed'],
    ['date_822', Rfc822, 'published_parsed'],
    ['date_822', Rfc822, 'updated_parsed'],
    ['date_iso', Rfc3399, 'published_parsed'],
    ['date_iso', Rfc3399, 'updated_parsed'],
    ['enclosure_href', String, 'links', {'rel': 'enclosure'}, 'href'],
    ['enclosure_length', String, 'links', {'rel': 'enclosure'}, 'length'],
    ['enclosure_type', String, 'links', {'rel': 'enclosure'}, 'type'],
    ['id', String, 'id'],
    ['link', String, 'links', {'rel': 'alternate'}, 'href'],
    # new_channel / new_date are pruned later by template_info so that they
    # only remain on items where the value changes from the previous item
    ['new_channel', String, 'source', 'id'],
    ['new_date', NewDate, 'published_parsed'],
    ['new_date', NewDate, 'updated_parsed'],
    ['rights', String, 'rights_detail', 'value'],
    ['title_language', String, 'title_detail', 'language'],
    ['title_plain', Plain, 'title_detail', 'value'],
    ['title', String, 'title_detail', 'value'],
    ['summary_language', String, 'summary_detail', 'language'],
    ['updated', PlanetDate, 'updated_parsed'],
    ['updated_822', Rfc822, 'updated_parsed'],
    ['updated_iso', Rfc3399, 'updated_parsed'],
    ['published', PlanetDate, 'published_parsed'],
    ['published_822', Rfc822, 'published_parsed'],
    ['published_iso', Rfc3399, 'published_parsed'],
]

# Expose every feed-level rule on items as well: the template name gains a
# 'channel_' prefix and the path is resolved beneath the entry's 'source'
for rule in Base:
    Items.append(['channel_%s' % rule[0], rule[1], 'source'] + rule[2:])

def _copy_planet_elements(node, output, prefix):
    "Copy planet_* elements of node into output; default <prefix>name to the title"
    for name,value in node.items():
        if name.startswith('planet_'):
            output[prefix + name[7:]] = String(value)
    # Fall back to the plain-text title only after every planet_ element has
    # been copied, so the result does not depend on dictionary ordering.
    if not output.get(prefix + 'name') and 'title_detail' in node:
        output[prefix + 'name'] = Plain(node.title_detail.value)

def tmpl_mapper(source, rules):
    """Apply specified rules to the source, and return a template dictionary

    source -- mapping produced by the feed parser (a feed, a source or an
              entry); 'value' nodes may be normalised in place, see below
    rules  -- list of rules of the form [name, formatter, path element, ...]

    Each rule's path is walked from source: a string is looked up as a key,
    an integer indexes into a sequence, and a dict selects the first list
    element whose keys all match.  A rule whose path cannot be followed is
    skipped.  When the path resolves to a non-empty value, formatter(value)
    is stored under name; later rules overwrite earlier ones of the same name.

    In addition, every planet_* element of source (and of source.source, with
    a 'channel_' prefix) is copied over, and 'name' / 'channel_name' default
    to the plain-text title when no non-empty planet_name is present.
    """
    output = {}

    for rule in rules:
        node = source
        for path in rule[2:]:
            if isinstance(path, str) and path in node:
                if path == 'value':
                    # Normalise the content in place.  Plain text is escaped
                    # and relabelled as HTML, which also keeps a later rule
                    # over the same node from escaping it a second time.
                    content_type = node.get('type','')
                    if content_type=='text/plain':
                        node['value'] = escape(node['value'])
                        node['type'] = 'text/html'
                    elif content_type=='application/xhtml+xml':
                        # collapse '<br></br>' style pairs into '<br />'
                        node['value'] = empty.sub(r"<\1 />", node['value'])
                node = node[path]
            elif isinstance(path, int):
                # an index past the end (e.g. an empty 'content' list) means
                # the rule does not apply, exactly like a missing key
                try:
                    node = node[path]
                except (IndexError, KeyError, TypeError):
                    break
            elif isinstance(path, dict):
                for test in node:
                    for key, value in path.items():
                        if test.get(key,None) != value: break
                    else:
                        node = test
                        break
                else:
                    break
            else:
                break
        else:
            # only reached when every path element was resolved
            if node: output[rule[0]] = rule[1](node)

    # copy over all planet namespaced elements from parent source
    _copy_planet_elements(source, output, '')

    # copy over all planet namespaced elements from child source element
    if 'source' in source:
        _copy_planet_elements(source.source, output, 'channel_')

    return output

def _end_planet_source(self):
    """End handler wired into feedparser for planet:source elements.

    Runs the regular source end handler, then moves the context's 'source'
    onto its 'sources' list (creating the list on first use).
    """
    self._end_source()
    ctx = self._getContext()
    if not ctx.has_key('sources'):
        ctx['sources'] = []
    finished = ctx.source
    ctx.sources.append(finished)
    del ctx['source']

def template_info(source):
    """ get template information from a feedparser output

    source -- whatever feedparser.parse accepts; it is passed through as is

    Returns a dictionary for htmltmpl containing the feed level values,
    'Channels' (one dictionary per planet:source, sorted by name), 'Items'
    (one dictionary per entry) and planet wide values taken from config.
    """

    # wire in support for planet:source, call feedparser, unplug planet:source
    mixin=feedparser._FeedParserMixin
    mixin._start_planet_source = mixin._start_source
    mixin._end_planet_source = \
        new.instancemethod(_end_planet_source, None, mixin)
    try:
        data=feedparser.parse(source)
    finally:
        # always unplug, even when parsing fails, so the temporary handlers
        # never stay attached to the shared feedparser class
        del mixin._start_planet_source
        del mixin._end_planet_source

    # apply rules to convert feed parser output to htmltmpl input
    output = {'Channels': [], 'Items': []}
    output.update(tmpl_mapper(data.feed, Base))
    sources = []
    for feed in data.feed.get('sources',[]):
        channel_info = tmpl_mapper(feed, Base)
        sources.append([channel_info.get('name'), channel_info])
    sources.sort()
    output['Channels'] = [info for name,info in sources]
    for entry in data.entries:
        output['Items'].append(tmpl_mapper(entry, Items))

    # synthesize isPermaLink attribute
    for item in output['Items']:
        if item.get('id') == item.get('link'):
            item['guid_isPermaLink']='true'
        else:
            item['guid_isPermaLink']='false'

    # feed level information
    output['generator'] = config.generator_uri()
    output['name'] = config.name()
    output['link'] = config.link()
    output['owner_name'] = config.owner_name()
    output['owner_email'] = config.owner_email()
    output['pubsubhubbub_hub'] = config.pubsubhubbub_hub()
    if config.feed():
        output['feed'] = config.feed()
        output['feedtype'] = config.feed().find('rss')>=0 and 'rss' or 'atom'

    # date/time information
    date = time.gmtime()
    output['date'] = PlanetDate(date)
    output['date_iso'] = Rfc3399(date)
    output['date_822'] = Rfc822(date)

    # remove new_dates and new_channels that aren't "new"
    date = channel = None
    for item in output['Items']:
        if 'new_date' in item:
            if item['new_date'] == date:
                del item['new_date']
            else:
                date = item['new_date']

        if 'new_channel' in item:
            # a channel repeated from the previous item is only kept when the
            # item also starts a new date
            if item['new_channel'] == channel and 'new_date' not in item:
                del item['new_channel']
            else:
                channel = item['new_channel']

    return output

def run(script, doc, output_file=None, options={}):
    """ process an HTMLTMPL file

    script      -- template file handed to htmltmpl's TemplateManager
    doc         -- feed document handed to template_info
    output_file -- when given, the rendered template is written to this
                   path and nothing is returned; otherwise the rendered
                   text is returned
    options     -- accepted as part of the call signature; not used here
    """
    manager = htmltmpl.TemplateManager()
    template = manager.prepare(script)
    tp = htmltmpl.TemplateProcessor(html_escape=0)
    for key,value in template_info(doc).items():
        tp.set(key, value)

    if not output_file:
        return tp.process(template)

    basename = os.path.basename(output_file)
    reluri = os.path.splitext(basename)[0]
    tp.set('url', urlparse.urljoin(config.link(),reluri))
    tp.set('fullurl', urlparse.urljoin(config.link(),basename))

    # Render before opening the file so that a template error does not leave
    # a truncated output file behind, and always release the file handle.
    rendered = tp.process(template)
    output = open(output_file, "w")
    try:
        output.write(rendered)
    finally:
        output.close()

if __name__ == '__main__':
    # put the parent of sys.path[0] at the front of the import path
    script_parent = os.path.dirname(sys.path[0])
    sys.path.insert(0, script_parent)

    # dump the template dictionary for each test file named on the command line
    for test in sys.argv[1:]:
        from pprint import pprint
        fixture = '/home/rubys/bzr/venus/tests/data/filter/tmpl/'+test
        pprint(template_info(fixture))