Implement an excerpt filter

2006-08-31 16:04:29 -04:00 · 2006-08-31 16:04:29 -04:00 · 954093f1bb
commit 954093f1bb
parent 357e47f3d6
11 changed files with 184 additions and 16 deletions
--- a/8
+++ b/8
@ -19,10 +19,10 @@ distribution.

        python runtests.py

-   This should take anywhere from a half a second to ten seconds to execute.
-   No network connection is required, and it cleans up after itself.  If it
-   completes with an "OK", you are good to go.  Otherwise stopping here and
-   inquiring on the mailing list is a good idea as it can save you lots of
+   This should take anywhere from one to ten seconds to execute.  No network
+   connection is required, and the script cleans up after itself.  If the
+   script completes with an "OK", you are good to go.  Otherwise stopping here
+   and inquiring on the mailing list is a good idea as it can save you lots of
   frustration down the road.

 iii.
--- a/filters/excerpt.py
+++ b/filters/excerpt.py
@ -0,0 +1,103 @@
+"""
+Generate an excerpt from either the summary or the content of an entry.
+
+Parameters:
+  width:  maximum number of characters in the excerpt.  Default: 500
+  omit:   whitespace delimited list of html tags to remove.  Default: none
+
+Notes:
+ * if 'img' is in the list of tags to be omitted <img> tags are replaced with
+   hypertext links associated with the value of the 'alt' attribute.  If there
+   is no alt attribute value, <img> is used instead.  If the parent element
+   of the img tag is already an <a> tag, no additional hypertext links are
+   added.
+"""
+
+import sys, xml.dom.minidom, textwrap
+from xml.dom import Node, minidom
+
+# namespaces: Atom for the input entry, planet for the generated excerpt
+atomNS = 'http://www.w3.org/2005/Atom'
+planetNS = 'http://planet.intertwingly.net'
+
+# the command line is a flat list of "--name value" pairs; fold it into a
+# dictionary keyed by the option name with its leading dashes removed
+args = {}
+for name, value in zip(sys.argv[1::2], sys.argv[2::2]):
+    args[name.lstrip('-')] = value
+
+# width bounds the excerpt text; omit lists the tags that are to be dropped
+omit = args.get('omit', '').split()
+wrapper = textwrap.TextWrapper(width=int(args.get('width','500')))
+
+class copy:
+    """ recursively copy a source to a target, up to a given width
+
+    Instantiating the class performs the copy.  Text is collected in
+    stripped form as it is copied; once the collected text no longer fits
+    on a single line of the module-level 'wrapper', the text being copied
+    is cut short, an ellipsis is appended, and copying stops.  Elements
+    named in the module-level 'omit' list are dropped, but their children
+    are still copied.
+
+    Arguments:
+      dom:     document used to create the new nodes
+      source:  node whose children are copied
+      target:  node that receives the copies
+    """
+
+    def __init__(self, dom, source, target):
+        self.dom = dom
+        self.full = False   # True once the excerpt has been cut short
+        self.text = []      # stripped text fragments copied so far
+        self.textlen = 0    # length of the wrapped text copied so far
+        self.copyChildren(source, target)
+
+    def copyChildren(self, source, target):
+        """ copy child nodes of a source to the target """
+        for child in source.childNodes:
+            if child.nodeType == Node.ELEMENT_NODE:
+                self.copyElement(child, target)
+            elif child.nodeType == Node.TEXT_NODE:
+                self.copyText(child.data, target)
+            # nodes of any other type are not copied
+            if self.full: break
+
+    def copyElement(self, source, target):
+        """ copy source element to the target """
+
+        # check the omit list
+        if source.nodeName in omit:
+            if source.nodeName == 'img':
+                return self.elideImage(source, target)
+            # drop the tag itself, but keep whatever it contains
+            return self.copyChildren(source, target)
+
+        # copy element, attributes, and children
+        child = self.dom.createElementNS(source.namespaceURI, source.nodeName)
+        target.appendChild(child)
+        for i in range(0, source.attributes.length):
+            attr = source.attributes.item(i)
+            child.setAttributeNS(attr.namespaceURI, attr.name, attr.value)
+        self.copyChildren(source, child)
+
+    def elideImage(self, source, target):
+        """ copy an elided form of the image element to the target """
+        alt = source.getAttribute('alt') or '<img>'
+        src = source.getAttribute('src')
+
+        if target.nodeName == 'a' or not src:
+            # already inside a link, or nothing to link to: text only
+            self.copyText(alt, target)
+        else:
+            child = self.dom.createElement('a')
+            child.setAttribute('href', src)
+            self.copyText(alt, child)
+            target.appendChild(child)
+
+    def copyText(self, source, target):
+        """ copy text to the target, until the point where it would wrap """
+        stripped = source.strip()
+        if stripped:
+            self.text.append(stripped)
+        lines = wrapper.wrap(' '.join(self.text))
+        if len(lines) <= 1:
+            # everything seen so far still fits on one line; lines is empty
+            # when only whitespace has been seen, which must not be treated
+            # as an overflow (there is no lines[0] to measure)
+            target.appendChild(self.dom.createTextNode(source))
+            if lines:
+                self.textlen = len(lines[0])
+        else:
+            # keep only as much of this text as completes the first line,
+            # then mark the truncation with an ellipsis
+            excerpt = source[:len(lines[0])-self.textlen] + u' \u2026'
+            # use the document handed to the constructor, rather than
+            # relying on a module-level 'dom' happening to exist
+            target.appendChild(self.dom.createTextNode(excerpt))
+            self.full = True
+
+# select summary or content element
+# (the parsed document is bound to the module-level name 'dom')
+dom = minidom.parse(sys.stdin)
+source = dom.getElementsByTagNameNS(atomNS, 'summary')
+if not source:
+    # no atom:summary found in the document: fall back to atom:content
+    source = dom.getElementsByTagNameNS(atomNS, 'content')
+
+# if present, recursively copy it to a planet:excerpt element
+if source:
+    # declare the 'planet' prefix on the root element, since the element
+    # created below uses it in its name
+    dom.documentElement.setAttribute('xmlns:planet', planetNS)
+    target = dom.createElementNS(planetNS, 'planet:excerpt')
+    # the excerpt is appended to the parent of the first matching element
+    source[0].parentNode.appendChild(target)
+    copy(dom, source[0], target)
+
+# print out results
+# (the document is written even when no excerpt was added)
+print dom.toxml('utf-8')
--- a/planet/shell/init.py
+++ b/planet/shell/init.py
@ -29,18 +29,18 @@ def run(template_file, doc, mode='template'):
    try:
        module = __import__(module_name)
    except Exception, inst:
-        print module_name
        return log.error("Skipping %s '%s' after failing to load '%s': %s", 
            mode, template_resolved, module_name, inst)

    # Execute the shell module
+    options = planet.config.template_options(template_file)
    if mode == 'filter':
        log.debug("Processing filer %s using %s", template_resolved,
            module_name)
-        return module.run(template_resolved, doc, None)
+        return module.run(template_resolved, doc, None, options)
    else:
        log.info("Processing template %s using %s", template_resolved,
            module_name)
        output_dir = planet.config.output_dir()
        output_file = os.path.join(output_dir, base)
-        module.run(template_resolved, doc, output_file)
+        module.run(template_resolved, doc, output_file, options)
--- a/planet/shell/py.py
+++ b/planet/shell/py.py
@ -1,6 +1,6 @@
 from subprocess import Popen, PIPE

-def run(script, doc, output_file=None):
+def run(script, doc, output_file=None, options={}):
    """ process an Python script """

    if output_file:
@ -8,9 +8,14 @@ def run(script, doc, output_file=None):
    else:
        out = PIPE

-    proc = Popen(['python', script], stdin=PIPE, stdout=out, stderr=PIPE)
+    options = sum([['--'+key, value] for key,value in options.items()], [])
+
+    proc = Popen(['python', script] + options,
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
    stdout, stderr = proc.communicate(doc)
    if stderr:
-        print stderr
+        import planet
+        planet.logger.error(stderr)

    return stdout
--- a/planet/shell/tmpl.py
+++ b/planet/shell/tmpl.py
@ -221,7 +221,7 @@ def template_info(source):

    return output

-def run(script, doc, output_file=None):
+def run(script, doc, output_file=None, options={}):
    """ process an HTMLTMPL file """
    manager = htmltmpl.TemplateManager()
    template = manager.prepare(script)
--- a/planet/shell/xslt.py
+++ b/planet/shell/xslt.py
@ -1,6 +1,6 @@
 import os

-def run(script, doc, output_file=None):
+def run(script, doc, output_file=None, options={}):
    """ process an XSLT stylesheet """

    try:
--- a/tests/data/filter/excerpt-images.ini
+++ b/tests/data/filter/excerpt-images.ini
@ -0,0 +1,5 @@
+[Planet]
+filters = excerpt.py
+
+[excerpt.py]
+omit = img
--- a/tests/data/filter/excerpt-images.xml
+++ b/tests/data/filter/excerpt-images.xml
@ -0,0 +1,10 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+  <summary><div xmlns="http://wwww.w3.org/1999/xhtml">before
+
+<img src="inner" alt="bar"/>
+<a href="outer1"><img src="foo" alt="bar"/></a>
+<a href="outer2"><img src="foo"/></a>
+
+after</div></summary>
+</entry>
+
--- a/tests/data/filter/excerpt-lorem-ipsum.ini
+++ b/tests/data/filter/excerpt-lorem-ipsum.ini
@ -0,0 +1,6 @@
+[Planet]
+filters = excerpt.py
+
+[excerpt.py]
+width = 100
+omit = p
--- a/tests/data/filter/excerpt-lorem-ipsum.xml
+++ b/tests/data/filter/excerpt-lorem-ipsum.xml
@ -0,0 +1,8 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+  <summary><div xmlns="http://wwww.w3.org/1999/xhtml"><p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nullam velit. Vivamus tincidunt, erat in rutrum fringilla, urna urna nonummy turpis, et lobortis eros dolor eu dui. Pellentesque vitae lorem. Sed lobortis arcu accumsan sapien. Pellentesque eget nulla et justo mollis mattis. Nulla dictum est eleifend nisl. Pellentesque ultricies ligula vel arcu. Ut ac mi in felis porta tristique. Donec cursus mollis ipsum. Maecenas nonummy.</p>
+
+<p>Sed posuere. Phasellus pellentesque mattis mauris. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos hymenaeos. Ut ullamcorper consequat eros. Morbi hendrerit faucibus felis. Pellentesque odio eros, bibendum eget, ultrices ac, tempus quis, diam. Donec posuere, ligula eget sodales tristique, enim nunc faucibus nibh, luctus sagittis elit orci a nulla. Nulla scelerisque. In hac habitasse platea dictumst. Etiam vel nisl quis mauris metus.</p>
+
+<p>Vivamus nonummy, justo at malesuada mollis, nisi purus fermentum neque, a faucibus dolor lorem at sem. Nunc quam nulla, lobortis sed, vehicula at, elementum volutpat.</p></div></summary>
+</entry>
+
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@ -1,15 +1,46 @@
 #!/usr/bin/env python

 import unittest, xml.dom.minidom
-from planet import shell
-
-testfile = 'tests/data/filter/coral_cdn.xml'
-filter = 'coral_cdn_filter.py'
+from planet import shell, config

 class FilterTests(unittest.TestCase):

    def test_coral_cdn(self):
+        # run one explicitly named filter over a fixture and check the src
+        # attribute of the first img element in the output
+        testfile = 'tests/data/filter/coral_cdn.xml'
+        filter = 'coral_cdn_filter.py'
+
        output = shell.run(filter, open(testfile).read(), mode="filter")
        dom = xml.dom.minidom.parseString(output)
        imgsrc = dom.getElementsByTagName('img')[0].getAttribute('src')
        self.assertEqual('http://example.com.nyud.net:8080/foo.png', imgsrc)
+
+    def test_excerpt_images(self):
+        # excerpt-images.ini configures excerpt.py with omit = img
+        testfile = 'tests/data/filter/excerpt-images.xml'
+        config.load('tests/data/filter/excerpt-images.ini')
+
+        # pipe the entry through every configured filter, in order
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        # collect the href and the last child's text of each link found in
+        # the generated excerpt
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('planet:excerpt')[0]
+        anchors = excerpt.getElementsByTagName('a')
+        hrefs = [a.getAttribute('href') for a in anchors]
+        texts = [a.lastChild.nodeValue for a in anchors]
+
+        # the bare image gains a link to its src; images already inside a
+        # link keep that link; a missing alt is rendered as '<img>'
+        self.assertEqual(['inner','outer1','outer2'], hrefs)
+        self.assertEqual(['bar','bar','<img>'], texts)
+
+    def test_excerpt_lorem_ipsum(self):
+        # excerpt-lorem-ipsum.ini configures excerpt.py with width = 100
+        # and omit = p
+        testfile = 'tests/data/filter/excerpt-lorem-ipsum.xml'
+        config.load('tests/data/filter/excerpt-lorem-ipsum.ini')
+
+        # pipe the entry through every configured filter, in order
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        # the first text node under the excerpt's first child is expected
+        # to be cut short and to end with an ellipsis
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('planet:excerpt')[0]
+        self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
+            u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
+            u'in \u2026', excerpt.firstChild.firstChild.nodeValue)