Implement an excerpt filter

This commit is contained in:
Sam Ruby 2006-08-31 16:04:29 -04:00
parent 357e47f3d6
commit 954093f1bb
11 changed files with 184 additions and 16 deletions

View File

@ -19,10 +19,10 @@ distribution.
python runtests.py
This should take anywhere from a half a second to ten seconds to execute.
No network connection is required, and it cleans up after itself. If it
completes with an "OK", you are good to go. Otherwise stopping here and
inquiring on the mailing list is a good idea as it can save you lots of
This should take anywhere from one to ten seconds to execute. No network
connection is required, and the script cleans up after itself. If the
script completes with an "OK", you are good to go. Otherwise stopping here
and inquiring on the mailing list is a good idea as it can save you lots of
frustration down the road.
iii.

103
filters/excerpt.py Normal file
View File

@ -0,0 +1,103 @@
"""
Generate an excerpt from either the summary or the content of an entry.
Parameters:
width: maximum number of characters in the excerpt. Default: 500
omit: whitespace delimited list of html tags to remove. Default: none
Notes:
* if 'img' is in the list of tags to be omitted <img> tags are replaced with
hypertext links associated with the value of the 'alt' attribute. If there
is no alt attribute value, <img> is used instead. If the parent element
of the img tag is already an <a> tag, no additional hypertext links are
added.
"""
import sys, xml.dom.minidom, textwrap
from xml.dom import Node, minidom
# namespaces of the Atom input and of the planet:excerpt element that is added
atomNS = 'http://www.w3.org/2005/Atom'
planetNS = 'http://planet.intertwingly.net'

# the command line is read as "--name value" pairs: every odd argument is a
# name (leading dashes removed), every even argument is its value
args = {}
for flag, value in zip(sys.argv[1::2], sys.argv[2::2]):
    args[flag.lstrip('-')] = value

# text is measured against this wrapper; the excerpt ends where it would wrap
width = int(args.get('width', '500'))
wrapper = textwrap.TextWrapper(width=width)

# names of the elements that are not to be copied into the excerpt
omit = args.get('omit', '').split()
class copy:
    """ recursively copy a source to a target, up to a given width

    Text is accumulated over the whole tree and measured with the
    module-level 'wrapper'.  As soon as the accumulated text no longer fits
    on a single wrapped line, the current text is cut short, an ellipsis is
    appended, and copying stops.  Elements named in the module-level 'omit'
    list are not copied: <img> is replaced by an elided form, any other
    omitted element is replaced by its children.
    """

    def __init__(self, dom, source, target):
        """ copy the children of source into target

        dom:    document used to create every new node
        source: node whose children are copied
        target: node that receives the copies
        """
        self.dom = dom
        self.full = False   # set once the excerpt has been cut short
        self.text = []      # stripped text fragments copied so far
        self.textlen = 0    # length of the single wrapped line copied so far
        self.copyChildren(source, target)

    def copyChildren(self, source, target):
        """ copy child nodes of a source to the target """
        for child in source.childNodes:
            if child.nodeType == Node.ELEMENT_NODE:
                self.copyElement(child, target)
            elif child.nodeType == Node.TEXT_NODE:
                self.copyText(child.data, target)
            # other node types (comments, etc.) are not copied

            # stop at every level of the recursion once the excerpt is full
            if self.full: break

    def copyElement(self, source, target):
        """ copy source element to the target """

        # check the omit list
        if source.nodeName in omit:
            if source.nodeName == 'img':
                return self.elideImage(source, target)
            # drop the element itself, but keep what it contains
            return self.copyChildren(source, target)

        # copy element, attributes, and children
        child = self.dom.createElementNS(source.namespaceURI, source.nodeName)
        target.appendChild(child)
        for i in range(0, source.attributes.length):
            attr = source.attributes.item(i)
            child.setAttributeNS(attr.namespaceURI, attr.name, attr.value)
        self.copyChildren(source, child)

    def elideImage(self, source, target):
        """ copy an elided form of the image element to the target """
        alt = source.getAttribute('alt') or '<img>'
        src = source.getAttribute('src')

        if target.nodeName == 'a' or not src:
            # already inside a link, or nothing to link to: text only
            self.copyText(alt, target)
        else:
            # replace the image with a link to it, labelled by the alt text
            child = self.dom.createElement('a')
            child.setAttribute('href', src)
            self.copyText(alt, child)
            target.appendChild(child)

    def copyText(self, source, target):
        """ copy text to the target, until the point where it would wrap """

        # empty and whitespace-only text is not copied
        if not source.strip():
            return

        self.text.append(source.strip())
        lines = wrapper.wrap(' '.join(self.text))

        if len(lines) == 1:
            # everything copied so far still fits on one line
            target.appendChild(self.dom.createTextNode(source))
            self.textlen = len(lines[0])
        else:
            # keep only what fits on the first line, then mark the cut
            room = len(lines[0]) - self.textlen
            excerpt = source[:room] + u' \u2026'
            # create the node with the document given to the constructor
            # rather than relying on a module-level 'dom' being defined
            target.appendChild(self.dom.createTextNode(excerpt))
            self.full = True
# select summary or content element
dom = minidom.parse(sys.stdin)
source = dom.getElementsByTagNameNS(atomNS, 'summary')
if not source:
source = dom.getElementsByTagNameNS(atomNS, 'content')
# if present, recursively copy it to a planet:excerpt element
if source:
dom.documentElement.setAttribute('xmlns:planet', planetNS)
target = dom.createElementNS(planetNS, 'planet:excerpt')
source[0].parentNode.appendChild(target)
copy(dom, source[0], target)
# print out results
print dom.toxml('utf-8')

View File

@ -29,18 +29,18 @@ def run(template_file, doc, mode='template'):
try:
module = __import__(module_name)
except Exception, inst:
print module_name
return log.error("Skipping %s '%s' after failing to load '%s': %s",
mode, template_resolved, module_name, inst)
# Execute the shell module
options = planet.config.template_options(template_file)
if mode == 'filter':
log.debug("Processing filer %s using %s", template_resolved,
module_name)
return module.run(template_resolved, doc, None)
return module.run(template_resolved, doc, None, options)
else:
log.info("Processing template %s using %s", template_resolved,
module_name)
output_dir = planet.config.output_dir()
output_file = os.path.join(output_dir, base)
module.run(template_resolved, doc, output_file)
module.run(template_resolved, doc, output_file, options)

View File

@ -1,6 +1,6 @@
from subprocess import Popen, PIPE
def run(script, doc, output_file=None):
def run(script, doc, output_file=None, options={}):
""" process an Python script """
if output_file:
@ -8,9 +8,14 @@ def run(script, doc, output_file=None):
else:
out = PIPE
proc = Popen(['python', script], stdin=PIPE, stdout=out, stderr=PIPE)
options = sum([['--'+key, value] for key,value in options.items()], [])
proc = Popen(['python', script] + options,
stdin=PIPE, stdout=out, stderr=PIPE)
stdout, stderr = proc.communicate(doc)
if stderr:
print stderr
import planet
planet.logger.error(stderr)
return stdout

View File

@ -221,7 +221,7 @@ def template_info(source):
return output
def run(script, doc, output_file=None):
def run(script, doc, output_file=None, options={}):
""" process an HTMLTMPL file """
manager = htmltmpl.TemplateManager()
template = manager.prepare(script)

View File

@ -1,6 +1,6 @@
import os
def run(script, doc, output_file=None):
def run(script, doc, output_file=None, options={}):
""" process an XSLT stylesheet """
try:

View File

@ -0,0 +1,5 @@
[Planet]
filters = excerpt.py
[excerpt.py]
omit = img

View File

@ -0,0 +1,10 @@
<entry xmlns="http://www.w3.org/2005/Atom">
<summary><div xmlns="http://wwww.w3.org/1999/xhtml">before
<img src="inner" alt="bar"/>
<a href="outer1"><img src="foo" alt="bar"/></a>
<a href="outer2"><img src="foo"/></a>
after</div></summary>
</entry>

View File

@ -0,0 +1,6 @@
[Planet]
filters = excerpt.py
[excerpt.py]
width = 100
omit = p

View File

@ -0,0 +1,8 @@
<entry xmlns="http://www.w3.org/2005/Atom">
<summary><div xmlns="http://wwww.w3.org/1999/xhtml"><p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nullam velit. Vivamus tincidunt, erat in rutrum fringilla, urna urna nonummy turpis, et lobortis eros dolor eu dui. Pellentesque vitae lorem. Sed lobortis arcu accumsan sapien. Pellentesque eget nulla et justo mollis mattis. Nulla dictum est eleifend nisl. Pellentesque ultricies ligula vel arcu. Ut ac mi in felis porta tristique. Donec cursus mollis ipsum. Maecenas nonummy.</p>
<p>Sed posuere. Phasellus pellentesque mattis mauris. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos hymenaeos. Ut ullamcorper consequat eros. Morbi hendrerit faucibus felis. Pellentesque odio eros, bibendum eget, ultrices ac, tempus quis, diam. Donec posuere, ligula eget sodales tristique, enim nunc faucibus nibh, luctus sagittis elit orci a nulla. Nulla scelerisque. In hac habitasse platea dictumst. Etiam vel nisl quis mauris metus.</p>
<p>Vivamus nonummy, justo at malesuada mollis, nisi purus fermentum neque, a faucibus dolor lorem at sem. Nunc quam nulla, lobortis sed, vehicula at, elementum volutpat.</p></div></summary>
</entry>

View File

@ -1,15 +1,46 @@
#!/usr/bin/env python
import unittest, xml.dom.minidom
from planet import shell
testfile = 'tests/data/filter/coral_cdn.xml'
filter = 'coral_cdn_filter.py'
from planet import shell, config
class FilterTests(unittest.TestCase):
    """ run filters over canned entries and inspect what they produce """

    def _excerpt(self, testfile, inifile):
        """ load inifile, pipe testfile through every configured filter,
        and return the first planet:excerpt element of the result """
        config.load(inifile)
        output = open(testfile).read()
        for name in config.filters():
            output = shell.run(name, output, mode="filter")
        dom = xml.dom.minidom.parseString(output)
        return dom.getElementsByTagName('planet:excerpt')[0]

    def test_coral_cdn(self):
        """ the src of the first img is rewritten to the expected URL """
        entry = open('tests/data/filter/coral_cdn.xml').read()
        output = shell.run('coral_cdn_filter.py', entry, mode="filter")
        images = xml.dom.minidom.parseString(output).getElementsByTagName('img')
        self.assertEqual('http://example.com.nyud.net:8080/foo.png',
            images[0].getAttribute('src'))

    def test_excerpt_images(self):
        """ omitted images end up as links labelled by their alt text """
        excerpt = self._excerpt('tests/data/filter/excerpt-images.xml',
            'tests/data/filter/excerpt-images.ini')

        hrefs = []
        texts = []
        for anchor in excerpt.getElementsByTagName('a'):
            hrefs.append(anchor.getAttribute('href'))
            texts.append(anchor.lastChild.nodeValue)

        self.assertEqual(['inner','outer1','outer2'], hrefs)
        self.assertEqual(['bar','bar','<img>'], texts)

    def test_excerpt_lorem_ipsum(self):
        """ long text is cut at the configured width and ends in an ellipsis """
        excerpt = self._excerpt('tests/data/filter/excerpt-lorem-ipsum.xml',
            'tests/data/filter/excerpt-lorem-ipsum.ini')

        expected = (u'Lorem ipsum dolor sit amet, consectetuer ' +
            u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
            u'in \u2026')
        self.assertEqual(expected, excerpt.firstChild.firstChild.nodeValue)