Properly handle content type text/plain

2007-01-12 06:19:19 -05:00 · 2007-01-12 06:19:19 -05:00 · f2ac92465d
commit f2ac92465d
parent 3024af031f
1 changed files with 24 additions and 17 deletions
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -16,7 +16,6 @@ Todo:
 import re, time, md5, sgmllib
 from xml.sax.saxutils import escape
 from xml.dom import minidom, Node
 from BeautifulSoup import BeautifulSoup
 from planet.html5lib import liberalxmlparser, treebuilders
 import planet, config
@@ -139,25 +138,33 @@ def content(xentry, name, detail, bozo):
    xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    xdoc = xentry.ownerDocument
    xcontent = xdoc.createElement(name)
    if isinstance(detail.value,unicode):
        detail.value=detail.value.encode('utf-8')
-    parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
+    if not detail.has_key('type') or detail.type.lower().find('html')<0:
-    html = parser.parse(xdiv % detail.value, encoding="utf-8")
+        detail['value'] = escape(detail.value)
-    for body in html.documentElement.childNodes:
+        detail['type'] = 'text/html'
-        if body.nodeType != Node.ELEMENT_NODE: continue
+
-        if body.nodeName != 'body': continue
+    if detail.type.find('xhtml')>=0 and not bozo:
-        for div in body.childNodes:
+        data = minidom.parseString(xdiv % detail.value).documentElement
-            if div.nodeType != Node.ELEMENT_NODE: continue
+    else:
-            if div.nodeName != 'div': continue
+        parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
-            div.normalize()
+        html = parser.parse(xdiv % detail.value, encoding="utf-8")
-            if len(div.childNodes) == 1 and \
+        for body in html.documentElement.childNodes:
-                div.firstChild.nodeType == Node.TEXT_NODE:
+            if body.nodeType != Node.ELEMENT_NODE: continue
-                data = div.firstChild
+            if body.nodeName != 'body': continue
-            else:
+            for div in body.childNodes:
-                data = div
+                if div.nodeType != Node.ELEMENT_NODE: continue
-                xcontent.setAttribute('type', 'xhtml')
+                if div.nodeName != 'div': continue
-            break
+                div.normalize()
                if len(div.childNodes) == 1 and \
                    div.firstChild.nodeType == Node.TEXT_NODE:
                    data = div.firstChild
                else:
                    data = div
                    xcontent.setAttribute('type', 'xhtml')
                break
    if data: xcontent.appendChild(data)