Improved parsing of RSS 2.0 authors

2006-08-31 20:20:24 -04:00 · 2006-08-31 20:20:24 -04:00 · 4b7891110f
commit 4b7891110f
parent 01e47cb49a
1 changed files with 4 additions and 2 deletions
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 1.134 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.135 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@ -1112,12 +1112,14 @@ class _FeedParserMixin:
        else:
            author, email = context.get(key), None
            if not author: return
-            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
+            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, '')
                author = author.replace('()', '')
+                author = author.replace('<>', '')
+                author = author.replace('&lt;&gt;', '')
                author = author.strip()
                if author and (author[0] == '('):
                    author = author[1:]