Filters to add languages, categories
This commit is contained in:
commit
9ee9adbe5d
1
THANKS
1
THANKS
@ -6,6 +6,7 @@ Michael Koziarski - HTTP Auth fix
|
||||
Brian Ewins - Win32 / Portalocker
|
||||
Joe Gregorio - Invoke same version of Python for filters
|
||||
Harry Fuecks - Pipe characters in file names
|
||||
Eric van der Vlist - Filters to add language, category information
|
||||
|
||||
This codebase represents a radical refactoring of Planet 2.0, which lists
|
||||
the following contributors:
|
||||
|
82
examples/filters/categories/categories.xslt
Normal file
82
examples/filters/categories/categories.xslt
Normal file
@ -0,0 +1,82 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE xsl:stylesheet [
<!ENTITY categoryTerm "WebSemantique">
]>
<!--

This transformation is released under the same licence as Python
see http://www.intertwingly.net/code/venus/LICENCE.

Author: Eric van der Vlist <vdv@dyomedea.com>

This transformation is meant to be used as a filter that determines if
Atom entries are relevant to a specific topic and adds the corresponding
<category/> element when it is the case.

This is done by a simple keyword matching mechanism.

To customize this filter to your needs:

1) Replace WebSemantique by your own category name in the definition of
the categoryTerm entity above.
2) Review the "upper" and "lower" variables that are used to convert text
nodes to lower case and replace common punctuation signs with spaces,
to check that they meet your needs.
3) Define your own list of keywords in <d:keyword/> elements. Note that
the leading and trailing spaces are significant: "> rdf <" will match rdf
as an entire word while ">rdf<" would match the substring "rdf" and
"> rdf<" would match words starting with rdf. Also note that the test is
done after conversion to lowercase.

To use it with venus, just add this filter to the list of filters, for
instance:

filters= categories.xslt guess-language/guess-language.py

-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:atom="http://www.w3.org/2005/Atom" xmlns="http://www.w3.org/2005/Atom"
    xmlns:d="http://ns.websemantique.org/data/" exclude-result-prefixes="d atom" version="1.0">
    <!-- Character tables for translate(): each character of $upper maps
         one-for-one onto the character at the same position in $lower,
         lowercasing ASCII and Latin-1 letters and turning punctuation
         into spaces.  NOTE(review): the two strings must be exactly the
         same length; whitespace appears collapsed in this copy - the
         ",.;" prefix of $upper should presumably map to three leading
         spaces in $lower.  Confirm against the upstream file. -->
    <xsl:variable name="upper"
        >,.;AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZzÀàÁáÂâÃãÄäÅ寿ÇçÈèÉéÊêËëÌìÍíÎîÏïÐðÑñÒòÓóÔôÕõÖöØøÙùÚúÛûÜüÝýÞþ</xsl:variable>
    <xsl:variable name="lower"
        > aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzzaaaaaaaaaaaaææcceeeeeeeeiiiiiiiiððnnooooooooooøøuuuuuuuuyyþþ</xsl:variable>
    <!-- Keyword list: an entry is tagged when its lowercased text
         contains any of these strings.  Leading/trailing spaces control
         whole-word vs prefix/substring matching (see header comment). -->
    <d:keywords>
        <d:keyword> wiki semantique </d:keyword>
        <d:keyword> wikis semantiques </d:keyword>
        <d:keyword> web semantique </d:keyword>
        <d:keyword> websemantique </d:keyword>
        <d:keyword> semantic web</d:keyword>
        <d:keyword> semweb</d:keyword>
        <d:keyword> rdf</d:keyword>
        <d:keyword> owl </d:keyword>
        <d:keyword> sparql </d:keyword>
        <d:keyword> topic map</d:keyword>
        <d:keyword> doap </d:keyword>
        <d:keyword> foaf </d:keyword>
        <d:keyword> sioc </d:keyword>
        <d:keyword> ontology </d:keyword>
        <d:keyword> ontologie</d:keyword>
        <d:keyword> dublin core </d:keyword>
    </d:keywords>
    <!-- Identity transform: copy everything else through unchanged. -->
    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>
    <!-- Hook on atom:updated (exactly one per entry): after copying it,
         build a lowercased concatenation of the entry's text and emit
         the <category/> element when any keyword matches. -->
    <xsl:template match="atom:entry/atom:updated">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
        <xsl:variable name="concatenatedText">
            <xsl:for-each select="../atom:title|../atom:summary|../atom:content|../atom:category/@term">
                <xsl:text> </xsl:text>
                <xsl:value-of select="translate(., $upper, $lower)"/>
            </xsl:for-each>
            <xsl:text> </xsl:text>
        </xsl:variable>
        <!-- document('') re-reads this very stylesheet to fetch the
             d:keywords list declared above.
             NOTE(review): term is hard-coded here rather than reusing
             the categoryTerm entity - keep the two in sync. -->
        <xsl:if test="document('')/*/d:keywords/d:keyword[contains($concatenatedText, .)]">
            <category term="WebSemantique"/>
        </xsl:if>
    </xsl:template>
    <!-- Drop any pre-existing copy of this category so it is not
         duplicated when the filter runs again on its own output. -->
    <xsl:template match="atom:category[@term='&categoryTerm;']"/>
</xsl:stylesheet>
|
37
examples/filters/guess-language/README
Normal file
37
examples/filters/guess-language/README
Normal file
@ -0,0 +1,37 @@
|
||||
This filter is released under the same licence as Python
|
||||
see http://www.intertwingly.net/code/venus/LICENCE.
|
||||
|
||||
Author: Eric van der Vlist <vdv@dyomedea.com>
|
||||
|
||||
This filter guesses whether an Atom entry is written
|
||||
in English or French. It should be trivial to choose between
|
||||
two other languages, easy to extend to more than two languages
|
||||
and useful to pass these languages as Venus configuration
|
||||
parameters.
|
||||
|
||||
The code used to guess the language is the one that has been
|
||||
described by Douglas Bagnall as the Python recipe titled
|
||||
"Language detection using character trigrams"
|
||||
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576.
|
||||
|
||||
To add support for a new language, this language must first be
|
||||
"learned" using learn-language.py. This learning phase is nothing
|
||||
more than saving a pickled version of the Trigram object for this
|
||||
language.
|
||||
|
||||
To learn Finnish, you would execute:
|
||||
|
||||
$ ./learn-language.py http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt fi.data
|
||||
|
||||
where http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt is a text
|
||||
representative of the Finnish language and "fi.data" is the name of the
|
||||
data file for "fi" (ISO code for Finnish).
|
||||
|
||||
To install this filter, copy this directory under the Venus
|
||||
filter directory and declare it in your filters list, for instance:
|
||||
|
||||
filters= categories.xslt guess-language/guess-language.py
|
||||
|
||||
NOTE: this filter depends on Amara
|
||||
(http://uche.ogbuji.net/tech/4suite/amara/)
|
||||
|
15131
examples/filters/guess-language/en.data
Normal file
15131
examples/filters/guess-language/en.data
Normal file
File diff suppressed because it is too large
Load Diff
22710
examples/filters/guess-language/fr.data
Normal file
22710
examples/filters/guess-language/fr.data
Normal file
File diff suppressed because it is too large
Load Diff
58
examples/filters/guess-language/guess-language.py
Normal file
58
examples/filters/guess-language/guess-language.py
Normal file
@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python
"""A filter to guess languages.

This filter guesses whether an Atom entry is written
in English or French. It should be trivial to choose between
two other languages, easy to extend to more than two languages
and useful to pass these languages as Venus configuration
parameters.

(See the README file for more details).

Requires Python 2.1, recommends 2.4.
"""
|
||||
__authors__ = [ "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||
__license__ = "Python"
|
||||
|
||||
import amara
|
||||
from sys import stdin, stdout
|
||||
from trigram import Trigram
|
||||
from xml.dom import XML_NAMESPACE as XML_NS
|
||||
import cPickle
|
||||
|
||||
# Namespace prefixes used by the XPath expressions below.
ATOM_NSS = {
    u'atom': u'http://www.w3.org/2005/Atom',
    u'xml': XML_NS
}

# Lazily-populated cache of language code -> reference Trigram,
# filled by tri() on first use of each language.
langs = {}
||||
|
||||
def tri(lang):
    """Return the reference Trigram for *lang* (e.g. 'en', 'fr').

    The pickled profile is loaded from
    filters/guess-language/<lang>.data (relative to the Venus working
    directory) on first use and cached in the module-level ``langs``
    dict for subsequent calls.
    """
    if lang not in langs:  # 'in' instead of deprecated dict.has_key()
        # Pickles are binary data: open with 'rb' rather than 'r' so the
        # file is not mangled by newline translation on Windows.
        f = open('filters/guess-language/%s.data' % lang, 'rb')
        try:
            langs[lang] = cPickle.load(f)
        finally:
            # close the file even if unpickling raises
            f.close()
    return langs[lang]
|
||||
|
||||
|
||||
def guess_language(entry):
    """Guess whether *entry* (an amara Atom entry element) is written in
    English or French and record the verdict as an xml:lang attribute
    on the entry."""
    # Concatenate all the human-readable text of the entry.
    text = u'';
    for child in entry.xml_xpath(u'atom:title|atom:summary|atom:content'):
        text = text + u' '+ child.__unicode__()
    t = Trigram()
    t.parseString(text)
    # Trigram.__sub__ yields a difference (0 = identical, 1 = unrelated):
    # pick the reference language whose profile is closest to the text.
    if tri('fr') - t > tri('en') - t:
        lang=u'en'
    else:
        lang=u'fr'
    entry.xml_set_attribute((u'xml:lang', XML_NS), lang)
||||
|
||||
def main():
    """Filter entry point: read an Atom feed from stdin, guess a
    language for every entry that lacks an explicit xml:lang, and write
    the (possibly modified) feed to stdout."""
    feed = amara.parse(stdin, prefixes=ATOM_NSS)
    # Entries that already declare a language are left untouched.
    for entry in feed.xml_xpath(u'//atom:entry[not(@xml:lang)]'):
        guess_language(entry)
    feed.xml(stdout)

# Run as a Venus stdin/stdout filter.
if __name__ == '__main__':
    main()
|
25
examples/filters/guess-language/learn-language.py
Executable file
25
examples/filters/guess-language/learn-language.py
Executable file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env python
"""A filter to guess languages.

This utility saves a Trigram object on file, "learning" a language so
that guess-language.py can recognise it later.

(See the README file for more details).

Requires Python 2.1, recommends 2.4.
"""
|
||||
__authors__ = [ "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||
__license__ = "Python"
|
||||
|
||||
from trigram import Trigram
|
||||
from sys import argv
|
||||
from cPickle import dump
|
||||
|
||||
|
||||
def main():
    """Build a Trigram from the text at argv[1] (file path or URL) and
    pickle it to the file named by argv[2]."""
    tri = Trigram(argv[1])
    # Pickles are binary data: open with 'wb' rather than 'w' so the
    # output is not corrupted by newline translation on Windows.
    out = open(argv[2], 'wb')
    try:
        dump(tri, out)
    finally:
        # close the file even if pickling raises
        out.close()

if __name__ == '__main__':
    main()
|
188
examples/filters/guess-language/trigram.py
Normal file
188
examples/filters/guess-language/trigram.py
Normal file
@ -0,0 +1,188 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
This class is based on the Python recipe titled
|
||||
"Language detection using character trigrams"
|
||||
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576
|
||||
by Douglas Bagnall.
|
||||
It has been (slightly) adapted by Eric van der Vlist to support
|
||||
Unicode and accept a method to parse strings.
|
||||
"""
|
||||
__authors__ = [ "Douglas Bagnall", "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||
__license__ = "Python"
|
||||
|
||||
import random
|
||||
from urllib import urlopen
|
||||
|
||||
class Trigram:
    """
    From one or more text files, the frequency of three character
    sequences is calculated. When treated as a vector, this information
    can be compared to other trigrams, and the difference between them
    seen as an angle. The cosine of this angle varies between 1 for
    complete similarity, and 0 for utter difference. Since letter
    combinations are characteristic to a language, this can be used to
    determine the language of a body of text. For example:

    >>> reference_en = Trigram('/path/to/reference/text/english')
    >>> reference_de = Trigram('/path/to/reference/text/german')
    >>> unknown = Trigram('url://pointing/to/unknown/text')
    >>> unknown.similarity(reference_de)
    0.4
    >>> unknown.similarity(reference_en)
    0.95

    would indicate the unknown text is almost certainly English. As
    syntax sugar, the minus sign is overloaded to return the difference
    between texts, so the above objects would give you:

    >>> unknown - reference_de
    0.6
    >>> reference_en - unknown # order doesn't matter.
    0.05

    As it stands, the Trigram ignores character set information, which
    means you can only accurately compare within a single encoding
    (iso-8859-1 in the examples). A more complete implementation might
    convert to unicode first.

    As an extra bonus, there is a method to make up nonsense words in the
    style of the Trigram's text.

    >>> reference_en.makeWords(30)
    My withillonquiver and ald, by now wittlectionsurper, may sequia,
    tory, I ad my notter. Marriusbabilly She lady for rachalle spen
    hat knong al elf

    Beware when using urls: HTML won't be parsed out.

    Most methods chatter away to standard output, to let you know they're
    still there.
    """

    # Scalar (Euclidean) length of the trigram count vector; class-level
    # default, overwritten per instance by measure().
    length = 0

    def __init__(self, fn=None):
        """Optionally parse *fn* (a file path or URL) straight away."""
        # lut maps a two-character pair -> {following_char: count}.
        self.lut = {}
        if fn is not None:
            self.parseFile(fn)

    # NOTE(review): the default *pair* looks like it should be a
    # two-character seed ('  ', two spaces) -- pair[1] below assumes
    # len(pair) == 2.  Whitespace may have been collapsed in this copy
    # of the file; confirm against the upstream source.
    def _parseAFragment(self, line, pair=' '):
        """Accumulate trigram counts for *line*, seeded with *pair*;
        return the final two-character window so that successive
        fragments can be chained."""
        for letter in line:
            d = self.lut.setdefault(pair, {})
            d[letter] = d.get(letter, 0) + 1
            # Slide the window: drop the oldest character, append the new.
            pair = pair[1] + letter
        return pair

    def parseString(self, string):
        """Add *string*'s trigrams and refresh the cached vector length."""
        self._parseAFragment(string)
        self.measure()

    def parseFile(self, fn, encoding="iso-8859-1"):
        """Parse a local file or a URL line by line, decoding each line
        with *encoding* before counting trigrams."""
        pair = ' '  # NOTE(review): likely '  ' upstream -- see above.
        if '://' in fn:
            #print "trying to fetch url, may take time..."
            f = urlopen(fn)
        else:
            f = open(fn)
        for z, line in enumerate(f):
            #if not z % 1000:
            #    print "line %s" % z
            # \n's are spurious in a prose context
            pair = self._parseAFragment(line.strip().decode(encoding) + ' ')
        f.close()
        self.measure()


    def measure(self):
        """calculates the scalar length of the trigram vector and
        stores it in self.length."""
        total = 0
        for y in self.lut.values():
            total += sum([ x * x for x in y.values() ])
        self.length = total ** 0.5

    def similarity(self, other):
        """returns a number between 0 and 1 indicating similarity.
        1 means an identical ratio of trigrams;
        0 means no trigrams in common.
        """
        if not isinstance(other, Trigram):
            raise TypeError("can't compare Trigram with non-Trigram")
        lut1 = self.lut
        lut2 = other.lut
        total = 0
        # Dot product of the two sparse trigram count vectors.
        for k in lut1.keys():
            if k in lut2:
                a = lut1[k]
                b = lut2[k]
                for x in a:
                    if x in b:
                        total += a[x] * b[x]

        # Cosine of the angle between the two vectors.
        return float(total) / (self.length * other.length)

    def __sub__(self, other):
        """indicates difference between trigram sets; 1 is entirely
        different, 0 is entirely the same."""
        return 1 - self.similarity(other)


    def makeWords(self, count):
        """returns a string of made-up words based on the known text."""
        text = []
        k = ' '  # NOTE(review): likely '  ' upstream -- see above.
        while count:
            n = self.likely(k)
            text.append(n)
            k = k[1] + n
            # A produced space/tab ends a word; count words, not chars.
            if n in ' \t':
                count -= 1
        return ''.join(text)


    def likely(self, k):
        """Returns a character likely to follow the given string
        two character string, or a space if nothing is found."""
        if k not in self.lut:
            return ' '
        # if you were using this a lot, caching would be a good idea.
        letters = []
        # Repeat each candidate by its count so random.choice picks
        # proportionally to observed frequency.
        for k, v in self.lut[k].items():
            letters.append(k * v)
        letters = ''.join(letters)
        return random.choice(letters)
|
||||
|
||||
|
||||
def test():
    """Fetch reference texts from Project Gutenberg and print the
    pairwise differences between their trigram profiles.

    Needs network access; uses Python 2 print statements.  Smaller
    numbers mean more similar texts, so same-language pairs (en - en2,
    fr - fr2, no - no2) should come out lowest."""
    en = Trigram('http://gutenberg.net/dirs/etext97/lsusn11.txt')
    #NB fr and some others have English license text.
    # no has english excerpts.
    fr = Trigram('http://gutenberg.net/dirs/etext03/candi10.txt')
    fi = Trigram('http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt')
    no = Trigram('http://gutenberg.net/dirs/1/2/8/4/12844/12844-8.txt')
    se = Trigram('http://gutenberg.net/dirs/1/0/1/1/10117/10117-8.txt')
    no2 = Trigram('http://gutenberg.net/dirs/1/3/0/4/13041/13041-8.txt')
    en2 = Trigram('http://gutenberg.net/dirs/etext05/cfgsh10.txt')
    fr2 = Trigram('http://gutenberg.net/dirs/1/3/7/0/13704/13704-8.txt')
    print "calculating difference:"
    print "en - fr is %s" % (en - fr)
    print "fr - en is %s" % (fr - en)
    print "en - en2 is %s" % (en - en2)
    print "en - fr2 is %s" % (en - fr2)
    print "fr - en2 is %s" % (fr - en2)
    print "fr - fr2 is %s" % (fr - fr2)
    print "fr2 - en2 is %s" % (fr2 - en2)
    print "fi - fr is %s" % (fi - fr)
    print "fi - en is %s" % (fi - en)
    print "fi - se is %s" % (fi - se)
    print "no - se is %s" % (no - se)
    print "en - no is %s" % (en - no)
    print "no - no2 is %s" % (no - no2)
    print "se - no2 is %s" % (se - no2)
    print "en - no2 is %s" % (en - no2)
    print "fr - no2 is %s" % (fr - no2)


if __name__ == '__main__':
    test()
|
Loading…
Reference in New Issue
Block a user