Filters to add languages, categories
This commit is contained in:
commit
9ee9adbe5d
1
THANKS
1
THANKS
@ -6,6 +6,7 @@ Michael Koziarski - HTTP Auth fix
|
|||||||
Brian Ewins - Win32 / Portalocker
|
Brian Ewins - Win32 / Portalocker
|
||||||
Joe Gregorio - Invoke same version of Python for filters
|
Joe Gregorio - Invoke same version of Python for filters
|
||||||
Harry Fuecks - Pipe characters in file names
|
Harry Fuecks - Pipe characters in file names
|
||||||
|
Eric van der Vlist - Filters to add language, category information
|
||||||
|
|
||||||
This codebase represents a radical refactoring of Planet 2.0, which lists
|
This codebase represents a radical refactoring of Planet 2.0, which lists
|
||||||
the following contributors:
|
the following contributors:
|
||||||
|
82
examples/filters/categories/categories.xslt
Normal file
82
examples/filters/categories/categories.xslt
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE xsl:stylesheet [
<!ENTITY categoryTerm "WebSemantique">
]>
<!--

This transformation is released under the same licence as Python
see http://www.intertwingly.net/code/venus/LICENCE.

Author: Eric van der Vlist <vdv@dyomedea.com>

This transformation is meant to be used as a filter that determines if
Atom entries are relevant to a specific topic and adds the corresponding
<category/> element when it is the case.

This is done by a simple keyword matching mechanism.

To customize this filter to your needs:

1) Replace WebSemantique by your own category name in the definition of
the categoryTerm entity above.
2) Review the "upper" and "lower" variables that are used to convert text
nodes to lower case and replace common punctuation signs into spaces
to check that they meet your needs.
3) Define your own list of keywords in <d:keyword/> elements. Note that
the leading and trailing spaces are significant: "> rdf <" will match rdf
as an entire word while ">rdf<" would match the substring "rdf" and
"> rdf<" would match words starting with rdf. Also note that the test is done
after conversion to lowercase.

To use it with venus, just add this filter to the list of filters, for instance:

filters= categories.xslt guess_language.py

-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:atom="http://www.w3.org/2005/Atom" xmlns="http://www.w3.org/2005/Atom"
    xmlns:d="http://ns.websemantique.org/data/" exclude-result-prefixes="d atom" version="1.0">
    <!-- Translation tables for translate(): each character of $upper maps,
         position by position, onto the character of $lower at the same index.
         ",.;" map to spaces, upper/lower ASCII pairs map to lowercase, and
         accented Latin-1 letters map to their unaccented lowercase form.
         NOTE(review): the three leading spaces in $lower are significant —
         they must line up with ",.;" in $upper. -->
    <xsl:variable name="upper"
        >,.;AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZzÀàÁáÂâÃãÄäÅ寿ÇçÈèÉéÊêËëÌìÍíÎîÏïÐðÑñÒòÓóÔôÕõÖöØøÙùÚúÛûÜüÝýÞþ</xsl:variable>
    <xsl:variable name="lower"
        >   aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzzaaaaaaaaaaaaææcceeeeeeeeiiiiiiiiððnnooooooooooøøuuuuuuuuyyþþ</xsl:variable>
    <!-- Keyword list consulted via document('') below. Leading/trailing
         spaces inside each keyword are significant (word-boundary matching). -->
    <d:keywords>
        <d:keyword> wiki semantique </d:keyword>
        <d:keyword> wikis semantiques </d:keyword>
        <d:keyword> web semantique </d:keyword>
        <d:keyword> websemantique </d:keyword>
        <d:keyword> semantic web</d:keyword>
        <d:keyword> semweb</d:keyword>
        <d:keyword> rdf</d:keyword>
        <d:keyword> owl </d:keyword>
        <d:keyword> sparql </d:keyword>
        <d:keyword> topic map</d:keyword>
        <d:keyword> doap </d:keyword>
        <d:keyword> foaf </d:keyword>
        <d:keyword> sioc </d:keyword>
        <d:keyword> ontology </d:keyword>
        <d:keyword> ontologie</d:keyword>
        <d:keyword> dublin core </d:keyword>
    </d:keywords>
    <!-- Identity template: copy every node and attribute through unchanged. -->
    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>
    <!-- Hook on atom:updated (present exactly once per entry): after copying
         it, gather the entry's searchable text, lowercased/normalized via the
         tables above, and emit a <category/> if any keyword matches. -->
    <xsl:template match="atom:entry/atom:updated">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
        <xsl:variable name="concatenatedText">
            <xsl:for-each select="../atom:title|../atom:summary|../atom:content|../atom:category/@term">
                <xsl:text> </xsl:text>
                <xsl:value-of select="translate(., $upper, $lower)"/>
            </xsl:for-each>
            <xsl:text> </xsl:text>
        </xsl:variable>
        <!-- document('') re-reads this stylesheet to reach d:keywords. -->
        <xsl:if test="document('')/*/d:keywords/d:keyword[contains($concatenatedText, .)]">
            <category term="WebSemantique"/>
        </xsl:if>
    </xsl:template>
    <!-- Drop any pre-existing category with our term to avoid duplicates. -->
    <xsl:template match="atom:category[@term='&categoryTerm;']"/>
</xsl:stylesheet>
|
37
examples/filters/guess-language/README
Normal file
37
examples/filters/guess-language/README
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
This filter is released under the same licence as Python
|
||||||
|
see http://www.intertwingly.net/code/venus/LICENCE.
|
||||||
|
|
||||||
|
Author: Eric van der Vlist <vdv@dyomedea.com>
|
||||||
|
|
||||||
|
This filter guesses whether an Atom entry is written
|
||||||
|
in English or French. It should be trivial to choose between
|
||||||
|
two other languages, easy to extend to more than two languages
|
||||||
|
and useful to pass these languages as Venus configuration
|
||||||
|
parameters.
|
||||||
|
|
||||||
|
The code used to guess the language is the one that has been
|
||||||
|
described by Douglas Bagnall as the Python recipe titled
|
||||||
|
"Language detection using character trigrams"
|
||||||
|
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576.
|
||||||
|
|
||||||
|
To add support for a new language, this language must first be
|
||||||
|
"learned" using learn-language.py. This learning phase is nothing
|
||||||
|
more than saving a pickled version of the Trigram object for this
|
||||||
|
language.
|
||||||
|
|
||||||
|
To learn Finnish, you would execute:
|
||||||
|
|
||||||
|
$ ./learn-language.py http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt fi.data
|
||||||
|
|
||||||
|
where http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt is a text
|
||||||
|
representative of the Finnish language and "fi.data" is the name of the
|
||||||
|
data file for "fi" (ISO code for Finnish).
|
||||||
|
|
||||||
|
To install this filter, copy this directory under the Venus
|
||||||
|
filter directory and declare it in your filters list, for instance:
|
||||||
|
|
||||||
|
filters= categories.xslt guess-language/guess-language.py
|
||||||
|
|
||||||
|
NOTE: this filter depends on Amara
|
||||||
|
(http://uche.ogbuji.net/tech/4suite/amara/)
|
||||||
|
|
15131
examples/filters/guess-language/en.data
Normal file
15131
examples/filters/guess-language/en.data
Normal file
File diff suppressed because it is too large
Load Diff
22710
examples/filters/guess-language/fr.data
Normal file
22710
examples/filters/guess-language/fr.data
Normal file
File diff suppressed because it is too large
Load Diff
58
examples/filters/guess-language/guess-language.py
Normal file
58
examples/filters/guess-language/guess-language.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""A filter to guess languages.
|
||||||
|
|
||||||
|
This filter guesses whether an Atom entry is written
|
||||||
|
in English or French. It should be trivial to choose between
|
||||||
|
two other languages, easy to extend to more than two languages
|
||||||
|
and useful to pass these languages as Venus configuration
|
||||||
|
parameters.
|
||||||
|
|
||||||
|
(See the README file for more details).
|
||||||
|
|
||||||
|
Requires Python 2.1, recommends 2.4.
|
||||||
|
"""
|
||||||
|
__authors__ = [ "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||||
|
__license__ = "Python"
|
||||||
|
|
||||||
|
import amara
|
||||||
|
from sys import stdin, stdout
|
||||||
|
from trigram import Trigram
|
||||||
|
from xml.dom import XML_NAMESPACE as XML_NS
|
||||||
|
import cPickle
|
||||||
|
|
||||||
|
# Namespace prefix map handed to Amara when parsing/querying the feed.
ATOM_NSS = {
    u'atom': u'http://www.w3.org/2005/Atom',
    u'xml': XML_NS,
}

# Lazily-populated cache of reference Trigram objects, keyed by language
# code; filled on demand by tri().
langs = {}
|
||||||
|
|
||||||
|
def tri(lang):
    """Return the reference Trigram for *lang* (e.g. 'en' or 'fr').

    The pickled trigram is loaded from filters/guess-language/<lang>.data
    on first use and cached in the module-level ``langs`` dict, so each
    data file is read at most once per process.
    """
    if not langs.has_key(lang):
        f = open('filters/guess-language/%s.data' % lang, 'r')
        try:
            # Close the data file even if unpickling fails part-way
            # (the original leaked the handle on a cPickle error).
            langs[lang] = cPickle.load(f)
        finally:
            f.close()
    return langs[lang]
|
||||||
|
|
||||||
|
|
||||||
|
def guess_language(entry):
    """Guess whether *entry* is English or French and tag it with xml:lang.

    The entry's title, summary and content are concatenated, turned into
    a Trigram, and compared against the cached 'en' and 'fr' reference
    trigrams; the closer language wins.
    """
    # Gather the searchable text of the entry (leading space included,
    # exactly as the original string concatenation produced).
    fragments = [u'']
    for node in entry.xml_xpath(u'atom:title|atom:summary|atom:content'):
        fragments.append(node.__unicode__())
    text = u' '.join(fragments)
    sample = Trigram()
    sample.parseString(text)
    # A smaller trigram difference means a closer match.
    if tri('fr') - sample > tri('en') - sample:
        lang = u'en'
    else:
        lang = u'fr'
    entry.xml_set_attribute((u'xml:lang', XML_NS), lang)
|
||||||
|
|
||||||
|
def main():
    """Read an Atom feed from stdin, language-tag it, write it to stdout.

    Only entries that do not already declare an xml:lang attribute are
    guessed; existing language tags are left untouched.
    """
    feed = amara.parse(stdin, prefixes=ATOM_NSS)
    untagged = feed.xml_xpath(u'//atom:entry[not(@xml:lang)]')
    for entry in untagged:
        guess_language(entry)
    feed.xml(stdout)


if __name__ == '__main__':
    main()
|
25
examples/filters/guess-language/learn-language.py
Executable file
25
examples/filters/guess-language/learn-language.py
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""A filter to guess languages.
|
||||||
|
|
||||||
|
This utility saves a Trigram object on file.
|
||||||
|
|
||||||
|
(See the README file for more details).
|
||||||
|
|
||||||
|
Requires Python 2.1, recommends 2.4.
|
||||||
|
"""
|
||||||
|
__authors__ = [ "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||||
|
__license__ = "Python"
|
||||||
|
|
||||||
|
from trigram import Trigram
|
||||||
|
from sys import argv
|
||||||
|
from cPickle import dump
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Learn a language: build a Trigram from argv[1] and pickle it.

    argv[1] is a representative text (local file path or URL); argv[2]
    is the output data file, e.g. 'fi.data' for Finnish.
    """
    # Fail with a usage message instead of a raw IndexError.
    if len(argv) != 3:
        raise SystemExit(
            'Usage: learn-language.py <text file or url> <language data file>')
    tri = Trigram(argv[1])
    out = open(argv[2], 'w')
    try:
        # Close the output file even if dump() fails part-way
        # (the original leaked the handle on a pickling error).
        dump(tri, out)
    finally:
        out.close()


if __name__ == '__main__':
    main()
|
188
examples/filters/guess-language/trigram.py
Normal file
188
examples/filters/guess-language/trigram.py
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
"""
|
||||||
|
This class is based on the Python recipe titled
|
||||||
|
"Language detection using character trigrams"
|
||||||
|
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576
|
||||||
|
by Douglas Bagnall.
|
||||||
|
It has been (slightly) adapted by Eric van der Vlist to support
|
||||||
|
Unicode and accept a method to parse strings.
|
||||||
|
"""
|
||||||
|
__authors__ = [ "Douglas Bagnall", "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||||
|
__license__ = "Python"
|
||||||
|
|
||||||
|
import random
|
||||||
|
from urllib import urlopen
|
||||||
|
|
||||||
|
class Trigram:
    """
    From one or more text files, the frequency of three character
    sequences is calculated. When treated as a vector, this information
    can be compared to other trigrams, and the difference between them
    seen as an angle. The cosine of this angle varies between 1 for
    complete similarity, and 0 for utter difference. Since letter
    combinations are characteristic to a language, this can be used to
    determine the language of a body of text. For example:

    >>> reference_en = Trigram('/path/to/reference/text/english')
    >>> reference_de = Trigram('/path/to/reference/text/german')
    >>> unknown = Trigram('url://pointing/to/unknown/text')
    >>> unknown.similarity(reference_de)
    0.4
    >>> unknown.similarity(reference_en)
    0.95

    would indicate the unknown text is almost certainly English. As
    syntax sugar, the minus sign is overloaded to return the difference
    between texts, so the above objects would give you:

    >>> unknown - reference_de
    0.6
    >>> reference_en - unknown    # order doesn't matter.
    0.05

    As it stands, the Trigram ignores character set information, which
    means you can only accurately compare within a single encoding
    (iso-8859-1 in the examples). A more complete implementation might
    convert to unicode first.

    As an extra bonus, there is a method to make up nonsense words in the
    style of the Trigram's text.

    >>> reference_en.makeWords(30)
    My withillonquiver and ald, by now wittlectionsurper, may sequia,
    tory, I ad my notter. Marriusbabilly She lady for rachalle spen
    hat knong al elf

    Beware when using urls: HTML won't be parsed out.

    Most methods chatter away to standard output, to let you know they're
    still there.
    """

    # Euclidean length of the trigram frequency vector; recomputed by
    # measure() after every parse.
    length = 0

    def __init__(self, fn=None):
        # lut maps a two-character context to {following_char: count}.
        self.lut = {}
        if fn is not None:
            self.parseFile(fn)

    def _parseAFragment(self, line, pair='  '):
        """Count the trigrams of *line*, starting from context *pair*.

        Returns the trailing two-character context so that a caller could
        chain consecutive fragments together.
        """
        for letter in line:
            d = self.lut.setdefault(pair, {})
            d[letter] = d.get(letter, 0) + 1
            pair = pair[1] + letter
        return pair

    def parseString(self, string):
        """Parse a whole string as one fragment and update the length."""
        self._parseAFragment(string)
        self.measure()

    def parseFile(self, fn, encoding="iso-8859-1"):
        """Parse a local file, or a URL if *fn* contains '://'."""
        if '://' in fn:
            #print "trying to fetch url, may take time..."
            f = urlopen(fn)
        else:
            f = open(fn)
        for z, line in enumerate(f):
            #if not z % 1000:
            #    print "line %s" % z
            # \n's are spurious in a prose context; each line is parsed as
            # an independent fragment, so trigrams never span line breaks.
            # (The return value — the trailing context — is deliberately
            # ignored; the original bound it to an unused local.)
            self._parseAFragment(line.strip().decode(encoding) + ' ')
        f.close()
        self.measure()

    def measure(self):
        """calculates the scalar length of the trigram vector and
        stores it in self.length."""
        total = 0
        for y in self.lut.values():
            total += sum([x * x for x in y.values()])
        self.length = total ** 0.5

    def similarity(self, other):
        """returns a number between 0 and 1 indicating similarity.
        1 means an identical ratio of trigrams;
        0 means no trigrams in common.
        """
        if not isinstance(other, Trigram):
            raise TypeError("can't compare Trigram with non-Trigram")
        if self.length == 0 or other.length == 0:
            # An unparsed (empty) trigram has nothing in common with
            # anything; the original divided by zero here.
            return 0.0
        lut1 = self.lut
        lut2 = other.lut
        total = 0
        for k in lut1.keys():
            if k in lut2:
                a = lut1[k]
                b = lut2[k]
                for x in a:
                    if x in b:
                        total += a[x] * b[x]
        return float(total) / (self.length * other.length)

    def __sub__(self, other):
        """indicates difference between trigram sets; 1 is entirely
        different, 0 is entirely the same."""
        return 1 - self.similarity(other)

    def makeWords(self, count):
        """returns a string of made-up words based on the known text."""
        text = []
        k = '  '
        while count:
            n = self.likely(k)
            text.append(n)
            k = k[1] + n
            if n in ' \t':
                count -= 1
        return ''.join(text)

    def likely(self, k):
        """Returns a character likely to follow the given two
        character string, or a space if nothing is found."""
        if k not in self.lut:
            return ' '
        # if you were using this a lot, caching would be a good idea.
        # (Renamed the loop variable: the original shadowed the k param.)
        letters = []
        for letter, freq in self.lut[k].items():
            letters.append(letter * freq)
        letters = ''.join(letters)
        return random.choice(letters)
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """Download reference corpora and print pairwise trigram differences."""
    # Reference texts fetched from Project Gutenberg over HTTP.
    en = Trigram('http://gutenberg.net/dirs/etext97/lsusn11.txt')
    #NB fr and some others have English license text.
    # no has english excerpts.
    fr = Trigram('http://gutenberg.net/dirs/etext03/candi10.txt')
    fi = Trigram('http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt')
    no = Trigram('http://gutenberg.net/dirs/1/2/8/4/12844/12844-8.txt')
    se = Trigram('http://gutenberg.net/dirs/1/0/1/1/10117/10117-8.txt')
    no2 = Trigram('http://gutenberg.net/dirs/1/3/0/4/13041/13041-8.txt')
    en2 = Trigram('http://gutenberg.net/dirs/etext05/cfgsh10.txt')
    fr2 = Trigram('http://gutenberg.net/dirs/1/3/7/0/13704/13704-8.txt')
    # Same pairs, in the same order, as the original hand-written prints.
    comparisons = [
        ('en - fr', en - fr),
        ('fr - en', fr - en),
        ('en - en2', en - en2),
        ('en - fr2', en - fr2),
        ('fr - en2', fr - en2),
        ('fr - fr2', fr - fr2),
        ('fr2 - en2', fr2 - en2),
        ('fi - fr', fi - fr),
        ('fi - en', fi - en),
        ('fi - se', fi - se),
        ('no - se', no - se),
        ('en - no', en - no),
        ('no - no2', no - no2),
        ('se - no2', se - no2),
        ('en - no2', en - no2),
        ('fr - no2', fr - no2),
    ]
    print("calculating difference:")
    for label, diff in comparisons:
        print("%s is %s" % (label, diff))


if __name__ == '__main__':
    test()
|
Loading…
Reference in New Issue
Block a user