Filters to add languages, categories
This commit is contained in:
commit
9ee9adbe5d
1
THANKS
1
THANKS
@ -6,6 +6,7 @@ Michael Koziarski - HTTP Auth fix
|
||||
Brian Ewins - Win32 / Portalocker
|
||||
Joe Gregorio - Invoke same version of Python for filters
|
||||
Harry Fuecks - Pipe characters in file names
|
||||
Eric van der Vlist - Filters to add language, category information
|
||||
|
||||
This codebase represents a radical refactoring of Planet 2.0, which lists
|
||||
the following contributors:
|
||||
|
82
examples/filters/categories/categories.xslt
Normal file
82
examples/filters/categories/categories.xslt
Normal file
@ -0,0 +1,82 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE xsl:stylesheet [
<!ENTITY categoryTerm "WebSemantique">
]>
<!--

This transformation is released under the same licence as Python
see http://www.intertwingly.net/code/venus/LICENCE.

Author: Eric van der Vlist <vdv@dyomedea.com>

This transformation is meant to be used as a filter that determines if
Atom entries are relevant to a specific topic and adds the corresponding
<category/> element when it is the case.

This is done by a simple keyword matching mechanism.

To customize this filter to your needs:

1) Replace WebSemantique by your own category name in the definition of
the categoryTerm entity above.
2) Review the "upper" and "lower" variables that are used to convert text
nodes to lower case and replace common punctuation signs with spaces,
to check that they meet your needs.
3) Define your own list of keywords in <d:keyword/> elements. Note that
the leading and trailing spaces are significant: "> rdf <" will match rdf
as an entire word while ">rdf<" would match the substring "rdf" and
"> rdf<" would match words starting with rdf. Also note that the test is
done after conversion to lowercase.

To use it with venus, just add this filter to the list of filters, for
instance:

filters= categories.xslt guess-language/guess-language.py

-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:atom="http://www.w3.org/2005/Atom" xmlns="http://www.w3.org/2005/Atom"
    xmlns:d="http://ns.websemantique.org/data/" exclude-result-prefixes="d atom" version="1.0">
    <!-- Character tables for translate(): each character of $upper maps
         one-for-one onto the character at the same position in $lower,
         lowercasing ASCII and Latin-1 letters and turning punctuation
         into spaces.  NOTE(review): the two strings must be exactly the
         same length; whitespace appears collapsed in this copy - the
         ",.;" prefix of $upper should presumably map to three leading
         spaces in $lower.  Confirm against the upstream file. -->
    <xsl:variable name="upper"
        >,.;AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZzÀàÁáÂâÃãÄäÅ寿ÇçÈèÉéÊêËëÌìÍíÎîÏïÐðÑñÒòÓóÔôÕõÖöØøÙùÚúÛûÜüÝýÞþ</xsl:variable>
    <xsl:variable name="lower"
        > aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzzaaaaaaaaaaaaææcceeeeeeeeiiiiiiiiððnnooooooooooøøuuuuuuuuyyþþ</xsl:variable>
    <!-- Keyword list: an entry is tagged when its lowercased text
         contains any of these strings.  Leading/trailing spaces control
         whole-word vs prefix/substring matching (see header comment). -->
    <d:keywords>
        <d:keyword> wiki semantique </d:keyword>
        <d:keyword> wikis semantiques </d:keyword>
        <d:keyword> web semantique </d:keyword>
        <d:keyword> websemantique </d:keyword>
        <d:keyword> semantic web</d:keyword>
        <d:keyword> semweb</d:keyword>
        <d:keyword> rdf</d:keyword>
        <d:keyword> owl </d:keyword>
        <d:keyword> sparql </d:keyword>
        <d:keyword> topic map</d:keyword>
        <d:keyword> doap </d:keyword>
        <d:keyword> foaf </d:keyword>
        <d:keyword> sioc </d:keyword>
        <d:keyword> ontology </d:keyword>
        <d:keyword> ontologie</d:keyword>
        <d:keyword> dublin core </d:keyword>
    </d:keywords>
    <!-- Identity transform: copy everything else through unchanged. -->
    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>
    <!-- Hook on atom:updated (exactly one per entry): after copying it,
         build a lowercased concatenation of the entry's text and emit
         the <category/> element when any keyword matches. -->
    <xsl:template match="atom:entry/atom:updated">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
        <xsl:variable name="concatenatedText">
            <xsl:for-each select="../atom:title|../atom:summary|../atom:content|../atom:category/@term">
                <xsl:text> </xsl:text>
                <xsl:value-of select="translate(., $upper, $lower)"/>
            </xsl:for-each>
            <xsl:text> </xsl:text>
        </xsl:variable>
        <!-- document('') re-reads this very stylesheet to fetch the
             d:keywords list declared above.
             NOTE(review): term is hard-coded here rather than reusing
             the categoryTerm entity - keep the two in sync. -->
        <xsl:if test="document('')/*/d:keywords/d:keyword[contains($concatenatedText, .)]">
            <category term="WebSemantique"/>
        </xsl:if>
    </xsl:template>
    <!-- Drop any pre-existing copy of this category so it is not
         duplicated when the filter runs again on its own output. -->
    <xsl:template match="atom:category[@term='&categoryTerm;']"/>
</xsl:stylesheet>
|
37
examples/filters/guess-language/README
Normal file
37
examples/filters/guess-language/README
Normal file
@ -0,0 +1,37 @@
|
||||
This filter is released under the same licence as Python
|
||||
see http://www.intertwingly.net/code/venus/LICENCE.
|
||||
|
||||
Author: Eric van der Vlist <vdv@dyomedea.com>
|
||||
|
||||
This filter guesses whether an Atom entry is written
|
||||
in English or French. It should be trivial to choose between
|
||||
two other languages, easy to extend to more than two languages
|
||||
and useful to pass these languages as Venus configuration
|
||||
parameters.
|
||||
|
||||
The code used to guess the language is the one that has been
|
||||
described by Douglas Bagnall as the Python recipe titled
|
||||
"Language detection using character trigrams"
|
||||
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576.
|
||||
|
||||
To add support for a new language, this language must first be
|
||||
"learned" using learn-language.py. This learning phase is nothing
|
||||
more than saving a pickled version of the Trigram object for this
|
||||
language.
|
||||
|
||||
To learn Finnish, you would execute:
|
||||
|
||||
$ ./learn-language.py http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt fi.data
|
||||
|
||||
where http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt is a text
|
||||
representative of the Finnish language and "fi.data" is the name of the
|
||||
data file for "fi" (ISO code for Finnish).
|
||||
|
||||
To install this filter, copy this directory under the Venus
|
||||
filter directory and declare it in your filters list, for instance:
|
||||
|
||||
filters= categories.xslt guess-language/guess-language.py
|
||||
|
||||
NOTE: this filter depends on Amara
|
||||
(http://uche.ogbuji.net/tech/4suite/amara/)
|
||||
|
15131
examples/filters/guess-language/en.data
Normal file
15131
examples/filters/guess-language/en.data
Normal file
File diff suppressed because it is too large
Load Diff
22710
examples/filters/guess-language/fr.data
Normal file
22710
examples/filters/guess-language/fr.data
Normal file
File diff suppressed because it is too large
Load Diff
58
examples/filters/guess-language/guess-language.py
Normal file
58
examples/filters/guess-language/guess-language.py
Normal file
@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python
"""A filter to guess languages.

This filter guesses whether an Atom entry is written
in English or French. It should be trivial to choose between
two other languages, easy to extend to more than two languages
and useful to pass these languages as Venus configuration
parameters.

(See the README file for more details).

Requires Python 2.1, recommends 2.4.
"""
|
||||
__authors__ = [ "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||
__license__ = "Python"
|
||||
|
||||
import amara
|
||||
from sys import stdin, stdout
|
||||
from trigram import Trigram
|
||||
from xml.dom import XML_NAMESPACE as XML_NS
|
||||
import cPickle
|
||||
|
||||
# Namespace prefixes used by the XPath expressions below.
ATOM_NSS = {
    u'atom': u'http://www.w3.org/2005/Atom',
    u'xml': XML_NS
}

# Lazily-populated cache of language code -> reference Trigram,
# filled by tri() on first use of each language.
langs = {}
||||
|
||||
def tri(lang):
    """Return the reference Trigram for *lang* (e.g. 'en', 'fr').

    The pickled profile is loaded from
    filters/guess-language/<lang>.data (relative to the Venus working
    directory) on first use and cached in the module-level ``langs``
    dict for subsequent calls.
    """
    if lang not in langs:  # 'in' instead of deprecated dict.has_key()
        # Pickles are binary data: open with 'rb' rather than 'r' so the
        # file is not mangled by newline translation on Windows.
        f = open('filters/guess-language/%s.data' % lang, 'rb')
        try:
            langs[lang] = cPickle.load(f)
        finally:
            # close the file even if unpickling raises
            f.close()
    return langs[lang]
|
||||
|
||||
|
||||
def guess_language(entry):
    """Guess whether *entry* (an amara Atom entry element) is written in
    English or French and record the verdict as an xml:lang attribute
    on the entry."""
    # Concatenate all the human-readable text of the entry.
    text = u'';
    for child in entry.xml_xpath(u'atom:title|atom:summary|atom:content'):
        text = text + u' '+ child.__unicode__()
    t = Trigram()
    t.parseString(text)
    # Trigram.__sub__ yields a difference (0 = identical, 1 = unrelated):
    # pick the reference language whose profile is closest to the text.
    if tri('fr') - t > tri('en') - t:
        lang=u'en'
    else:
        lang=u'fr'
    entry.xml_set_attribute((u'xml:lang', XML_NS), lang)
||||
|
||||
def main():
    """Filter entry point: read an Atom feed from stdin, guess a
    language for every entry that lacks an explicit xml:lang, and write
    the (possibly modified) feed to stdout."""
    feed = amara.parse(stdin, prefixes=ATOM_NSS)
    # Entries that already declare a language are left untouched.
    for entry in feed.xml_xpath(u'//atom:entry[not(@xml:lang)]'):
        guess_language(entry)
    feed.xml(stdout)

# Run as a Venus stdin/stdout filter.
if __name__ == '__main__':
    main()
|
25
examples/filters/guess-language/learn-language.py
Executable file
25
examples/filters/guess-language/learn-language.py
Executable file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env python
"""A filter to guess languages.

This utility saves a Trigram object on file, "learning" a language so
that guess-language.py can recognise it later.

(See the README file for more details).

Requires Python 2.1, recommends 2.4.
"""
|
||||
__authors__ = [ "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||
__license__ = "Python"
|
||||
|
||||
from trigram import Trigram
|
||||
from sys import argv
|
||||
from cPickle import dump
|
||||
|
||||
|
||||
def main():
    """Build a Trigram from the text at argv[1] (file path or URL) and
    pickle it to the file named by argv[2]."""
    tri = Trigram(argv[1])
    # Pickles are binary data: open with 'wb' rather than 'w' so the
    # output is not corrupted by newline translation on Windows.
    out = open(argv[2], 'wb')
    try:
        dump(tri, out)
    finally:
        # close the file even if pickling raises
        out.close()

if __name__ == '__main__':
    main()
|
188
examples/filters/guess-language/trigram.py
Normal file
188
examples/filters/guess-language/trigram.py
Normal file
@ -0,0 +1,188 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
This class is based on the Python recipe titled
|
||||
"Language detection using character trigrams"
|
||||
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576
|
||||
by Douglas Bagnall.
|
||||
It has been (slightly) adapted by Eric van der Vlist to support
|
||||
Unicode and accept a method to parse strings.
|
||||
"""
|
||||
__authors__ = [ "Douglas Bagnall", "Eric van der Vlist <vdv@dyomedea.com>"]
|
||||
__license__ = "Python"
|
||||
|
||||
import random
|
||||
from urllib import urlopen
|
||||
|
||||
class Trigram:
    """
    From one or more text files, the frequency of three character
    sequences is calculated. When treated as a vector, this information
    can be compared to other trigrams, and the difference between them
    seen as an angle. The cosine of this angle varies between 1 for
    complete similarity, and 0 for utter difference. Since letter
    combinations are characteristic to a language, this can be used to
    determine the language of a body of text. For example:

    >>> reference_en = Trigram('/path/to/reference/text/english')
    >>> reference_de = Trigram('/path/to/reference/text/german')
    >>> unknown = Trigram('url://pointing/to/unknown/text')
    >>> unknown.similarity(reference_de)
    0.4
    >>> unknown.similarity(reference_en)
    0.95

    would indicate the unknown text is almost certainly English. As
    syntax sugar, the minus sign is overloaded to return the difference
    between texts, so the above objects would give you:

    >>> unknown - reference_de
    0.6
    >>> reference_en - unknown # order doesn't matter.
    0.05

    As it stands, the Trigram ignores character set information, which
    means you can only accurately compare within a single encoding
    (iso-8859-1 in the examples). A more complete implementation might
    convert to unicode first.

    As an extra bonus, there is a method to make up nonsense words in the
    style of the Trigram's text.

    >>> reference_en.makeWords(30)
    My withillonquiver and ald, by now wittlectionsurper, may sequia,
    tory, I ad my notter. Marriusbabilly She lady for rachalle spen
    hat knong al elf

    Beware when using urls: HTML won't be parsed out.

    Most methods chatter away to standard output, to let you know they're
    still there.
    """

    # Scalar (Euclidean) length of the trigram count vector; class-level
    # default, overwritten per instance by measure().
    length = 0

    def __init__(self, fn=None):
        """Optionally parse *fn* (a file path or URL) straight away."""
        # lut maps a two-character pair -> {following_char: count}.
        self.lut = {}
        if fn is not None:
            self.parseFile(fn)

    # NOTE(review): the default *pair* looks like it should be a
    # two-character seed ('  ', two spaces) -- pair[1] below assumes
    # len(pair) == 2.  Whitespace may have been collapsed in this copy
    # of the file; confirm against the upstream source.
    def _parseAFragment(self, line, pair=' '):
        """Accumulate trigram counts for *line*, seeded with *pair*;
        return the final two-character window so that successive
        fragments can be chained."""
        for letter in line:
            d = self.lut.setdefault(pair, {})
            d[letter] = d.get(letter, 0) + 1
            # Slide the window: drop the oldest character, append the new.
            pair = pair[1] + letter
        return pair

    def parseString(self, string):
        """Add *string*'s trigrams and refresh the cached vector length."""
        self._parseAFragment(string)
        self.measure()

    def parseFile(self, fn, encoding="iso-8859-1"):
        """Parse a local file or a URL line by line, decoding each line
        with *encoding* before counting trigrams."""
        pair = ' '  # NOTE(review): likely '  ' upstream -- see above.
        if '://' in fn:
            #print "trying to fetch url, may take time..."
            f = urlopen(fn)
        else:
            f = open(fn)
        for z, line in enumerate(f):
            #if not z % 1000:
            #    print "line %s" % z
            # \n's are spurious in a prose context
            pair = self._parseAFragment(line.strip().decode(encoding) + ' ')
        f.close()
        self.measure()


    def measure(self):
        """calculates the scalar length of the trigram vector and
        stores it in self.length."""
        total = 0
        for y in self.lut.values():
            total += sum([ x * x for x in y.values() ])
        self.length = total ** 0.5

    def similarity(self, other):
        """returns a number between 0 and 1 indicating similarity.
        1 means an identical ratio of trigrams;
        0 means no trigrams in common.
        """
        if not isinstance(other, Trigram):
            raise TypeError("can't compare Trigram with non-Trigram")
        lut1 = self.lut
        lut2 = other.lut
        total = 0
        # Dot product of the two sparse trigram count vectors.
        for k in lut1.keys():
            if k in lut2:
                a = lut1[k]
                b = lut2[k]
                for x in a:
                    if x in b:
                        total += a[x] * b[x]

        # Cosine of the angle between the two vectors.
        return float(total) / (self.length * other.length)

    def __sub__(self, other):
        """indicates difference between trigram sets; 1 is entirely
        different, 0 is entirely the same."""
        return 1 - self.similarity(other)


    def makeWords(self, count):
        """returns a string of made-up words based on the known text."""
        text = []
        k = ' '  # NOTE(review): likely '  ' upstream -- see above.
        while count:
            n = self.likely(k)
            text.append(n)
            k = k[1] + n
            # A produced space/tab ends a word; count words, not chars.
            if n in ' \t':
                count -= 1
        return ''.join(text)


    def likely(self, k):
        """Returns a character likely to follow the given string
        two character string, or a space if nothing is found."""
        if k not in self.lut:
            return ' '
        # if you were using this a lot, caching would be a good idea.
        letters = []
        # Repeat each candidate by its count so random.choice picks
        # proportionally to observed frequency.
        for k, v in self.lut[k].items():
            letters.append(k * v)
        letters = ''.join(letters)
        return random.choice(letters)
|
||||
|
||||
|
||||
def test():
    """Fetch reference texts from Project Gutenberg and print the
    pairwise differences between their trigram profiles.

    Needs network access; uses Python 2 print statements.  Smaller
    numbers mean more similar texts, so same-language pairs (en - en2,
    fr - fr2, no - no2) should come out lowest."""
    en = Trigram('http://gutenberg.net/dirs/etext97/lsusn11.txt')
    #NB fr and some others have English license text.
    # no has english excerpts.
    fr = Trigram('http://gutenberg.net/dirs/etext03/candi10.txt')
    fi = Trigram('http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt')
    no = Trigram('http://gutenberg.net/dirs/1/2/8/4/12844/12844-8.txt')
    se = Trigram('http://gutenberg.net/dirs/1/0/1/1/10117/10117-8.txt')
    no2 = Trigram('http://gutenberg.net/dirs/1/3/0/4/13041/13041-8.txt')
    en2 = Trigram('http://gutenberg.net/dirs/etext05/cfgsh10.txt')
    fr2 = Trigram('http://gutenberg.net/dirs/1/3/7/0/13704/13704-8.txt')
    print "calculating difference:"
    print "en - fr is %s" % (en - fr)
    print "fr - en is %s" % (fr - en)
    print "en - en2 is %s" % (en - en2)
    print "en - fr2 is %s" % (en - fr2)
    print "fr - en2 is %s" % (fr - en2)
    print "fr - fr2 is %s" % (fr - fr2)
    print "fr2 - en2 is %s" % (fr2 - en2)
    print "fi - fr is %s" % (fi - fr)
    print "fi - en is %s" % (fi - en)
    print "fi - se is %s" % (fi - se)
    print "no - se is %s" % (no - se)
    print "en - no is %s" % (en - no)
    print "no - no2 is %s" % (no - no2)
    print "se - no2 is %s" % (se - no2)
    print "en - no2 is %s" % (en - no2)
    print "fr - no2 is %s" % (fr - no2)


if __name__ == '__main__':
    test()
|
Loading…
Reference in New Issue
Block a user