You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
316 lines
20 KiB
316 lines
20 KiB
4 years ago
|
"""
|
||
|
sphinx.search.de
|
||
|
~~~~~~~~~~~~~~~~
|
||
|
|
||
|
German search language: includes the JS German stemmer.
|
||
|
|
||
|
:copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
|
||
|
:license: BSD, see LICENSE for details.
|
||
|
"""
|
||
|
|
||
|
from typing import Dict
|
||
|
|
||
|
import snowballstemmer
|
||
|
|
||
|
from sphinx.search import SearchLanguage, parse_stop_word
|
||
|
|
||
|
# German stop words, taken from the Snowball project's stop list (see the
# ``source`` line inside the literal).  One word per line; the text after a
# ``|`` is an English gloss carried over from that list.
# NOTE(review): this relies on parse_stop_word() discarding everything from
# ``|`` to the end of each line -- confirm against its implementation.
german_stopwords = parse_stop_word('''
|source: http://snowball.tartarus.org/algorithms/german/stop.txt
aber | but

alle | all
allem
allen
aller
alles

als | than, as
also | so
am | an + dem
an | at

ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders

auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then

der | the
den
des
dem
die
das

daß | that

derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe

dazu | to that

dein | thy
deine
deinem
deinen
deiner
deines

denn | because

derer | of those
dessen | of him

dich | thee
dir | to thee
du | thou

dies | this
diese
diesem
diesen
dieser
dieses


doch | (several meanings)
dort | (over) there


durch | through

ein | a
eine
einem
einen
einer
eines

einig | some
einige
einigem
einigen
einiger
einiges

einmal | once

er | he
ihn | him
ihm | to him

es | it
etwas | something

euer | your
eure
eurem
euren
eurer
eures

für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind

ich | I
mich | me
mir | to me


ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you

im | in + dem
in | in
indem | while
ins | in + das
ist | is

jede | each, every
jedem
jeden
jeder
jedes

jene | that
jenem
jenen
jener
jenes

jetzt | now
kann | can

kein | no
keine
keinem
keinen
keiner
keines

können | can
könnte | could
machen | do
man | one

manche | some, many a
manchem
manchen
mancher
manches

mein | my
meine
meinem
meinen
meiner
meines

mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very

sein | his
seine
seinem
seinen
seiner
seines

selbst | self
sich | herself

sie | they, she
ihnen | to them

sind | are
so | so

solche | such
solchem
solchen
solcher
solches

soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and

uns | us
unse
unsem
unsen
unser
unses

unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further

welche | which
welchem
welchen
welcher
welches

wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
''')
|
||
|
|
||
|
js_stemmer = """
|
||
|
var JSX={};(function(j){function l(b,e){var a=function(){};a.prototype=e.prototype;var c=new a;for(var d in b){b[d].prototype=c}}function H(c,b){for(var a in b.prototype)if(b.prototype.hasOwnProperty(a))c.prototype[a]=b.prototype[a]}function g(a,b,d){function c(a,b,c){delete a[b];a[b]=c;return c}Object.defineProperty(a,b,{get:function(){return c(a,b,d())},set:function(d){c(a,b,d)},enumerable:true,configurable:true})}function I(a,b,c){return a[b]=a[b]/c|0}var C=parseInt;var r=parseFloat;function J(a){return a!==a}var z=isFinite;var y=encodeURIComponent;var x=decodeURIComponent;var w=encodeURI;var u=decodeURI;var t=Object.prototype.toString;var B=Object.prototype.hasOwnProperty;function i(){}j.require=function(b){var a=q[b];return a!==undefined?a:null};j.profilerIsRunning=function(){return i.getResults!=null};j.getProfileResults=function(){return(i.getResults||function(){return{}})()};j.postProfileResults=function(a,b){if(i.postResults==null)throw new Error('profiler has not been turned on');return i.postResults(a,b)};j.resetProfileResults=function(){if(i.resetResults==null)throw new Error('profiler has not been turned on');return i.resetResults()};j.DEBUG=false;function s(){};l([s],Error);function c(a,b,c){this.F=a.length;this.K=a;this.L=b;this.I=c;this.H=null;this.P=null};l([c],Object);function o(){};l([o],Object);function e(){var a;var b;var c;this.G={};a=this.D='';b=this._=0;c=this.A=a.length;this.E=0;this.C=b;this.B=c};l([e],o);function v(a,b){a.D=b.D;a._=b._;a.A=b.A;a.E=b.E;a.C=b.C;a.B=b.B};function f(b,d,c,e){var a;if(b._>=b.A){return false}a=b.D.charCodeAt(b._);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._++;return true};function m(b,d,c,e){var a;if(b._<=b.E){return false}a=b.D.charCodeAt(b._-1);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._--;return true};function n(a,d,c,e){var b;if(a._>=a.A){return false}b=a.D.charCodeAt(a._);if(b>e||b<c){a._++;return 
true}b-=c;if((d[b>>>3]&1<<(b&7))===0){a._++;return true}return false};function k(a,b,d){var c;if(a.A-a._<b){return false}if(a.D.slice(c=a._,c+b)!==d){return false}a._+=b;return true};function d(a,b,d){var c;if(a._-a.E<b){return false}if(a.D.slice((c=a._)-b,c)!==d){return false}a._-=b;return true};function p(f,m,p){var b;var d;var e;var n;var g;var k;var l;var i;var h;var c;var a;var j;var o;b=0;d=p;e=f._;n=f.A;g=0;k=0;l=false;while(true){i=b+(d-b>>>1);h=0;c=g<k?g:k;a=m[i];for(j=c;j<a.F;j++){if(e+c===n){h=-1;break}h=f.D.charCodeAt(e+c)-a.K.charCodeAt(j);if(h!==0){break}c++}if(h<0){d=i;k=c}else{b=i;g=c}if(d-b<=1){if(b>0){break}if(d===b){break}if(l){break}l=true}}while(true){a=m[b];if(g>=a.F){f._=e+a.F|0;if(a.H==null){return a.I}o=a.H(a.P);f._=e+a.F|0;if(o){return a.I}}b=a.L;if(b<0){return 0}}return-1};function h(d,m,p){var b;var g;var e;var n;var f;var k;var l;var i;var h;var c;var a;var j;var o;b=0;g=p;e=d._;n=d.E;f=0;k=0;l=false;while(true){i=b+(g-b>>1);h=0;c=f<k?f:k;a=m[i];for(j=a.F-1-c;j>=0;j--){if(e-c===n){h=-1;break}h=d.D.charCodeAt(e-1-c)-a.K.charCodeAt(j);if(h!==0){break}c++}if(h<0){g=i;k=c}else{b=i;f=c}if(g-b<=1){if(b>0){break}if(g===b){break}if(l){break}l=true}}while(true){a=m[b];if(f>=a.F){d._=e-a.F|0;if(a.H==null){return a.I}o=a.H(d);d._=e-a.F|0;if(o){return a.I}}b=a.L;if(b<0){return 0}}return-1};function D(a,b,d,e){var c;c=e.length-(d-b);a.D=a.D.slice(0,b)+e+a.D.slice(d);a.A+=c|0;if(a._>=d){a._+=c|0}else if(a._>b){a._=b}return c|0};function b(a,f){var b;var c;var d;var e;b=false;if((c=a.C)<0||c>(d=a.B)||d>(e=a.A)||e>a.D.length?false:true){D(a,a.C,a.B,f);b=true}return b};e.prototype.J=function(){return false};e.prototype.W=function(b){var a;var c;var d;var e;a=this.G['.'+b];if(a==null){c=this.D=b;d=this._=0;e=this.A=c.length;this.E=0;this.C=d;this.B=e;this.J();a=this.D;this.G['.'+b]=a}return a};e.prototype.stemWord=e.prototype.W;e.prototype.X=function(e){var d;var b;var c;var a;var f;var g;var 
h;d=[];for(b=0;b<e.length;b++){c=e[b];a=this.G['.'+c];if(a==null){f=this.D=c;g=this._=0;h=this.A=f.length;this.E=0;this.C=g;this.B=h;this.J();a=this.D;this.G['.'+c]=a}d.push(a)}return d};e.prototype.stemWor
|
||
|
var Stemmer = JSX.require("src/german-stemmer.jsx").GermanStemmer;
|
||
|
"""
|
||
|
|
||
|
|
||
|
class SearchGerman(SearchLanguage):
    """Search-language definition for German (language code ``de``).

    The class attributes bind the module-level stop-word collection and the
    JavaScript stemmer source; stemming on the Python side is delegated to
    the ``snowballstemmer`` package.
    """

    # Language code and display name.
    lang = 'de'
    language_name = 'German'

    # Stop words produced by parse_stop_word() at module import time.
    stopwords = german_stopwords

    # JavaScript stemmer: the inline source held in this module, plus what is
    # presumably the file name of the stand-alone script -- TODO confirm how
    # the base class consumes ``js_stemmer_rawcode``.
    js_stemmer_code = js_stemmer
    js_stemmer_rawcode = 'german-stemmer.js'

    def init(self, options: Dict) -> None:
        """Create the German Snowball stemmer and keep it on the instance.

        *options* is accepted but not read by this implementation.
        """
        german = snowballstemmer.stemmer('german')
        self.stemmer = german

    def stem(self, word: str) -> str:
        """Return the stem of *word*, which is lower-cased before stemming."""
        lowered = word.lower()
        return self.stemmer.stemWord(lowered)
|