You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
14 KiB
133 lines
14 KiB
4 years ago
|
"""
|
||
|
sphinx.search.da
|
||
|
~~~~~~~~~~~~~~~~
|
||
|
|
||
|
Danish search language: includes the JS Danish stemmer.
|
||
|
|
||
|
:copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
|
||
|
:license: BSD, see LICENSE for details.
|
||
|
"""
|
||
|
|
||
|
from typing import Dict
|
||
|
|
||
|
import snowballstemmer
|
||
|
|
||
|
from sphinx.search import SearchLanguage, parse_stop_word
|
||
|
|
||
|
# Danish stop words, taken from the Snowball project's stop list.
# Each line of the literal is ``word | English gloss``.
# NOTE(review): this relies on ``parse_stop_word`` discarding everything from
# the first ``|`` onward and collecting the remaining whitespace-separated
# words -- confirm against ``sphinx.search``.  Under that rule the bare ``|`` /
# ``||`` separator lines that had crept into this literal contributed no
# words, so dropping them leaves the resulting set unchanged.
danish_stopwords = parse_stop_word('''
| source: http://snowball.tartarus.org/algorithms/danish/stop.txt
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
''')
|
||
|
|
||
|
# JavaScript source that ``SearchDanish.js_stemmer_code`` is set to below: a
# minified script followed by a final line binding ``Stemmer`` to the
# ``DanishStemmer`` export of "src/danish-stemmer.jsx".
# NOTE(review): the minified text visible here stops mid-statement
# (``...this.I_p1=j=this.A;g=``) and the literal contains bare ``|`` / ``||``
# lines, so it looks truncated and polluted by an extraction step -- restore
# the full script from the upstream file before relying on it.
# NOTE(review): two line breaks fall directly after ``return`` and
# ``function``; a newline right after ``return`` terminates the statement in
# JavaScript, so these breaks are presumably artifacts as well -- confirm
# against upstream.
js_stemmer = """
|
||
|
var JSX={};(function(g){function j(b,e){var a=function(){};a.prototype=e.prototype;var c=new a;for(var d in b){b[d].prototype=c}}function I(c,b){for(var a in b.prototype)if(b.prototype.hasOwnProperty(a))c.prototype[a]=b.prototype[a]}function i(a,b,d){function c(a,b,c){delete a[b];a[b]=c;return c}Object.defineProperty(a,b,{get:function(){return c(a,b,d())},set:function(d){c(a,b,d)},enumerable:true,configurable:true})}function J(a,b,c){return a[b]=a[b]/c|0}var E=parseInt;var D=parseFloat;function K(a){return a!==a}var A=isFinite;var z=encodeURIComponent;var y=decodeURIComponent;var x=encodeURI;var w=decodeURI;var u=Object.prototype.toString;var C=Object.prototype.hasOwnProperty;function f(){}g.require=function(b){var a=p[b];return a!==undefined?a:null};g.profilerIsRunning=function(){return f.getResults!=null};g.getProfileResults=function(){return(f.getResults||function(){return{}})()};g.postProfileResults=function(a,b){if(f.postResults==null)throw new Error('profiler has not been turned on');return f.postResults(a,b)};g.resetProfileResults=function(){if(f.resetResults==null)throw new Error('profiler has not been turned on');return f.resetResults()};g.DEBUG=false;function t(){};j([t],Error);function b(a,b,c){this.G=a.length;this.S=a;this.V=b;this.J=c;this.I=null;this.W=null};j([b],Object);function l(){};j([l],Object);function d(){var a;var b;var c;this.F={};a=this.D='';b=this._=0;c=this.A=a.length;this.B=0;this.C=b;this.E=c};j([d],l);function v(a,b){a.D=b.D;a._=b._;a.A=b.A;a.B=b.B;a.C=b.C;a.E=b.E};function n(b,d,c,e){var a;if(b._>=b.A){return false}a=b.D.charCodeAt(b._);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._++;return true};function m(b,d,c,e){var a;if(b._<=b.B){return false}a=b.D.charCodeAt(b._-1);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._--;return true};function r(a,d,c,e){var b;if(a._>=a.A){return false}b=a.D.charCodeAt(a._);if(b>e||b<c){a._++;return 
true}b-=c;if((d[b>>>3]&1<<(b&7))===0){a._++;return true}return false};function q(a,d,c,e){var b;if(a._<=a.B){return false}b=a.D.charCodeAt(a._-1);if(b>e||b<c){a._--;return true}b-=c;if((d[b>>>3]&1<<(b&7))===0){a._--;return true}return false};function h(a,b,d){var c;if(a._-a.B<b){return false}if(a.D.slice((c=a._)-b,c)!==d){return false}a._-=b;return true};function e(d,m,p){var b;var g;var e;var n;var f;var k;var l;var i;var h;var c;var a;var j;var o;b=0;g=p;e=d._;n=d.B;f=0;k=0;l=false;while(true){i=b+(g-b>>1);h=0;c=f<k?f:k;a=m[i];for(j=a.G-1-c;j>=0;j--){if(e-c===n){h=-1;break}h=d.D.charCodeAt(e-1-c)-a.S.charCodeAt(j);if(h!==0){break}c++}if(h<0){g=i;k=c}else{b=i;f=c}if(g-b<=1){if(b>0){break}if(g===b){break}if(l){break}l=true}}while(true){a=m[b];if(f>=a.G){d._=e-a.G|0;if(a.I==null){return a.J}o=a.I(d);d._=e-a.G|0;if(o){return a.J}}b=a.V;if(b<0){return 0}}return-1};function s(a,b,d,e){var c;c=e.length-(d-b);a.D=a.D.slice(0,b)+e+a.D.slice(d);a.A+=c|0;if(a._>=d){a._+=c|0}else if(a._>b){a._=b}return c|0};function c(a,f){var b;var c;var d;var e;b=false;if((c=a.C)<0||c>(d=a.E)||d>(e=a.A)||e>a.D.length?false:true){s(a,a.C,a.E,f);b=true}return b};function o(a,f){var b;var c;var d;var e;b='';if((c=a.C)<0||c>(d=a.E)||d>(e=a.A)||e>a.D.length?false:true){b=a.D.slice(a.C,a.E)}return b};d.prototype.H=function(){return false};d.prototype.T=function(b){var a;var c;var d;var e;a=this.F['.'+b];if(a==null){c=this.D=b;d=this._=0;e=this.A=c.length;this.B=0;this.C=d;this.E=e;this.H();a=this.D;this.F['.'+b]=a}return a};d.prototype.stemWord=d.prototype.T;d.prototype.U=function(e){var d;var b;var c;var a;var f;var g;var h;d=[];for(b=0;b<e.length;b++){c=e[b];a=this.F['.'+c];if(a==null){f=this.D=c;g=this._=0;h=this.A=f.length;this.B=0;this.C=g;this.E=h;this.H();a=this.D;this.F['.'+c]=a}d.push(a)}return d};d.prototype.stemWords=d.prototype.U;function 
a(){d.call(this);this.I_x=0;this.I_p1=0;this.S_ch=''};j([a],d);a.prototype.K=function(a){this.I_x=a.I_x;this.I_p1=a.I_p1;this.S_ch=a.S_ch;v(this,a)};a.prototype.copy_from=a.prototype.K;a.prototype.P=function(){var g;var d;var b;var e;var c;var f;var i;var j;var k;var h;this.I_p1=j=this.A;g=
|
||
|
var Stemmer = JSX.require("src/danish-stemmer.jsx").DanishStemmer;
|
||
|
"""
|
||
|
|
||
|
|
||
|
class SearchDanish(SearchLanguage):
    """Search-language definition for Danish (language code ``da``).

    Bundles the module-level Danish stop-word set and JavaScript stemmer
    source, and stems words in Python with the Snowball ``danish`` stemmer.
    """

    lang = 'da'
    language_name = 'Danish'
    # File name of the stand-alone JavaScript stemmer.
    js_stemmer_rawcode = 'danish-stemmer.js'
    # Inline JavaScript stemmer source (module-level ``js_stemmer``).
    js_stemmer_code = js_stemmer
    # Stop words parsed at import time (module-level ``danish_stopwords``).
    stopwords = danish_stopwords

    def init(self, options: Dict) -> None:
        """Create the Snowball Danish stemmer used by :meth:`stem`.

        :param options: accepted for interface compatibility; not read here.
        """
        self.stemmer = snowballstemmer.stemmer('danish')

    def stem(self, word: str) -> str:
        """Return the Danish stem of *word*.

        The word is lower-cased before it is handed to the stemmer.
        """
        return self.stemmer.stemWord(word.lower())