You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
240 lines
24 KiB
240 lines
24 KiB
4 years ago
"""
    sphinx.search.hu
    ~~~~~~~~~~~~~~~~

    Hungarian search language: includes the JS Hungarian stemmer.

    :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
||
|
|
||
|
from typing import Dict
|
||
|
|
||
|
import snowballstemmer
|
||
|
|
||
|
from sphinx.search import SearchLanguage, parse_stop_word
|
||
|
|
||
|
# Hungarian stop words, one per line, handed to ``parse_stop_word``.
# The two leading "| ..." lines are attribution comments carried over from the
# Snowball stop-word file; presumably ``parse_stop_word`` discards everything
# after a "|" -- confirm against its implementation.
# NOTE(review): "majd" is listed twice; it is kept as-is to stay faithful to
# the list as received.
hungarian_stopwords = parse_stop_word('''
| source: http://snowball.tartarus.org/algorithms/hungarian/stop.txt
| prepared by Anna Tordai
a
ahogy
ahol
aki
akik
akkor
alatt
által
általában
amely
amelyek
amelyekben
amelyeket
amelyet
amelynek
ami
amit
amolyan
amíg
amikor
át
abban
ahhoz
annak
arra
arról
az
azok
azon
azt
azzal
azért
aztán
azután
azonban
bár
be
belül
benne
cikk
cikkek
cikkeket
csak
de
e
eddig
egész
egy
egyes
egyetlen
egyéb
egyik
egyre
ekkor
el
elég
ellen
elő
először
előtt
első
én
éppen
ebben
ehhez
emilyen
ennek
erre
ez
ezt
ezek
ezen
ezzel
ezért
és
fel
felé
hanem
hiszen
hogy
hogyan
igen
így
illetve
ill.
ill
ilyen
ilyenkor
ison
ismét
itt
jó
jól
jobban
kell
kellett
keresztül
keressünk
ki
kívül
között
közül
legalább
lehet
lehetett
legyen
lenne
lenni
lesz
lett
maga
magát
majd
majd
már
más
másik
meg
még
mellett
mert
mely
melyek
mi
mit
míg
miért
milyen
mikor
minden
mindent
mindenki
mindig
mint
mintha
mivel
most
nagy
nagyobb
nagyon
ne
néha
nekem
neki
nem
néhány
nélkül
nincs
olyan
ott
össze
ő
ők
őket
pedig
persze
rá
s
saját
sem
semmi
sok
sokat
sokkal
számára
szemben
szerint
szinte
talán
tehát
teljes
tovább
továbbá
több
úgy
ugyanis
új
újabb
újra
után
utána
utolsó
vagy
vagyis
valaki
valami
valamint
való
vagyok
van
vannak
volt
voltam
voltak
voltunk
vissza
vele
viszont
volna
''')
|
||
|
|
||
|
# Minified JavaScript source of the Hungarian stemmer, kept as one string.
# The final statement binds ``Stemmer`` to the ``HungarianStemmer`` export of
# the JSX module "src/hungarian-stemmer.jsx".
# NOTE(review): the script below stops in the middle of a function body
# (unclosed ``while`` loops right before the ``var Stemmer`` line), so this
# copy appears truncated -- restore the full minified source from upstream
# before shipping; it was not reconstructed here.
js_stemmer = """

var JSX={};(function(h){function j(b,e){var a=function(){};a.prototype=e.prototype;var c=new a;for(var d in b){b[d].prototype=c}}function P(c,b){for(var a in b.prototype)if(b.prototype.hasOwnProperty(a))c.prototype[a]=b.prototype[a]}function e(a,b,d){function c(a,b,c){delete a[b];a[b]=c;return c}Object.defineProperty(a,b,{get:function(){return c(a,b,d())},set:function(d){c(a,b,d)},enumerable:true,configurable:true})}function O(a,b,c){return a[b]=a[b]/c|0}var u=parseInt;var v=parseFloat;function N(a){return a!==a}var x=isFinite;var y=encodeURIComponent;var z=decodeURIComponent;var B=encodeURI;var C=decodeURI;var E=Object.prototype.toString;var F=Object.prototype.hasOwnProperty;function i(){}h.require=function(b){var a=q[b];return a!==undefined?a:null};h.profilerIsRunning=function(){return i.getResults!=null};h.getProfileResults=function(){return(i.getResults||function(){return{}})()};h.postProfileResults=function(a,b){if(i.postResults==null)throw new Error('profiler has not been turned on');return i.postResults(a,b)};h.resetProfileResults=function(){if(i.resetResults==null)throw new Error('profiler has not been turned on');return i.resetResults()};h.DEBUG=false;function r(){};j([r],Error);function a(a,b,c){this.F=a.length;this.K=a;this.L=b;this.I=c;this.H=null;this.P=null};j([a],Object);function n(){};j([n],Object);function f(){var a;var b;var c;this.G={};a=this.D='';b=this._=0;c=this.A=a.length;this.E=0;this.B=b;this.C=c};j([f],n);function s(a,b){a.D=b.D;a._=b._;a.A=b.A;a.E=b.E;a.B=b.B;a.C=b.C};function k(b,d,c,e){var a;if(b._>=b.A){return false}a=b.D.charCodeAt(b._);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._++;return true};function l(a,d,c,e){var b;if(a._>=a.A){return false}b=a.D.charCodeAt(a._);if(b>e||b<c){a._++;return true}b-=c;if((d[b>>>3]&1<<(b&7))===0){a._++;return true}return false};function o(f,m,p){var b;var d;var e;var n;var g;var k;var l;var i;var h;var c;var a;var j;var 
o;b=0;d=p;e=f._;n=f.A;g=0;k=0;l=false;while(true){i=b+(d-b>>>1);h=0;c=g<k?g:k;a=m[i];for(j=c;j<a.F;j++){if(e+c===n){h=-1;break}h=f.D.charCodeAt(e+c)-a.K.charCodeAt(j);if(h!==0){break}c++}if(h<0){d=i;k=c}else{b=i;g=c}if(d-b<=1){if(b>0){break}if(d===b){break}if(l){break}l=true}}while(true){a=m[b];if(g>=a.F){f._=e+a.F|0;if(a.H==null){return a.I}o=a.H(a.P);f._=e+a.F|0;if(o){return a.I}}b=a.L;if(b<0){return 0}}return-1};function d(d,m,p){var b;var g;var e;var n;var f;var k;var l;var i;var h;var c;var a;var j;var o;b=0;g=p;e=d._;n=d.E;f=0;k=0;l=false;while(true){i=b+(g-b>>1);h=0;c=f<k?f:k;a=m[i];for(j=a.F-1-c;j>=0;j--){if(e-c===n){h=-1;break}h=d.D.charCodeAt(e-1-c)-a.K.charCodeAt(j);if(h!==0){break}c++}if(h<0){g=i;k=c}else{b=i;f=c}if(g-b<=1){if(b>0){break}if(g===b){break}if(l){break}l=true}}while(true){a=m[b];if(f>=a.F){d._=e-a.F|0;if(a.H==null){return a.I}o=a.H(d);d._=e-a.F|0;if(o){return a.I}}b=a.L;if(b<0){return 0}}return-1};function A(a,b,d,e){var c;c=e.length-(d-b);a.D=a.D.slice(0,b)+e+a.D.slice(d);a.A+=c|0;if(a._>=d){a._+=c|0}else if(a._>b){a._=b}return c|0};function b(a,f){var b;var c;var d;var e;b=false;if((c=a.B)<0||c>(d=a.C)||d>(e=a.A)||e>a.D.length?false:true){A(a,a.B,a.C,f);b=true}return b};f.prototype.J=function(){return false};f.prototype.e=function(b){var a;var c;var d;var e;a=this.G['.'+b];if(a==null){c=this.D=b;d=this._=0;e=this.A=c.length;this.E=0;this.B=d;this.C=e;this.J();a=this.D;this.G['.'+b]=a}return a};f.prototype.stemWord=f.prototype.e;f.prototype.f=function(e){var d;var b;var c;var a;var f;var g;var h;d=[];for(b=0;b<e.length;b++){c=e[b];a=this.G['.'+c];if(a==null){f=this.D=c;g=this._=0;h=this.A=f.length;this.E=0;this.B=g;this.C=h;this.J();a=this.D;this.G['.'+c]=a}d.push(a)}return d};f.prototype.stemWords=f.prototype.f;function c(){f.call(this);this.I_p1=0};j([c],f);c.prototype.M=function(a){this.I_p1=a.I_p1;s(this,a)};c.prototype.copy_from=c.prototype.M;c.prototype.X=function(){var m;var b;var j;var d;var e;var a;var f;var g;var h;var n;var 
i;this.I_p1=this.A;d=true;b:while(d===true){d=false;m=this._;e=true;a:while(e===true){e=false;if(!k(this,c.g_v,97,252)){break a}c:while(true){b=this._;
var Stemmer = JSX.require("src/hungarian-stemmer.jsx").HungarianStemmer;
"""
|
||
|
|
||
|
|
||
|
class SearchHungarian(SearchLanguage):
    """Hungarian search language definition.

    Wires the module-level Hungarian stop-word set and embedded JavaScript
    stemmer into ``SearchLanguage``, and stems words in Python through the
    ``snowballstemmer`` package.
    """

    # Language code and display name.
    lang = 'hu'
    language_name = 'Hungarian'
    # Presumably the file name of the standalone stemmer script -- confirm
    # against how ``SearchLanguage`` consumes ``js_stemmer_rawcode``.
    js_stemmer_rawcode = 'hungarian-stemmer.js'
    # JavaScript stemmer source and stop words defined at module level.
    js_stemmer_code = js_stemmer
    stopwords = hungarian_stopwords

    def init(self, options: Dict) -> None:
        """Create the Snowball Hungarian stemmer used by :meth:`stem`.

        :param options: accepted for interface compatibility; not read here.
        """
        self.stemmer = snowballstemmer.stemmer('hungarian')

    def stem(self, word: str) -> str:
        """Return the stem of *word*, lower-casing it before stemming."""
        return self.stemmer.stemWord(word.lower())