You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
212 lines
32 KiB
212 lines
32 KiB
4 years ago
|
"""
    sphinx.search.fr
    ~~~~~~~~~~~~~~~~

    French search language: includes the JS French stemmer.

    :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
|
||
|
|
||
|
from typing import Dict
|
||
|
|
||
|
import snowballstemmer
|
||
|
|
||
|
from sphinx.search import SearchLanguage, parse_stop_word
|
||
|
|
||
|
# French stop words, built by passing the list below to parse_stop_word().
# Layout of the literal: one entry per line; a `|` introduces a gloss or a
# section header (presumably discarded by parse_stop_word -- confirm against
# its implementation).  The entries follow the upstream Snowball list cited on
# the first line of the literal.
# Lines that consisted solely of `|` / `||` were extraction residue interleaved
# with the real entries and have been dropped; no entry or gloss was changed.
french_stopwords = parse_stop_word('''
| source: http://snowball.tartarus.org/algorithms/french/stop.txt
au | a + le
aux | a + les
avec | with
ce | this
ces | these
dans | with
de | of
des | de + les
du | de + le
elle | she
en | `of them' etc
et | and
eux | them
il | he
je | I
la | the
le | the
leur | their
lui | him
ma | my (fem)
mais | but
me | me
même | same; as in moi-même (myself) etc
mes | me (pl)
moi | me
mon | my (masc)
ne | not
nos | our (pl)
notre | our
nous | we
on | one
ou | where
par | by
pas | not
pour | for
qu | que before vowel
que | that
qui | who
sa | his, her (fem)
se | oneself
ses | his (pl)
son | his, her (masc)
sur | on
ta | thy (fem)
te | thee
tes | thy (pl)
toi | thee
ton | thy (masc)
tu | thou
un | a
une | a
vos | your (pl)
votre | your
vous | you

| single letter forms

c | c'
d | d'
j | j'
l | l'
à | to, at
m | m'
n | n'
s | s'
t | t'
y | there

| forms of être (not including the infinitive):
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent

| forms of avoir (not including the infinitive):
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent

| Later additions (from Jean-Christophe Deschamps)
ceci | this
cela | that (added 11 Apr 2012. Omission reported by Adrien Grand)
celà | that (incorrect, though common)
cet | this
cette | this
ici | here
ils | they
les | the (pl)
leurs | their (pl)
quel | which
quels | which
quelle | which
quelles | which
sans | without
soi | oneself
''')
|
||
|
|
||
|
# Minified JavaScript for the French stemmer, held as a single string literal;
# SearchFrench.js_stemmer_code refers to it.  The last statement inside the
# literal binds `Stemmer` to the FrenchStemmer exported by the JSX bundle.
# NOTE(review): the literal looks damaged in this copy -- the minified code
# stops mid-statement (`...c=e`) right before the `var Stemmer = ...` line, and
# lines consisting only of `|` / `||` are interleaved with it.  Presumably both
# are extraction artifacts; restore the full text from the upstream file
# before relying on this string.
js_stemmer = """
|
||
|
var JSX={};(function(l){function m(b,e){var a=function(){};a.prototype=e.prototype;var c=new a;for(var d in b){b[d].prototype=c}}function P(c,b){for(var a in b.prototype)if(b.prototype.hasOwnProperty(a))c.prototype[a]=b.prototype[a]}function g(a,b,d){function c(a,b,c){delete a[b];a[b]=c;return c}Object.defineProperty(a,b,{get:function(){return c(a,b,d())},set:function(d){c(a,b,d)},enumerable:true,configurable:true})}function O(a,b,c){return a[b]=a[b]/c|0}var u=parseInt;var v=parseFloat;function N(a){return a!==a}var x=isFinite;var y=encodeURIComponent;var z=decodeURIComponent;var A=encodeURI;var B=decodeURI;var C=Object.prototype.toString;var D=Object.prototype.hasOwnProperty;function k(){}l.require=function(b){var a=q[b];return a!==undefined?a:null};l.profilerIsRunning=function(){return k.getResults!=null};l.getProfileResults=function(){return(k.getResults||function(){return{}})()};l.postProfileResults=function(a,b){if(k.postResults==null)throw new Error('profiler has not been turned on');return k.postResults(a,b)};l.resetProfileResults=function(){if(k.resetResults==null)throw new Error('profiler has not been turned on');return k.resetResults()};l.DEBUG=false;function G(){};m([G],Error);function a(a,b,c){this.F=a.length;this.K=a;this.L=b;this.I=c;this.H=null;this.P=null};m([a],Object);function p(){};m([p],Object);function i(){var a;var b;var c;this.G={};a=this.E='';b=this._=0;c=this.A=a.length;this.B=0;this.D=b;this.C=c};m([i],p);function s(a,b){a.E=b.E;a._=b._;a.A=b.A;a.B=b.B;a.D=b.D;a.C=b.C};function e(b,d,c,e){var a;if(b._>=b.A){return false}a=b.E.charCodeAt(b._);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._++;return true};function r(b,d,c,e){var a;if(b._<=b.B){return false}a=b.E.charCodeAt(b._-1);if(a>e||a<c){return false}a-=c;if((d[a>>>3]&1<<(a&7))===0){return false}b._--;return true};function o(a,d,c,e){var b;if(a._>=a.A){return false}b=a.E.charCodeAt(a._);if(b>e||b<c){a._++;return 
true}b-=c;if((d[b>>>3]&1<<(b&7))===0){a._++;return true}return false};function j(a,d,c,e){var b;if(a._<=a.B){return false}b=a.E.charCodeAt(a._-1);if(b>e||b<c){a._--;return true}b-=c;if((d[b>>>3]&1<<(b&7))===0){a._--;return true}return false};function h(a,b,d){var c;if(a.A-a._<b){return false}if(a.E.slice(c=a._,c+b)!==d){return false}a._+=b;return true};function d(a,b,d){var c;if(a._-a.B<b){return false}if(a.E.slice((c=a._)-b,c)!==d){return false}a._-=b;return true};function n(f,m,p){var b;var d;var e;var n;var g;var k;var l;var i;var h;var c;var a;var j;var o;b=0;d=p;e=f._;n=f.A;g=0;k=0;l=false;while(true){i=b+(d-b>>>1);h=0;c=g<k?g:k;a=m[i];for(j=c;j<a.F;j++){if(e+c===n){h=-1;break}h=f.E.charCodeAt(e+c)-a.K.charCodeAt(j);if(h!==0){break}c++}if(h<0){d=i;k=c}else{b=i;g=c}if(d-b<=1){if(b>0){break}if(d===b){break}if(l){break}l=true}}while(true){a=m[b];if(g>=a.F){f._=e+a.F|0;if(a.H==null){return a.I}o=a.H(a.P);f._=e+a.F|0;if(o){return a.I}}b=a.L;if(b<0){return 0}}return-1};function f(d,m,p){var b;var g;var e;var n;var f;var k;var l;var i;var h;var c;var a;var j;var o;b=0;g=p;e=d._;n=d.B;f=0;k=0;l=false;while(true){i=b+(g-b>>1);h=0;c=f<k?f:k;a=m[i];for(j=a.F-1-c;j>=0;j--){if(e-c===n){h=-1;break}h=d.E.charCodeAt(e-1-c)-a.K.charCodeAt(j);if(h!==0){break}c++}if(h<0){g=i;k=c}else{b=i;f=c}if(g-b<=1){if(b>0){break}if(g===b){break}if(l){break}l=true}}while(true){a=m[b];if(f>=a.F){d._=e-a.F|0;if(a.H==null){return a.I}o=a.H(d);d._=e-a.F|0;if(o){return a.I}}b=a.L;if(b<0){return 0}}return-1};function E(a,b,d,e){var c;c=e.length-(d-b);a.E=a.E.slice(0,b)+e+a.E.slice(d);a.A+=c|0;if(a._>=d){a._+=c|0}else if(a._>b){a._=b}return c|0};function c(a,f){var b;var c;var d;var e;b=false;if((c=a.D)<0||c>(d=a.C)||d>(e=a.A)||e>a.E.length?false:true){E(a,a.D,a.C,f);b=true}return b};i.prototype.J=function(){return false};i.prototype.c=function(b){var a;var c;var d;var 
e;a=this.G['.'+b];if(a==null){c=this.E=b;d=this._=0;e=this.A=c.length;this.B=0;this.D=d;this.C=e;this.J();a=this.E;this.G['.'+b]=a}return a};i.prototype.stemWord=i.prototype.c;i.prototype.d=function(e){var d;var b;var c;var a;var f;var g;var h;d=[];for(b=0;b<e.length;b++){c=e
|
||
|
var Stemmer = JSX.require("src/french-stemmer.jsx").FrenchStemmer;
|
||
|
"""
|
||
|
|
||
|
|
||
|
class SearchFrench(SearchLanguage):
    """French search language.

    Pairs the module-level French stop-word set and JavaScript stemmer source
    with a Python-side Snowball stemmer used by :meth:`stem`.
    """

    lang = 'fr'
    language_name = 'French'
    js_stemmer_rawcode = 'french-stemmer.js'
    js_stemmer_code = js_stemmer
    stopwords = french_stopwords

    def init(self, options: Dict) -> None:
        """Create the Snowball stemmer for French.

        :param options: search options; not read by this implementation.
        """
        self.stemmer = snowballstemmer.stemmer('french')

    def stem(self, word: str) -> str:
        """Return the stem of *word*.

        The word is lower-cased before it is handed to the stemmer.

        :param word: the word to stem.
        :returns: the value produced by the stemmer's ``stemWord``.
        """
        return self.stemmer.stemWord(word.lower())
|