diff --git a/404.html b/404.html new file mode 100644 index 0000000..b79bfa6 --- /dev/null +++ b/404.html @@ -0,0 +1,66 @@ + + + + + + + + + + Page not found - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Page not found

+ +
+
+
+ +
+ +
+ + + + diff --git a/CNAME b/CNAME new file mode 100644 index 0000000..9fbf460 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +snowballstem.org diff --git a/algorithms/armenian/stemmer.html b/algorithms/armenian/stemmer.html new file mode 100644 index 0000000..7f9d5f2 --- /dev/null +++ b/algorithms/armenian/stemmer.html @@ -0,0 +1,495 @@ + + + + + + + + + + Armenian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Armenian stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Armenian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+աղոթում
+աղոթք
+աղոթքը
+աղոթքի
+աղոթքին
+աղոթքից
+աղոթքն
+աղոթքներ
+աղոթքները
+աղոթքների
+աղոթքներին
+
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+աղոթ
+
+բանաձևեր
+բանաձևերը
+բանաձևերի
+բանաձևերից
+բանաձևերն
+բանաձևերով
+բանաձևերում
+բանաձևը
+բանաձևի
+բանաձևին
+բանաձևից
+
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+բանաձև
+
+
+ +

+This stemmer for Armenian was developed and contributed by Astghik Mkrtchyan. +

+ +

+The following characters are vowels for the purposes of this algorithm: + +

+ ա է ի օ ւ ե ո ը +
+ +

+R2 is the region after the first non-vowel following a vowel after the +first non-vowel following a vowel, or the end of the word if there is no such +non-vowel. +

+ +

+RV has the same definition as in the + Spanish stemmer. +

+ +

+The algorithm has a fairly simple structure which only removes suffixes. There +are four steps, applied in turn: +

+ +
    +
  • an "ending" is removed, if one is found in R2,
  • +
  • a verb suffix is removed, if one is found in RV,
  • +
  • an adjective suffix is removed, if one is found in RV,
  • +
  • a noun suffix is removed, if one is found in RV.
  • +
+ +

+See the Snowball implementation of the stemmer below for the lists of suffixes +each step checks for. +

+ +

The full algorithm in Snowball

+ +
stringescapes {}
+
+stringdef a    '{U+0561}' // 531
+stringdef b    '{U+0562}' // 532
+stringdef g    '{U+0563}' // 533
+stringdef d    '{U+0564}' // 534
+stringdef ye   '{U+0565}' // 535
+stringdef z    '{U+0566}' // 536
+stringdef e    '{U+0567}' // 537
+stringdef y    '{U+0568}' // 538
+stringdef dt   '{U+0569}' // 539
+stringdef zh   '{U+056A}' // 53A
+stringdef i    '{U+056B}' // 53B
+stringdef l    '{U+056C}' // 53C
+stringdef kh   '{U+056D}' // 53D
+stringdef ts   '{U+056E}' // 53E
+stringdef k    '{U+056F}' // 53F
+stringdef h    '{U+0570}' // 540
+stringdef dz   '{U+0571}' // 541
+stringdef gh   '{U+0572}' // 542
+stringdef djch '{U+0573}' // 543
+stringdef m    '{U+0574}' // 544
+stringdef j    '{U+0575}' // 545
+stringdef n    '{U+0576}' // 546
+stringdef sh   '{U+0577}' // 547
+stringdef vo   '{U+0578}' // 548
+stringdef ch   '{U+0579}' // 549
+stringdef p    '{U+057A}' // 54A
+stringdef dj   '{U+057B}' // 54B
+stringdef r    '{U+057C}' // 54C
+stringdef s    '{U+057D}' // 54D
+stringdef v    '{U+057E}' // 54E
+stringdef t    '{U+057F}' // 54F
+stringdef r'   '{U+0580}' // 550
+stringdef c    '{U+0581}' // 551
+stringdef u    '{U+0582}' // 552                  //vjun
+stringdef bp   '{U+0583}' // 553
+stringdef q    '{U+0584}' // 554
+stringdef ev   '{U+0587}'
+stringdef o    '{U+0585}' // 555
+stringdef f    '{U+0586}' // 556
+
+routines ( mark_regions R2
+           adjective
+           verb
+           noun
+           ending
+)
+
+externals ( stem )
+
+integers ( pV p2 )
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{ye}{vo}{y}'
+
+define mark_regions as (
+
+    $pV = limit
+    $p2 = limit
+    do (
+        gopast v  setmark pV  gopast non-v
+        gopast v  gopast non-v  setmark p2
+       )
+)
+
+backwardmode (
+
+    define R2 as $p2 <= cursor
+
+    define adjective as (
+        [substring] among (
+            '{b}{a}{r'}'
+            '{p}{ye}{s}'
+            '{vo}{r'}{e}{n}'
+            '{vo}{v}{i}{n}'
+            '{a}{k}{i}'
+            '{l}{a}{j}{n}'
+            '{r'}{vo}{r'}{d}'
+            '{ye}{r'}{vo}{r'}{d}'
+            '{a}{k}{a}{n}'
+            '{a}{l}{i}'
+            '{k}{vo}{t}'
+            '{ye}{k}{ye}{n}'
+            '{vo}{r'}{a}{k}'
+            '{ye}{gh}'
+            '{v}{vo}{u}{n}'
+            '{ye}{r'}{ye}{n}'
+            '{a}{r'}{a}{n}'
+            '{ye}{n}'
+            '{a}{v}{ye}{t}'
+            '{g}{i}{n}'
+            '{i}{v}'
+            '{a}{t}'
+            '{i}{n}'
+
+              (delete)
+        )
+    )
+
+    define verb as (
+        [substring] among (
+            '{vo}{u}{m}'
+            '{v}{vo}{u}{m}'
+            '{a}{l}{vo}{u}'
+            '{ye}{l}{vo}{u}'
+            '{v}{ye}{l}'
+            '{a}{n}{a}{l}'
+            '{ye}{l}{vo}{u}{c}'
+            '{a}{l}{vo}{u}{c}'
+            '{y}{a}{l}'
+            '{y}{ye}{l}'
+            '{a}{l}{vo}{v}'
+            '{ye}{l}{vo}{v}'
+            '{a}{l}{i}{s}'
+            '{ye}{l}{i}{s}'
+            '{ye}{n}{a}{l}'
+            '{a}{c}{n}{a}{l}'
+            '{ye}{c}{n}{ye}{l}'
+            '{c}{n}{ye}{l}'
+            '{n}{ye}{l}'
+            '{a}{t}{ye}{l}'
+            '{vo}{t}{ye}{l}'
+            '{k}{vo}{t}{ye}{l}'
+            '{t}{ye}{l}'
+            '{v}{a}{ts}'
+            '{ye}{c}{v}{ye}{l}'
+            '{a}{c}{v}{ye}{l}'
+            '{ye}{c}{i}{r'}'
+            '{a}{c}{i}{r'}'
+            '{ye}{c}{i}{n}{q}'
+            '{a}{c}{i}{n}{q}'
+            '{v}{ye}{c}{i}{r'}'
+            '{v}{ye}{c}{i}{n}{q}'
+            '{v}{ye}{c}{i}{q}'
+            '{v}{ye}{c}{i}{n}'
+            '{a}{c}{r'}{i}{r'}'
+            '{a}{c}{r'}{ye}{c}'
+            '{a}{c}{r'}{i}{n}{q}'
+            '{a}{c}{r'}{i}{q}'
+            '{a}{c}{r'}{i}{n}'
+            '{ye}{c}{i}{q}'
+            '{a}{c}{i}{q}'
+            '{ye}{c}{i}{n}'
+            '{a}{c}{i}{n}'
+            '{a}{c}{a}{r'}'
+            '{a}{c}{a}{v}'
+            '{a}{c}{a}{n}{q}'
+            '{a}{c}{a}{q}'
+            '{a}{c}{a}{n}'
+            '{v}{ye}{c}{i}'
+            '{a}{c}{r'}{i}'
+            '{ye}{c}{a}{r'}'
+            '{ye}{c}{a}{v}'
+            '{c}{a}{n}{q}'
+            '{c}{a}{q}'
+            '{c}{a}{n}'
+            '{a}{c}{a}'
+            '{a}{c}{i}'
+            '{ye}{c}{a}'
+            '{ch}{ye}{l}'
+            '{ye}{c}{i}'
+            '{a}{r'}'
+            '{a}{v}'
+            '{a}{n}{q}'
+            '{a}{q}'
+            '{a}{n}'
+            '{a}{l}'
+            '{ye}{l}'
+            '{ye}{c}'
+            '{a}{c}'
+            '{v}{ye}'
+            '{a}'
+
+                (delete)
+        )
+    )
+
+    define noun as (
+        [substring] among (
+            '{a}{ts}{vo}'
+            '{a}{n}{a}{k}'
+            '{a}{n}{o}{c}'
+            '{a}{r'}{a}{n}'
+            '{a}{r'}{q}'
+            '{p}{a}{n}'
+            '{s}{t}{a}{n}'
+            '{ye}{gh}{e}{n}'
+            '{ye}{n}{q}'
+            '{i}{k}'
+            '{i}{ch}'
+            '{i}{q}'
+            '{m}{vo}{u}{n}{q}'
+            '{j}{a}{k}'
+            '{j}{vo}{u}{n}'
+            '{vo}{n}{q}'
+            '{vo}{r'}{d}'
+            '{vo}{c}'
+            '{ch}{ye}{q}'
+            '{v}{a}{ts}{q}'
+            '{v}{vo}{r'}'
+            '{a}{v}{vo}{r'}'
+            '{vo}{u}{dt}{j}{vo}{u}{n}'
+            '{vo}{u}{k}'
+            '{vo}{u}{h}{i}'
+            '{vo}{u}{j}{dt}'
+            '{vo}{u}{j}{q}'
+            '{vo}{u}{s}{t}'
+            '{vo}{u}{s}'
+            '{c}{i}'
+            '{a}{l}{i}{q}'
+            '{a}{n}{i}{q}'
+            '{i}{l}'
+            '{i}{ch}{q}'
+            '{vo}{u}{n}{q}'
+            '{g}{a}{r'}'
+            '{vo}{u}'
+            '{a}{k}'
+            '{a}{n}'
+            '{q}'
+
+                (delete)
+        )
+    )
+
+    define ending as (
+        [substring] R2 among (
+            '{n}{ye}{r'}{y}'
+            '{n}{ye}{r'}{n}'
+            '{n}{ye}{r'}{i}'
+            '{n}{ye}{r'}{d}'
+            '{ye}{r'}{i}{c}'
+            '{n}{ye}{r'}{i}{c}'
+            '{ye}{r'}{i}'
+            '{ye}{r'}{d}'
+            '{ye}{r'}{n}'
+            '{ye}{r'}{y}'
+            '{n}{ye}{r'}{i}{n}'
+            '{vo}{u}{dt}{j}{a}{n}{n}'
+            '{vo}{u}{dt}{j}{a}{n}{y}'
+            '{vo}{u}{dt}{j}{a}{n}{s}'
+            '{vo}{u}{dt}{j}{a}{n}{d}'
+            '{vo}{u}{dt}{j}{a}{n}'
+            '{ye}{r'}{i}{n}'
+            '{i}{n}'
+            '{s}{a}'
+            '{vo}{dj}'
+            '{i}{c}'
+            '{ye}{r'}{vo}{v}'
+            '{n}{ye}{r'}{vo}{v}'
+            '{ye}{r'}{vo}{u}{m}'
+            '{n}{ye}{r'}{vo}{u}{m}'
+            '{vo}{u}{n}'
+            '{vo}{u}{d}'
+            '{v}{a}{n}{s}'
+            '{v}{a}{n}{y}'
+            '{v}{a}{n}{d}'
+            '{a}{n}{y}'
+            '{a}{n}{d}'
+            '{v}{a}{n}'
+            '{vo}{dj}{y}'
+            '{vo}{dj}{s}'
+            '{vo}{dj}{d}'
+            '{vo}{c}'
+            '{vo}{u}{c}'
+            '{vo}{dj}{i}{c}'
+            '{c}{i}{c}'
+            '{v}{i}{c}'
+            '{v}{i}'
+            '{v}{vo}{v}'
+            '{vo}{v}'
+            '{a}{n}{vo}{v}'
+            '{a}{n}{vo}{u}{m}'
+            '{v}{a}{n}{i}{c}'
+            '{a}{m}{b}'
+            '{a}{n}'
+            '{n}{ye}{r'}'
+            '{ye}{r'}'
+            '{v}{a}'
+            '{y}'
+            '{n}'
+            '{d}'
+            '{c}'
+            '{i}'
+
+                (delete)
+        )
+    )
+)
+
+define stem as (
+
+    do mark_regions
+    backwards setlimit tomark pV for (
+        do ending
+        do verb
+        do adjective
+        do noun
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/armenian/stemmer.tt b/algorithms/armenian/stemmer.tt new file mode 100644 index 0000000..b57a5d2 --- /dev/null +++ b/algorithms/armenian/stemmer.tt @@ -0,0 +1,54 @@ +[% header('Armenian stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([11, 'աղոթում', 'բանաձևեր']) %] + +

+This stemmer for Armenian was developed and contributed by Astghik Mkrtchyan. +

+ +

+The following characters are vowels for the purposes of this algorithm: + +

+ ա է ի օ ւ ե ո ը +
+ +

+R2 is the region after the first non-vowel following a vowel after the +first non-vowel following a vowel, or the end of the word if there is no such +non-vowel. +

+ +

+RV has the same definition as in the + Spanish stemmer. +

+ +

+The algorithm has a fairly simple structure which only removes suffixes. There +are four steps, applied in turn: +

+ + + +

+See the Snowball implementation of the stemmer below for the lists of suffixes +each step checks for. +

+ +

The full algorithm in Snowball

+ +[% highlight_file('armenian') %] + +[% footer %] diff --git a/algorithms/basque/stemmer.html b/algorithms/basque/stemmer.html new file mode 100644 index 0000000..4c7678e --- /dev/null +++ b/algorithms/basque/stemmer.html @@ -0,0 +1,317 @@ + + + + + + + + + + Basque stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Basque stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Basque vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+barrutiaren
+barrutiek
+barrutien
+barrutietako
+barrutietan
+barrutik
+barrutiko
+barrutitan
+basa
+basailu
+basalto
+basamortu
+
+barru
+barru
+barru
+barru
+barru
+barrut
+barru
+barrutit
+basa
+basailu
+basal
+basam
+
+museoak
+museoan
+museoaren
+museoen
+museoetan
+museoko
+music
+musika
+musikagile
+musikagilea
+musikagileak
+musikagilearen
+
+museo
+museo
+museoaren
+museo
+museo
+museo
+music
+musi
+musika
+musi
+musi
+musi
+
+
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u +
+ + +

The stemming algorithm

+ +
routines (
+           aditzak
+           izenak
+           adjetiboak
+           mark_regions
+           RV R2 R1
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef n~ '{U+00F1}'
+
+define v 'aeiou'
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit  // defaults
+
+    do (
+        ( v (non-v gopast v) or (v gopast non-v) )
+        or
+        ( non-v (non-v gopast v) or (v next) )
+        setmark pV
+    )
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+backwardmode (
+
+    define RV as $pV <= cursor
+    define R2 as $p2 <= cursor
+    define R1 as $p1 <= cursor
+
+    define aditzak as (
+        [substring] among(
+            'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea'
+            'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza'
+            'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza'
+            'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez'
+            'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea'
+            'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena'
+            'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea'
+            'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari'
+            'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu'
+            'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako'
+            ( RV delete )
+            'garri' 'garria' 'tza'
+            (R2 delete)
+            'atseden'
+            (<- 'atseden')
+            'arabera'
+            (<- 'arabera')
+            'baditu'
+            (<- 'baditu')
+
+        )
+    )
+
+    define izenak as (
+        [substring] among(
+            'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina'
+            'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea'
+            'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua'
+            'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di'
+            'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa'
+            'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia'
+            'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia'
+            'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua'
+            'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara'
+            'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge'
+            'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua'
+            'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia'
+            'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde'
+            'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea'
+            'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea'
+            'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia'
+            'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa'
+            'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa'
+            'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila'
+            'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa'
+            'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia'
+            'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena'
+            'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan'
+            'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek'
+            'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara'
+            'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket'
+            'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko'
+            'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera'
+            'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko'
+            ( RV delete )
+            'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza'
+            ( R2 delete )
+            'joka'
+            (<- 'jok')
+            'tzen' 'ten' 'en' 'tatu'
+            (R1 delete)
+            'trako'
+            (<- 'tra')
+            'minutuko'
+            (<- 'minutu')
+            'zehar'
+            (<- 'zehar')
+            'geldi'
+            (<- 'geldi')
+            'igaro'
+            (<- 'igaro')
+            'aurka'
+            (<- 'aurka')
+        )
+    )
+
+    define adjetiboak as (
+        [substring] among(
+            'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria'
+            'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik'
+            ( RV delete )
+            'zlea'
+            (<- 'z')
+        )
+    )
+
+)
+
+define stem as (
+    do mark_regions
+    backwards (
+        repeat aditzak
+        repeat izenak
+        do adjetiboak
+    )
+
+)
+
+/*
+    Note 1: additions of 21 Jul 2010
+*/
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/basque/stemmer.tt b/algorithms/basque/stemmer.tt new file mode 100644 index 0000000..171932c --- /dev/null +++ b/algorithms/basque/stemmer.tt @@ -0,0 +1,24 @@ +[% header('Basque stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([12, 'barrutiaren', 'museoak']) %] + +

+The following letters are vowels: +

+ +
+ a   e   i   o   u +
+ + +

The stemming algorithm

+ +[% highlight_file('basque') %] + +[% footer %] diff --git a/algorithms/basque/tarball.tgz b/algorithms/basque/tarball.tgz new file mode 100644 index 0000000..812d582 Binary files /dev/null and b/algorithms/basque/tarball.tgz differ diff --git a/algorithms/catalan/stemmer.html b/algorithms/catalan/stemmer.html new file mode 100644 index 0000000..1ae4191 --- /dev/null +++ b/algorithms/catalan/stemmer.html @@ -0,0 +1,379 @@ + + + + + + + + + + Catalan stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Catalan stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Catalan vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+abatuda
+abatut
+abatuts
+abbàssida
+abbàssides
+abbé
+abdalwadita
+abdalwadites
+abdicació
+abdicar
+abdicat
+abdicà
+
+abat
+abat
+abat
+abbas
+abbas
+abb
+abdalwad
+abdalwad
+abdic
+abdic
+abdic
+abdic
+
+gore
+gorg
+gorga
+gorges
+gorgs
+goril
+gorja
+gorra
+gorres
+gosa
+gosadia
+gosar
+
+gor
+gorg
+gorg
+gorg
+gorg
+gor
+gorj
+gorr
+gorr
+gos
+gosad
+gos
+
+
+ +

+Letters in Catalan include the following accented forms:

+ +
+ á   é   í   ó   ú   à   è   ì   ò   ù   ü   ï +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   á   é   í   ó   ú   ü   ï +
+ + +

The stemming algorithm

+ +
routines (
+           cleaning mark_regions
+           R1  R2
+           attached_pronoun
+           standard_suffix
+           verb_suffix
+           residual_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a'   '{U+00E1}'  // a-acute
+stringdef a`   '{U+00E0}'  // a-grave
+stringdef cc   '{U+00E7}'  // c-cedilla
+stringdef e'   '{U+00E9}'  // e-acute
+stringdef e`   '{U+00E8}'  // e-grave
+stringdef i'   '{U+00ED}'  // i-acute
+stringdef i`   '{U+00EC}'  // i-grave
+stringdef i"   '{U+00EF}'  // i-diaeresis
+stringdef o'   '{U+00F3}'  // o-acute
+stringdef o`   '{U+00F2}'  // o-grave
+stringdef u'   '{U+00FA}'  // u-acute
+stringdef u"   '{U+00FC}'  // u-diaeresis
+stringdef .    '{U+00B7}'   // - per l aggeminades
+
+define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}'
+
+define mark_regions as (
+
+    $p1 = limit
+    $p2 = limit  // defaults
+
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define cleaning as repeat (
+    [substring] among(
+        '{a'}' (<- 'a')
+        '{a`}' (<- 'a')
+        '{e'}' (<- 'e')
+        '{e`}' (<- 'e')
+        '{i'}' (<- 'i')
+        '{i`}' (<- 'i')
+        '{o'}' (<- 'o')
+        '{o`}' (<- 'o')
+        '{u'}' (<- 'u')
+        '{u"}' (<- 'u')
+        '{i"}' (<- 'i')
+        '{.}' (<- '.')
+        ''     (next)
+    )
+)
+
+backwardmode (
+
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define attached_pronoun as (
+    [substring] among (
+        '{'}s'  '{'}hi' '{'}ho' '{'}l' '{'}ls'
+        '-ls' '-la' '-les' '-li'
+        'vos' 'se'  'nos' '-nos' '-us' 'us'
+        '{'}n' '{'}ns' '-n' '-ns'
+        '{'}m' '-me' '-m'
+        '-te' '{'}t'
+        'li' 'lo' 'los'
+        'me'  'sela' 'selo' 'selas' 'selos' 'le'
+        'la' 'las' 'les' 'ens' 'ho' 'hi'
+        (R1 delete)
+    )
+    )
+
+    define standard_suffix as (
+        [substring] among(
+            'ar' 'atge' 'formes' 'icte' 'ictes'
+            'ell' 'ells' 'ella'  '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta'
+            'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls'
+            'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
+            'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
+            'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
+            '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all'
+            'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
+            '{o'}s' 'osa'  'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
+            'itar' 'ables' 'adors' 'idores' 'idors'
+            'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es'
+            'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris'
+            'ats' 'ions'  'ota' 'isam' 'ors' 'ora' 'ores' 'isament'
+            'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes'
+            'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies'
+            '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles'
+            'assa' 'asses' 'assos'
+             'ent' 'ents'
+             '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin'
+             'ims' 'ima' 'imes'
+             'isme' 'ista' 'ismes' 'istes'
+             'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius'
+             'oses' 'osos' 'ient' 'otes' 'ots'
+            (R1 delete)
+            'acions' 'ada' 'ades'
+            (R2 delete)
+            'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques'
+            (R2 <- 'log')
+            'ic' 'ica' 'ics' 'iques'
+            (R2 <- 'ic')
+            'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima'
+            (R1 <- 'c')
+        )
+    )
+
+    define verb_suffix as (
+        [substring] among(
+            'ador' 'adora'  'adors' 'adores' 're' 'ie'
+             'ent' 'ents' 'udes' 'ar{a`}' 'eren'
+            'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+            'aria' 'arian' 'arien' 'aries' 'ar{a`}s'
+            'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara'
+            'ar{e'}' 'ar{e'}s'
+            'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+            'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+            'er{e'}' 'er' 'erau' 'erass'
+            'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+            'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+            'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu'
+            'ia' 'ies' '{i'}em' '{i`}eu' 'ien'
+            'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats'
+            'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu'
+            'essen' 'esses' 'assen' 'asses' 'assim' 'assiu'
+            '{e'}ssen' '{e'}sseu'  '{e'}ssim' '{e'}ssiu' '{e'}ssem'
+            '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren'
+            'ar{i'}em' 'ar{i'}eu'
+            'areu' 'aren' 'ant' '{i"}m' '{i"}u'
+            '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es'
+            'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da'
+            'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its'
+            'id' 'ids'  'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+            'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+            'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as'
+            'ieu' 'ii' 'io' 'i{a`}'
+            'ess' 'essin' 'essis'  'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu'
+            'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+            'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+            'ierais'  'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+            'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques'
+            '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+            'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien'
+            'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu'
+            'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis'
+            'eixen' 'eixo' 'isin' 'isis'  'esques' 'sis' 'sin'
+            'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen'
+            'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim'
+            '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu'
+            '{i"}ra' '{i"}ren' '{i"}res'
+            '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x'
+            'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis'
+            'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s'
+                (R1 delete)
+            'ando'
+                (R2 delete)
+        )
+    )
+
+    define residual_suffix as (
+        [substring] among(
+            'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu'
+            'is' 'i' 'ir'  's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it'
+            (R1 delete)
+            'iqu'
+            (R1 <- 'ic')
+        )
+    )
+)
+
+define stem as (
+    do mark_regions
+    backwards (
+    do attached_pronoun
+    do ( standard_suffix or
+             verb_suffix
+           )
+        do residual_suffix
+    )
+    do cleaning
+)
+
+/*
+     First works 2010/07/19
+     First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0
+     Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos
+     Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
+*/
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/catalan/stemmer.tt b/algorithms/catalan/stemmer.tt new file mode 100644 index 0000000..d38f8e4 --- /dev/null +++ b/algorithms/catalan/stemmer.tt @@ -0,0 +1,33 @@ +[% header('Catalan stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([12, 'abatuda', 'gore']) %] + +

+Letters in Catalan include the following accented forms:

+ +
+ á   é   í   ó   ú   à   è   ì   ò   ù   ü   ï +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   á   é   í   ó   ú   ü   ï +
+ + +

The stemming algorithm

+ +[% highlight_file('catalan') %] + +[% footer %] diff --git a/algorithms/catalan/tarball.tgz b/algorithms/catalan/tarball.tgz new file mode 100644 index 0000000..fe0cd67 Binary files /dev/null and b/algorithms/catalan/tarball.tgz differ diff --git a/algorithms/czech/stemmer.html b/algorithms/czech/stemmer.html new file mode 100644 index 0000000..de0e9ea --- /dev/null +++ b/algorithms/czech/stemmer.html @@ -0,0 +1,332 @@ + + + + + + + + + + Czech stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Czech stemming algorithm

+ + +

Links to resources

+ + + +

+In March 2012 Jim O’Regan sent us an implementation of Ljiljana +Dolamic's Czech stemmer. +

+ +
routines (
+  RV R1
+  palatalise
+  mark_regions
+  do_possessive
+  do_case
+  do_comparative
+  do_diminutive
+  do_augmentative
+  do_derivational
+  do_deriv_single
+  do_aggressive
+)
+
+externals ( stem )
+
+integers ( pV p1 )
+
+groupings ( v )
+
+stringescapes {}
+
+stringdef a' '{U+00E1}'
+stringdef c^ '{U+010D}'
+stringdef d^ '{U+010F}'
+stringdef e' '{U+00E9}'
+stringdef e^ '{U+011B}'
+stringdef i' '{U+00ED}'
+stringdef n^ '{U+0148}'
+stringdef o' '{U+00F3}'
+stringdef r^ '{U+0159}'
+stringdef s^ '{U+0161}'
+stringdef t^ '{U+0165}'
+stringdef u' '{U+00FA}'
+stringdef u* '{U+016F}'
+stringdef y' '{U+00FD}'
+stringdef z^ '{U+017E}'
+
+define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+
+    do (
+        gopast non-v setmark pV
+        gopast non-v gopast v setmark p1
+    )
+)
+
+backwardmode (
+
+  define RV as $pV <= cursor
+  define R1 as $p1 <= cursor
+
+  define palatalise as (
+    [substring] RV among (
+      'ci' 'ce' '{c^}i' '{c^}'
+      (<- 'k')
+      'zi' 'ze' '{z^}i' '{z^}e'
+      (<- 'h')
+      '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
+      (<- 'ck')
+      '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
+      (<- 'sk')
+    )
+  )
+
+  define do_possessive as (
+    [substring] RV among (
+      'ov' '{u*}v'
+      (delete)
+      'in'
+      (
+        delete
+        try palatalise
+      )
+    )
+  )
+
+  define do_case as (
+    [substring] among (
+      'atech'
+      '{e^}tem' 'at{u*}m'
+      '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
+      'ata' 'aty' 'ama' 'ami' 'ovi'
+      'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
+      'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
+      (delete)
+      'ech' 'ich' '{i'}ch'
+      '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
+      'emi' 'iho' 'imu'
+      '{e'}m' '{i'}m' 'es'
+      'e' 'i' '{i'}' '{e^}'
+      (
+        delete
+        try palatalise
+      )
+      'em'
+      (
+        <- 'e'
+        try palatalise
+      )
+    )
+  )
+
+  define do_derivational as (
+    [substring] R1 among (
+      'obinec'
+      'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
+      '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
+      '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
+      'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
+      '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
+      'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
+      '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
+      (delete)
+      'ion{a'}{r^}'
+      'inec' 'itel'
+      'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
+      'ic' 'in' 'it' 'iv'
+      (
+        <- 'i'
+        palatalise
+      )
+      'enic' 'ec' 'en'
+      (
+        <- 'e'
+        palatalise
+      )
+      '{e'}{r^}'
+      (
+        <- '{e'}'
+        palatalise
+      )
+      '{e^}n'
+      (
+        <- '{e^}'
+        palatalise
+      )
+      '{i'}rn'
+      '{i'}{r^}' '{i'}n'
+      (
+        <- '{i'}'
+        palatalise
+      )
+    )
+  )
+  define do_deriv_single as (
+    [substring] among (
+      'c' '{c^}' 'k' 'l' 'n' 't'
+      (delete)
+    )
+  )
+
+
+  define do_augmentative as (
+    [substring] among (
+      'ajzn' '{a'}k'
+      (delete)
+      'izn' 'isk'
+      (
+        <- 'i'
+        palatalise
+      )
+    )
+  )
+
+  define do_diminutive as (
+    [substring] among (
+      'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
+      'anek' 'onek' 'unek' '{a'}nek'
+      'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
+      '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
+      '{a'}tk' '{a'}nk' 'u{s^}k'
+      'k'
+      (delete)
+      'e{c^}ek' 'enek' 'ek'
+      (
+        <- 'e'
+        palatalise
+      )
+      '{e'}{c^}ek' '{e'}k'
+      (
+        <- '{e'}'
+        palatalise
+      )
+      'i{c^}ek' 'inek' 'ik'
+      (
+        <- 'i'
+        palatalise
+      )
+      '{i'}{c^}ek' '{i'}k'
+      (
+        <- '{i'}'
+        palatalise
+      )
+      '{a'}k'
+       (<- '{a'}')
+      'ak'
+       (<- 'a')
+      'ok'
+       (<- 'o')
+      'uk'
+       (<- 'u')
+    )
+  )
+
+  define do_comparative as (
+    [substring] among (
+      '{e^}j{s^}'
+      (
+        <- '{e^}'
+        palatalise
+      )
+      'ej{s^}'
+      (
+        <- 'e'
+        palatalise
+      )
+    )
+  )
+
+  define do_aggressive as (
+    do do_comparative
+    do do_diminutive
+    do do_augmentative
+    do_derivational or do_deriv_single
+  )
+)
+
+define stem as (
+  do mark_regions
+  backwards (
+    do_case
+    do_possessive
+    // light and aggressive are the same to this point
+    // comment next line for light stemmer
+    do_aggressive
+  )
+)
+
+// Ljiljana Dolamic and Jacques Savoy. 2009.
+// Indexing and stemming approaches for the Czech language.
+// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/czech/stemmer.tt b/algorithms/czech/stemmer.tt new file mode 100644 index 0000000..92d050e --- /dev/null +++ b/algorithms/czech/stemmer.tt @@ -0,0 +1,16 @@ +[% header('Czech stemming algorithm') %] + +

Links to resources

+ + + +

+In March 2012 Jim O’Regan sent us an implementation of Ljiljana +Dolamic's Czech stemmer. +

+ +[% highlight_file('czech') %] + +[% footer %] diff --git a/algorithms/danish/stemmer.html b/algorithms/danish/stemmer.html new file mode 100644 index 0000000..5485b25 --- /dev/null +++ b/algorithms/danish/stemmer.html @@ -0,0 +1,482 @@ + + + + + + + + + + Danish stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Danish stemming algorithm

+ +

Links to resources

+ + + +

+Here is a sample of Danish vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+indtage
+indtagelse
+indtager
+indtages
+indtaget
+indtil
+indtog
+indtraf
+indtryk
+indtræde
+indtræder
+indtræffe
+indtræffer
+indtrængende
+indtægt
+indtægter
+indvandrede
+indvandret
+indvender
+indvendig
+indvendige
+indvendigt
+indvending
+indvendingerne
+indvie
+indviede
+indvielse
+indvielsen
+indvielsesløfte
+indvielsestid
+indvier
+indvies
+indviet
+indvikle
+indvikler
+indvolde
+indvoldene
+indvortes
+indånde
+indåndede
+
+indtag
+indtag
+indtag
+indtag
+indtag
+indtil
+indtog
+indtraf
+indtryk
+indtræd
+indtræd
+indtræf
+indtræf
+indtræng
+indtæg
+indtæg
+indvandred
+indvandr
+indvend
+indvend
+indvend
+indvend
+indvending
+indvending
+indvi
+indvied
+indvi
+indvi
+indvielsesløft
+indvielsestid
+indvi
+indvi
+indvi
+indvikl
+indvikl
+indvold
+indvold
+indvort
+indånd
+indånded
+
+underste
+undersåtter
+undersåtters
+undersøg
+undersøge
+undersøgelse
+undersøgelsen
+undersøger
+undersøgt
+undersøgte
+undertryk
+undertrykke
+undertrykkelse
+undertrykker
+undertrykkere
+undertrykkeren
+undertrykkerens
+undertrykkeres
+undertrykkes
+undertrykt
+undertrykte
+undertryktes
+undertvang
+undertvunget
+undertvungne
+undervejs
+underverdenen
+undervise
+underviser
+undervises
+undervisning
+undervisningen
+undervist
+underviste
+underværk
+underværker
+undevise
+undeviste
+undfange
+undfanged
+
+underst
+undersåt
+undersåt
+undersøg
+undersøg
+undersøg
+undersøg
+undersøg
+undersøg
+undersøg
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertryk
+undertvang
+undertvung
+undertvungn
+undervej
+underverden
+undervis
+undervis
+undervis
+undervisning
+undervisning
+undervist
+undervist
+underværk
+underværk
+undevis
+undevist
+undfang
+undfanged
+
+
+ +

The stemming algorithm

+ +

+The Danish alphabet includes the following additional letters, +

+ +
+ æ   å   ø +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   æ   å   ø +
+ +

+A consonant is defined as a character from ASCII a-z which isn't a vowel +(originally this was "A consonant is defined as a non-vowel" but since +2018-11-15 we've changed this definition to avoid the stemmer altering +alphanumeric codes which end with a repeated digit). +

+ +

+R2 is not used: R1 is defined in the same way as in the +German stemmer. +(See the note on R1 and R2.) +

+ +

+Define a valid s-ending as one of +

+ +
+a   b   c   d   f   g   h   j   k   l   m   n   o   p   r +   t   v   y   z   å +
+ +

+Do each of steps 1, 2, 3 and 4. +

+ +

+Step 1: +

+
+

+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +

+
+
(a) + hed   ethed   ered   e   erede   ende   erende   ene +   erne   ere   en   heden   eren   er   heder   erer +   heds   es   endes   erendes   enes   ernes   eres   + ens   hedens   erens   ers   ets   erets   et   eret +
delete +
(b) + s +
delete if preceded by a valid s-ending +
+

+ (Of course the letter of the valid s-ending is + not necessarily in R1) +

+
+ +

+Step 2: +

+
+

+ Search for one of the following suffixes in R1, and if found + delete the last letter. +

+
+ gd   dt   gt   kt +
+ (For example, frisktfrisk) +
+ +Step 3: +
+

+ If the word ends igst, remove the final st. +

+ +

+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +

+
+
(a) + ig   lig   elig   els +
delete, and then repeat step 2 +
(b) + løst +
replace with løs +
+
+ +Step 4: undouble +
+

+ If the word ends with double consonant in R1, remove one of the + consonants. +

+ +

+ (For example, bestemmelsebestemmels (step 1) + → bestemm (step 3a) + → bestem in this step.) +

+
+ +

The same algorithm in Snowball

+ +
routines (
+           mark_regions
+           main_suffix
+           consonant_pair
+           other_suffix
+           undouble
+)
+
+externals ( stem )
+
+strings ( ch )
+
+integers ( p1 x )
+
+groupings ( c v s_ending )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef ae   '{U+00E6}'
+stringdef ao   '{U+00E5}'
+stringdef o/   '{U+00F8}'
+
+define c 'bcdfghjklmnpqrstvwxz'
+
+define v 'aeiouy{ae}{ao}{o/}'
+
+define s_ending  'abcdfghjklmnoprtvyz{ao}'
+
+define mark_regions as (
+
+    $p1 = limit
+
+    test ( hop 3 setmark x )
+    goto v gopast non-v  setmark p1
+    try ( $p1 < x  $p1 = x )
+)
+
+backwardmode (
+
+    define main_suffix as (
+        setlimit tomark p1 for ([substring])
+        among(
+
+            'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
+            'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
+            'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
+            'erets' 'et' 'eret'
+                (delete)
+            's'
+                (s_ending delete)
+        )
+    )
+
+    define consonant_pair as (
+        test (
+            setlimit tomark p1 for ([substring])
+            among(
+                'gd' // significant in the call from other_suffix
+                'dt' 'gt' 'kt'
+            )
+        )
+        next] delete
+    )
+
+    define other_suffix as (
+        do ( ['st'] 'ig' delete )
+        setlimit tomark p1 for ([substring])
+        among(
+            'ig' 'lig' 'elig' 'els'
+                (delete do consonant_pair)
+            'l{o/}st'
+                (<-'l{o/}s')
+        )
+    )
+    define undouble as (
+        setlimit tomark p1 for ([c] ->ch)
+        ch
+        delete
+    )
+)
+
+define stem as (
+
+    do mark_regions
+    backwards (
+        do main_suffix
+        do consonant_pair
+        do other_suffix
+        do undouble
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/danish/stemmer.tt b/algorithms/danish/stemmer.tt new file mode 100644 index 0000000..3491cb1 --- /dev/null +++ b/algorithms/danish/stemmer.tt @@ -0,0 +1,132 @@ +[% header('Danish stemming algorithm') %] +

Links to resources

+ + + +[% algorithm_vocab([40, 'indtage', 'underste']) %] + +

The stemming algorithm

+ +

+The Danish alphabet includes the following additional letters, +

+ +
+ æ   å   ø +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   æ   å   ø +
+ +

+A consonant is defined as a character from ASCII a-z which isn't a vowel +(originally this was "A consonant is defined as a non-vowel" but since +2018-11-15 we've changed this definition to avoid the stemmer altering +alphanumeric codes which end with a repeated digit). +

+ +

+R2 is not used: R1 is defined in the same way as in the +German stemmer. +(See the note on R1 and R2.) +

+ +

+Define a valid s-ending as one of +

+ +
+a   b   c   d   f   g   h   j   k   l   m   n   o   p   r +   t   v   y   z   å +
+ +

+Do each of steps 1, 2, 3 and 4. +

+ +

+Step 1: +

+
+

+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +

+
+
(a) + hed   ethed   ered   e   erede   ende   erende   ene +   erne   ere   en   heden   eren   er   heder   erer +   heds   es   endes   erendes   enes   ernes   eres   + ens   hedens   erens   ers   ets   erets   et   eret +
delete +
(b) + s +
delete if preceded by a valid s-ending +
+

+ (Of course the letter of the valid s-ending is + not necessarily in R1) +

+
+ +

+Step 2: +

+
+

+ Search for one of the following suffixes in R1, and if found + delete the last letter. +

+
+ gd   dt   gt   kt +
+ (For example, frisktfrisk) +
+ +Step 3: +
+

+ If the word ends igst, remove the final st. +

+ +

+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +

+
+
(a) + ig   lig   elig   els +
delete, and then repeat step 2 +
(b) + løst +
replace with løs +
+
+ +Step 4: undouble +
+

+ If the word ends with double consonant in R1, remove one of the + consonants. +

+ +

+ (For example, bestemmelsebestemmels (step 1) + → bestemm (step 3a) + → bestem in this step.) +

+
+ +

The same algorithm in Snowball

+ +[% highlight_file('danish') %] + +[% footer %] diff --git a/algorithms/danish/stop.txt b/algorithms/danish/stop.txt new file mode 100644 index 0000000..3705204 --- /dev/null +++ b/algorithms/danish/stop.txt @@ -0,0 +1,102 @@ + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/yourself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/algorithms/dutch/stemmer.html b/algorithms/dutch/stemmer.html new file mode 100644 index 0000000..a5e06c0 --- /dev/null +++ b/algorithms/dutch/stemmer.html @@ -0,0 +1,524 @@ + + + + + + + + + + Dutch stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Dutch stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Dutch vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+lichaamsziek
+lichamelijk
+lichamelijke
+lichamelijkheden
+lichamen
+lichere
+licht
+lichtbeeld
+lichtbruin
+lichtdoorlatende
+lichte
+lichten
+lichtende
+lichtenvoorde
+lichter
+lichtere
+lichters
+lichtgevoeligheid
+lichtgewicht
+lichtgrijs
+lichthoeveelheid
+lichtintensiteit
+lichtje
+lichtjes
+lichtkranten
+lichtkring
+lichtkringen
+lichtregelsystemen
+lichtste
+lichtstromende
+lichtte
+lichtten
+lichttoetreding
+lichtverontreinigde
+lichtzinnige
+lid
+lidia
+lidmaatschap
+lidstaten
+lidvereniging
+
+lichaamsziek
+licham
+licham
+licham
+licham
+licher
+licht
+lichtbeeld
+lichtbruin
+lichtdoorlat
+licht
+licht
+lichtend
+lichtenvoord
+lichter
+lichter
+lichter
+lichtgevoel
+lichtgewicht
+lichtgrijs
+lichthoevel
+lichtintensiteit
+lichtj
+lichtjes
+lichtkrant
+lichtkring
+lichtkring
+lichtregelsystem
+lichtst
+lichtstrom
+licht
+licht
+lichttoetred
+lichtverontreinigd
+lichtzinn
+lid
+lidia
+lidmaatschap
+lidstat
+lidveren
+
+opgingen
+opglanzing
+opglanzingen
+opglimlachten
+opglimpen
+opglimpende
+opglimping
+opglimpingen
+opgraven
+opgrijnzen
+opgrijzende
+opgroeien
+opgroeiende
+opgroeiplaats
+ophaal
+ophaaldienst
+ophaalkosten
+ophaalsystemen
+ophaalt
+ophaaltruck
+ophalen
+ophalend
+ophalers
+ophef
+opheffen
+opheffende
+opheffing
+opheldering
+ophemelde
+ophemelen
+opheusden
+ophief
+ophield
+ophieven
+ophoepelt
+ophoog
+ophoogzand
+ophopen
+ophoping
+ophouden
+
+opging
+opglanz
+opglanz
+opglimlacht
+opglimp
+opglimp
+opglimp
+opglimp
+opgrav
+opgrijnz
+opgrijz
+opgroei
+opgroei
+opgroeiplat
+ophal
+ophaaldienst
+ophaalkost
+ophaalsystem
+ophaalt
+ophaaltruck
+ophal
+ophal
+ophaler
+ophef
+opheff
+opheff
+opheff
+ophelder
+ophemeld
+ophemel
+opheusd
+ophief
+ophield
+ophiev
+ophoepelt
+ophog
+ophoogzand
+ophop
+ophop
+ophoud
+
+
+ +

The stemming algorithm

+ +Dutch includes the following accented forms +
+ ä   ë   ï   ö   ü   á   é   í   ó   ú   è +
+First, remove all umlaut and acute accents. A vowel is then one of, +
+ a   e   i   o   u   y   è +
+Put initial y, y after a vowel, and +i between vowels into upper case. R1 and +R2 +(see the note on R1 and R2) +are then defined as in German. +

+Define a valid s-ending as a non-vowel other than j. +

+Define a valid en-ending as a non-vowel, and not gem. +

+Define undoubling the ending as removing the last letter if the word ends +kk, dd or tt. +

+Do each of steps 1, 2, 3 and 4.
+

+Step 1: +
+ Search for the longest among the following suffixes, and perform the + action indicated +

+
+
(a) heden +
replace with heid if in R1 +

+
(b) en   ene +
delete if in R1 and preceded by a valid en-ending, and then + undouble the ending +

+
(c) s   se +
delete if in R1 and preceded by a valid s-ending +
+
+Step 2: +
+ Delete suffix e if in R1 and preceded by a non-vowel, and then undouble + the ending +
+Step 3a: heid +
+ delete heid if in R2 and not preceded by c, and treat a preceding + en as in step 1(b) +
+Step 3b: d-suffixes (*) +
+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
end   ing +
delete if in R2 +
if preceded by ig, delete if in R2 and not preceded by e, otherwise + undouble the ending +

+
ig +
delete if in R2 and not preceded by e +

+
lijk +
delete if in R2, and then repeat step 2 +

+
baar +
delete if in R2 +

+
bar +
delete if in R2 and if step 2 actually removed an e +
+
+Step 4: undouble vowel +
+    If the word ends CVD, where C is a non-vowel, D is a non-vowel other
+    than I, and V is double a, e, o or u, remove one of the vowels from
+    V (for example, maanman, broodbrod).
+
+Finally, +
+ Turn I and Y back into lower case. +
+ +

The same algorithm in Snowball

+ +
routines (
+           prelude postlude
+           e_ending
+           en_ending
+           mark_regions
+           R1 R2
+           undouble
+           standard_suffix
+)
+
+externals ( stem )
+
+booleans ( e_found )
+
+integers ( p1 p2 )
+
+groupings ( v v_I v_j )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a"   '{U+00E4}'
+stringdef e"   '{U+00EB}'
+stringdef i"   '{U+00EF}'
+stringdef o"   '{U+00F6}'
+stringdef u"   '{U+00FC}'
+
+stringdef a'   '{U+00E1}'
+stringdef e'   '{U+00E9}'
+stringdef i'   '{U+00ED}'
+stringdef o'   '{U+00F3}'
+stringdef u'   '{U+00FA}'
+
+stringdef e`   '{U+00E8}'
+
+define v       'aeiouy{e`}'
+define v_I     v + 'I'
+define v_j     v + 'j'
+
+define prelude as (
+    test repeat (
+        [substring] among(
+            '{a"}' '{a'}'
+                (<- 'a')
+            '{e"}' '{e'}'
+                (<- 'e')
+            '{i"}' '{i'}'
+                (<- 'i')
+            '{o"}' '{o'}'
+                (<- 'o')
+            '{u"}' '{u'}'
+                (<- 'u')
+            ''  (next)
+        ) //or next
+    )
+    try(['y'] <- 'Y')
+    repeat goto (
+        v [('i'] v <- 'I') or
+           ('y']   <- 'Y')
+    )
+)
+
+define mark_regions as (
+
+    $p1 = limit
+    $p2 = limit
+
+    gopast v  gopast non-v  setmark p1
+    try($p1 < 3  $p1 = 3)  // at least 3
+    gopast v  gopast non-v  setmark p2
+
+)
+
+define postlude as repeat (
+
+    [substring] among(
+        'Y'  (<- 'y')
+        'I'  (<- 'i')
+        ''   (next)
+    ) //or next
+
+)
+
+backwardmode (
+
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define undouble as (
+        test among('kk' 'dd' 'tt') [next] delete
+    )
+
+    define e_ending as (
+        unset e_found
+        ['e'] R1 test non-v delete
+        set e_found
+        undouble
+    )
+
+    define en_ending as (
+        R1 non-v and not 'gem' delete
+        undouble
+    )
+
+    define standard_suffix as (
+        do (
+            [substring] among(
+                'heden'
+                (   R1 <- 'heid'
+                )
+                'en' 'ene'
+                (   en_ending
+                )
+                's' 'se'
+                (   R1 non-v_j delete
+                )
+            )
+        )
+        do e_ending
+
+        do ( ['heid'] R2 not 'c' delete
+             ['en'] en_ending
+           )
+
+        do (
+            [substring] among(
+                'end' 'ing'
+                (   R2 delete
+                    (['ig'] R2 not 'e' delete) or undouble
+                )
+                'ig'
+                (   R2 not 'e' delete
+                )
+                'lijk'
+                (   R2 delete e_ending
+                )
+                'baar'
+                (   R2 delete
+                )
+                'bar'
+                (   R2 e_found delete
+                )
+            )
+        )
+        do (
+            non-v_I
+            test (
+                among ('aa' 'ee' 'oo' 'uu')
+                non-v
+            )
+            [next] delete
+        )
+    )
+)
+
+define stem as (
+
+        do prelude
+        do mark_regions
+        backwards
+            do standard_suffix
+        do postlude
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/dutch/stemmer.tt b/algorithms/dutch/stemmer.tt new file mode 100644 index 0000000..76d9f33 --- /dev/null +++ b/algorithms/dutch/stemmer.tt @@ -0,0 +1,103 @@ +[% header('Dutch stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'lichaamsziek', 'opgingen']) %] + +

The stemming algorithm

+ +Dutch includes the following accented forms +
+ ä   ë   ï   ö   ü   á   é   í   ó   ú   è +
+First, remove all umlaut and acute accents. A vowel is then one of, +
+ a   e   i   o   u   y   è +
+Put initial y, y after a vowel, and +i between vowels into upper case. R1 and +R2 +(see the note on R1 and R2) +are then defined as in German. +

+Define a valid s-ending as a non-vowel other than j. +

+Define a valid en-ending as a non-vowel, and not gem. +

+Define undoubling the ending as removing the last letter if the word ends +kk, dd or tt. +

+Do each of steps 1, 2, 3 and 4.
+

+Step 1: +
+ Search for the longest among the following suffixes, and perform the + action indicated +

+
+
(a) heden +
replace with heid if in R1 +

+
(b) en   ene +
delete if in R1 and preceded by a valid en-ending, and then + undouble the ending +

+
(c) s   se +
delete if in R1 and preceded by a valid s-ending +
+
+Step 2: +
+ Delete suffix e if in R1 and preceded by a non-vowel, and then undouble + the ending +
+Step 3a: heid +
+ delete heid if in R2 and not preceded by c, and treat a preceding + en as in step 1(b) +
+Step 3b: d-suffixes (*) +
+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
end   ing +
delete if in R2 +
if preceded by ig, delete if in R2 and not preceded by e, otherwise + undouble the ending +

+
ig +
delete if in R2 and not preceded by e +

+
lijk +
delete if in R2, and then repeat step 2 +

+
baar +
delete if in R2 +

+
bar +
delete if in R2 and if step 2 actually removed an e +
+
+Step 4: undouble vowel +
+    If the word ends CVD, where C is a non-vowel, D is a non-vowel other
+    than I, and V is double a, e, o or u, remove one of the vowels from
+    V (for example, maanman, broodbrod).
+
+Finally, +
+ Turn I and Y back into lower case. +
+ +

The same algorithm in Snowball

+ +[% highlight_file('dutch') %] + +[% footer %] diff --git a/algorithms/dutch/stop.txt b/algorithms/dutch/stop.txt new file mode 100644 index 0000000..d9f38a8 --- /dev/null +++ b/algorithms/dutch/stop.txt @@ -0,0 +1,113 @@ + + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. 
of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other + diff --git a/algorithms/english-combining-forms.png b/algorithms/english-combining-forms.png new file mode 100644 index 0000000..ecac711 Binary files /dev/null and b/algorithms/english-combining-forms.png differ diff --git a/algorithms/english/stemmer.html b/algorithms/english/stemmer.html new file mode 100644 index 0000000..16b6714 --- /dev/null +++ b/algorithms/english/stemmer.html @@ -0,0 +1,1069 @@ + + + + + + + + + + The English (Porter2) stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

The English (Porter2) stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of English vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+consign
+consigned
+consigning
+consignment
+consist
+consisted
+consistency
+consistent
+consistently
+consisting
+consists
+consolation
+consolations
+consolatory
+console
+consoled
+consoles
+consolidate
+consolidated
+consolidating
+consoling
+consolingly
+consols
+consonant
+consort
+consorted
+consorting
+conspicuous
+conspicuously
+conspiracy
+conspirator
+conspirators
+conspire
+conspired
+conspiring
+constable
+constables
+constance
+constancy
+constant
+
+consign
+consign
+consign
+consign
+consist
+consist
+consist
+consist
+consist
+consist
+consist
+consol
+consol
+consolatori
+consol
+consol
+consol
+consolid
+consolid
+consolid
+consol
+consol
+consol
+conson
+consort
+consort
+consort
+conspicu
+conspicu
+conspiraci
+conspir
+conspir
+conspir
+conspir
+conspir
+constabl
+constabl
+constanc
+constanc
+constant
+
+knack
+knackeries
+knacks
+knag
+knave
+knaves
+knavish
+kneaded
+kneading
+knee
+kneel
+kneeled
+kneeling
+kneels
+knees
+knell
+knelt
+knew
+knick
+knif
+knife
+knight
+knightly
+knights
+knit
+knits
+knitted
+knitting
+knives
+knob
+knobs
+knock
+knocked
+knocker
+knockers
+knocking
+knocks
+knopp
+knot
+knots
+
+knack
+knackeri
+knack
+knag
+knave
+knave
+knavish
+knead
+knead
+knee
+kneel
+kneel
+kneel
+kneel
+knee
+knell
+knelt
+knew
+knick
+knif
+knife
+knight
+knight
+knight
+knit
+knit
+knit
+knit
+knive
+knob
+knob
+knock
+knock
+knocker
+knocker
+knock
+knock
+knopp
+knot
+knot
+
+
+ +

Developing the English stemmer

+ +

+(Revised slightly, December 2001)
+(Further revised, September 2002) +

+ +

+I have made more than one attempt to improve the structure of the Porter +algorithm by making it follow the pattern of ending removal of the Romance +language stemmers. It is not hard to see why one should want to do this: +step 1b of the Porter stemmer removes ed and ing, which are +i-suffixes (*) attached to verbs. If these suffixes are removed, there +should be no need to remove d-suffixes which are not verbal, although +it will try to do so. This seems to be a deficiency in the Porter stemmer, +not shared by the Romance stemmers. Again, the divisions between steps +2, 3 and 4 seem rather arbitrary, and are not found in the Romance stemmers. +

+ +

+Nevertheless, these attempts at improvement have been abandoned. They seem +to lead to a more complicated algorithm with no very obvious improvements. +A reason for not taking note of the outcome of step 1b may be that +English endings do not determine word categories quite as strongly as +endings in the Romance languages. For example, condition and +position in French have to be nouns, but in English they can be verbs +as well as nouns, +

+ We are all conditioned by advertising
+ They are positioning themselves differently today +
+A possible reason for having separate steps 2, 3 and 4 is that +d-suffix combinations in English are quite complex, a point which has +been made +elsewhere. +

+ +

+But it is hardly surprising that after twenty years of use of the Porter +stemmer, certain improvements did suggest themselves, and a new algorithm +for English is therefore offered here. (It could be called the ‘Porter2’ +stemmer to distinguish it from the Porter stemmer, from which it derives.) +The changes are not so very extensive: (1) terminating y is changed to +i rather less often, (2) suffix us does not lose its s, (3) a +few additional suffixes are included for removal, including (4) suffix +ly. In addition, a small list of exceptional forms is included. In +December 2001 there were two further adjustments: (5) Steps 5a and 5b +of the old Porter stemmer were combined into a single step. This means +that undoubling final ll is not done with removal of final e. (6) +In Step 3 ative is removed only when in region R2. +(7) +In July +2005 a small adjustment was made (including a new step 0) to handle +apostrophe. +

+ +

+To begin with, here is the basic algorithm without reference to the +exceptional forms. An exact comparison with the Porter algorithm needs to +be done quite carefully if done at all. Here we indicate by * points +of departure, and by + additional features. In the sample vocabulary, +Porter and Porter2 stem slightly under 5% of words to different forms. +

+ +

Definition of the English stemmer

+ +

+Define a vowel as one of +

+ a   e   i   o   u   y +
+Define a double as one of +
+ bb   dd   ff   gg   mm   nn   pp   rr   tt +
+Define a valid li-ending as one of +
+ c   d   e   g   h   k   m   n   r   t +
+ +R1 is the region after the first non-vowel following a vowel, or the end of +the word if there is no such non-vowel. (This definition may be modified for certain exceptional +words — see below.) +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or the +end of the word if there is no such non-vowel. +(See note on R1 and R2.) +

+ +

+Define a short syllable in a word as either (a) a vowel followed by a +non-vowel other than w, x or Y and preceded by a non-vowel, or +* +(b) a vowel at the beginning of the word followed by a non-vowel. +

+ +

+So rap, +trap, entrap end with a short syllable, and ow, on, at are +classed as short syllables. But uproot, bestow, disturb do not end with a +short syllable. +

+ +

+A word is called short if it ends in a short syllable, and if R1 is null. +

+ +

+So bed, shed and shred are short words, bead, embed, beds are +not short words. +

+ +

+An apostrophe (') may be regarded as a letter. +(See note on apostrophes in English.) +

+ +

+If the word has two letters or less, leave it as it is. +

+ +

+Otherwise, do each of the following operations, +

+ +

+Remove initial ', if present. + Then, +

+ +

+Set initial y, or y after a vowel, to Y, and then establish the regions +R1 and R2. +(See note on vowel marking.) +

+ +

+Step 0: + +

+

+ Search for the longest among the suffixes, +

+
+
' +
's +
's' +
and remove if found. +
+
+Step 1a: +
+

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
sses +
replace by ss +
ied+   ies* +
replace by i if preceded by more than one letter, otherwise by ie + (so tiestie, criescri) +
s +
delete if the preceding word part contains a vowel not immediately before the +s (so gas and this retain the s, gaps and kiwis lose it) +
us+   ss +
do nothing +
+
+ +Step 1b: +
+

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
eed   eedly+ +
replace by ee if in R1 +
ed   edly+   ing   ingly+ +
delete if the preceding word part contains a vowel, and after the deletion: +
if the word ends at, bl or iz add e (so luxuriatluxuriate), or +
if the word ends with a double + remove the last letter (so hopphop), or +
if the word is short, add e (so hophope) +
+
+ +Step 1c: * +
+ replace suffix y or Y by i if preceded by a non-vowel which is not the + first letter of the word (so crycri, byby, saysay) +
+ +Step 2: +
+

+ Search for the longest among the following suffixes, and, if + found and in R1, perform the action indicated. +

+
+
tional:   replace by tion +
enci:   replace by ence +
anci:   replace by ance +
abli:   replace by able +
entli:   replace by ent +
izer   ization:   replace by ize +
ational   ation   ator:   replace by ate +
alism   aliti   alli:   replace by al +
fulness:   replace by ful +
ousli   ousness:   replace by ous +
iveness   iviti:   replace by ive +
biliti   bli+:   replace by ble +
ogi+:   replace by og if preceded by l +
fulli+:   replace by ful +
lessli+:   replace by less +
li+:   delete if preceded by a valid li-ending +
+
+ +Step 3: +
+

+ Search for the longest among the following suffixes, and, if + found and in R1, perform the action indicated. +

+
+
tional+:   replace by tion +
ational+:   replace by ate +
alize:   replace by al +
icate   iciti   ical:   replace by ic +
ful   ness:   delete +
ative*:   delete if in R2 +
+
+ +Step 4: +
+

+ Search for the longest among the following suffixes, and, if + found and in R2, perform the action indicated. +

+
+
al   ance   ence   er   ic   able   ible   ant   ement   + ment   ent   ism   ate   iti   ous   ive   ize +
delete +
ion +
delete if preceded by s or t +
+
+ +Step 5: * +
+

+ Search for the following suffixes, and, if + found, perform the action indicated. +

+
+
e +
delete if in R2, or in R1 and not preceded by a short + syllable +
l +
delete if in R2 and preceded by l +
+
+

+Finally, turn any remaining Y letters in the word back into lower case. +

+ +

Exceptional forms in general

+ +

+It is quite easy to expand a Snowball script so that certain exceptional +word forms get special treatment. The standard case is that certain words +W1,  W2  ..., instead of passing through the stemming process, are +mapped to the forms  X1,  X2  ... respectively. If the script does +the stemming by means of the call +

+
    define stem as C
+
+ +

+where  C  is a command, the exceptional cases can be dealt with by extending this to +

+
    define stem as ( exception or C )
+
+ +

+and putting in a routine  exception: +

+
+    define exception as (
+        [substring] atlimit among(
+            'W1'  ( <- 'X1' )
+            'W2'  ( <- 'X2' )
+            ...
+        )
+    )
+
+

+atlimit  causes the whole string to be tested for equality with one of +the  Wi, and if a match is found, the string is replaced with +Xi. +

+ +

+More precisely we might have a group of words  W11,  W12  ... +that need to be mapped to  X1, another group  W21,  W22 +... that need to be mapped to  X2, and so on, and a list of words +V1,  V2  ...  Vk  that are to remain invariant. The +exception  routine may then be written as follows: + +

+    among( 'W11' 'W12' ... (<- 'X1')
+           'W21' 'W22' ... (<- 'X2')
+           ...
+           'Wn1' 'Wn2' ... (<- 'Xn')
+           'V1' 'V2' ... 'Vk'
+         )
+
+

+And indeed the  exception1  routine for the English stemmer has just that +shape: +

+
    define exception1 as (
+
+         [substring] atlimit among(
+
+            /* special changes: */
+
+            'skis'      (<-'ski')
+            'skies'     (<-'sky')
+            'dying'     (<-'die')
+            'lying'     (<-'lie')
+            'tying'     (<-'tie')
+
+            /* special -LY cases */
+
+            'idly'      (<-'idl')
+            'gently'    (<-'gentl')
+            'ugly'      (<-'ugli')
+            'early'     (<-'earli')
+            'only'      (<-'onli')
+            'singly'    (<-'singl')
+
+            // ... extensions possible here ...
+
+            /* invariant forms: */
+
+            'sky'
+            'news'
+            'howe'
+
+            'atlas' 'cosmos' 'bias' 'andes' // not plural forms
+
+            // ... extensions possible here ...
+        )
+    )
+
+ + +

+(More will be said about the words that appear here shortly.) +

+ +

+Here we see words being treated exceptionally before stemming is done, but equally we could +treat stems exceptionally after stemming is done, and so, if we wish, map absorpt to +absorb, reduct to reduc etc., as in the +Lovins stemmer. +But more generally, throughout the algorithm, each significant step may have recognised +exceptions, and a suitably placed  among  will take care of them. For example, a point made +at least twice in the literature is that words beginning gener are overstemmed by the +Porter stemmer: +

+ +
+
generate
+ generates
+ generated
+ generating
+ general
+ generally
+ generic
+ generically
+ generous
+ generously
  →   gener + +
+ +

+To fix this over-stemming, we make an exception to the usual setting of p1, +the left point of R1, and therefore replace +

+ +
    gopast v  gopast non-v  setmark p1
+
+ + +

+with +

+ +
    among (
+        'gener'
+        // ... and other stems may be included here ...
+    ) or (gopast v  gopast non-v)
+    setmark p1
+
+ + +

+after which the words beginning gener stem as follows: +

+ +
+
generate
+ generates
+ generated
+ generating
+
  →   generat + +
general
+ generally
+
  →   general +
generic
+ generically
+
  →   generic +
generous
+ generously
+
  →   generous +
+ +

+Another example is given by the  exception2  routine, which is similar to  exception1, +but placed after the call of  Step_1a, which may have removed terminal s, +

+ +
    define exception2 as (
+
+        [substring] atlimit among(
+            'inning' 'outing' 'canning' 'herring' 'earring'
+            'proceed' 'exceed' 'succeed'
+
+            // ... extensions possible here ...
+
+        )
+    )
+
+ + +

+Snowball makes it easy therefore to add in lists of exceptions. But deciding what the lists of +exceptions should be is far from easy. Essentially there are two lines of attack, the +systematic and the piecemeal. One might systematically treat as exceptions the stem changes of +irregular verbs, for example. The piecemeal approach is to add in exceptions as people notice +them — like gener above. The problem with the systematic approach is that it should be +done by investigating the entire language vocabulary, and that is more than most people are +prepared to do. The problem with the piecemeal approach is that it is arbitrary, and usually +yields little. +

+ +

+The exception lists in the English stemmer are meant to be illustrative (‘this is how it is done if you +want to do it’), and were derived piecemeal. +

+ +

+a) +The new stemmer improves on the Porter stemmer in handling short words ending e and +y. There is however a mishandling of the four forms sky, skies, ski, +skis, which is easily corrected by treating three of these words as +special cases. +

+ +

+b) +Similarly there is a problem with the ing form of three letter verbs ending ie. There +are only three such verbs: die, lie and tie, so a special case is made for +dying, lying and tying. +

+ +

+c)
+One has to be a little careful of certain ing forms:
+inning, outing, canning, which one does not wish
+to be stemmed to
+in, out, can.
+

+ +

+d) +The removal of suffix ly, which is not in the Porter stemmer, has a number of exceptions. +Certain short-word exceptions are idly, gently, ugly, early, only, singly. +Rarer words (bristly, burly, curly, surly ...) are not included. +

+ +

+e) +The remaining words were included following complaints from users of the Porter algorithm. +news is not the plural of new (noticed when IR systems were being set up for +Reuters). Howe is a surname, and needs to be separated from how (noticed when +doing a search for ‘Sir Geoffrey Howe’ in a demonstration at the House of Commons). +succeed etc are not past participles, so the ed should not be removed (pointed out +to me in an email from India). herring should not stem to her (another email from +Russia). +

+ +

+f) +Finally, a few non-plural words ending s have been added. +

+ +

+Incidentally, this illustrates how much feedback to expect from the real users of a stemming +algorithm: seven or eight words in twenty years! +

+ +

+The definition of the English stemmer above is therefore supplemented by the following: +

+ +

Exceptional forms in the English stemmer

+ +
+

+ If the word begins gener, commun or arsen, set R1 to be the remainder of the
+ word.
+

+ +

+ Stem certain special words as follows, +

+ +
+
skis   →   ski +
skies   →   sky + +
dying
lying
tying
+
→ + die
lie
tie
+ + +
idly
gently
ugly
early
only
singly
+
→ + idl
gentl
ugli
earli
onli
singl
+ +
+ +

+ If one of the following is found, leave it invariant, +

+ +
+
sky
news
howe
+
atlas     cosmos     bias     andes +
+ +

+ Following step 1a, leave the following invariant, +

+ +
+
inning     outing     canning     herring     earring +
proceed     exceed     succeed +
+
+ +

The full algorithm in Snowball

+ +
integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+    prelude postlude
+    mark_regions
+    shortv
+    R1 R2
+    Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5
+    exception1
+    exception2
+)
+
+externals ( stem )
+
+groupings ( v v_WXY valid_LI )
+
+stringescapes {}
+
+define v        'aeiouy'
+define v_WXY    v + 'wxY'
+
+define valid_LI 'cdeghkmnrt'
+
+define prelude as (
+    unset Y_found
+    do ( ['{'}'] delete)
+    do ( ['y'] <-'Y' set Y_found)
+    do repeat(goto (v ['y']) <-'Y' set Y_found)
+)
+
+define mark_regions as (
+    $p1 = limit
+    $p2 = limit
+    do(
+        among (
+            'gener'
+            'commun'  //  added May 2005
+            'arsen'   //  added Nov 2006 (arsenic/arsenal)
+            // ... extensions possible here ...
+        ) or (gopast v  gopast non-v)
+        setmark p1
+        gopast v  gopast non-v  setmark p2
+    )
+)
+
+backwardmode (
+
+    define shortv as (
+        ( non-v_WXY v non-v )
+        or
+        ( non-v v atlimit )
+    )
+
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define Step_1a as (
+        try (
+            [substring] among (
+                '{'}' '{'}s' '{'}s{'}'
+                       (delete)
+            )
+        )
+        [substring] among (
+            'sses' (<-'ss')
+            'ied' 'ies'
+                   ((hop 2 <-'i') or <-'ie')
+            's'    (next gopast v delete)
+            'us' 'ss'
+        )
+    )
+
+    define Step_1b as (
+        [substring] among (
+            'eed' 'eedly'
+                (R1 <-'ee')
+            'ed' 'edly' 'ing' 'ingly'
+                (
+                test gopast v  delete
+                test substring among(
+                    'at' 'bl' 'iz'
+                         (<+ 'e')
+                    'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+                    // ignoring double c, h, j, k, q, v, w, and x
+                         ([next]  delete)
+                    ''   (atmark p1  test shortv  <+ 'e')
+                )
+            )
+        )
+    )
+
+    define Step_1c as (
+        ['y' or 'Y']
+        non-v not atlimit
+        <-'i'
+    )
+
+    define Step_2 as (
+        [substring] R1 among (
+            'tional'  (<-'tion')
+            'enci'    (<-'ence')
+            'anci'    (<-'ance')
+            'abli'    (<-'able')
+            'entli'   (<-'ent')
+            'izer' 'ization'
+                      (<-'ize')
+            'ational' 'ation' 'ator'
+                      (<-'ate')
+            'alism' 'aliti' 'alli'
+                      (<-'al')
+            'fulness' (<-'ful')
+            'ousli' 'ousness'
+                      (<-'ous')
+            'iveness' 'iviti'
+                      (<-'ive')
+            'biliti' 'bli'
+                      (<-'ble')
+            'ogi'     ('l' <-'og')
+            'fulli'   (<-'ful')
+            'lessli'  (<-'less')
+            'li'      (valid_LI delete)
+        )
+    )
+
+    define Step_3 as (
+        [substring] R1 among (
+            'tional'  (<- 'tion')
+            'ational' (<- 'ate')
+            'alize'   (<-'al')
+            'icate' 'iciti' 'ical'
+                      (<-'ic')
+            'ful' 'ness'
+                      (delete)
+            'ative'
+                      (R2 delete)  // 'R2' added Dec 2001
+        )
+    )
+
+    define Step_4 as (
+        [substring] R2 among (
+            'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
+            'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
+                      (delete)
+            'ion'     ('s' or 't' delete)
+        )
+    )
+
+    define Step_5 as (
+        [substring] among (
+            'e' (R2 or (R1 not shortv) delete)
+            'l' (R2 'l' delete)
+        )
+    )
+
+    define exception2 as (
+
+        [substring] atlimit among(
+            'inning' 'outing' 'canning' 'herring' 'earring'
+            'proceed' 'exceed' 'succeed'
+
+            // ... extensions possible here ...
+
+        )
+    )
+)
+
+define exception1 as (
+
+    [substring] atlimit among(
+
+        /* special changes: */
+
+        'skis'      (<-'ski')
+        'skies'     (<-'sky')
+        'dying'     (<-'die')
+        'lying'     (<-'lie')
+        'tying'     (<-'tie')
+
+        /* special -LY cases */
+
+        'idly'      (<-'idl')
+        'gently'    (<-'gentl')
+        'ugly'      (<-'ugli')
+        'early'     (<-'earli')
+        'only'      (<-'onli')
+        'singly'    (<-'singl')
+
+        // ... extensions possible here ...
+
+        /* invariant forms: */
+
+        'sky'
+        'news'
+        'howe'
+
+        'atlas' 'cosmos' 'bias' 'andes' // not plural forms
+
+        // ... extensions possible here ...
+    )
+)
+
+define postlude as (Y_found  repeat(goto (['Y']) <-'y'))
+
+define stem as (
+
+    exception1 or
+    not hop 3 or (
+        do prelude
+        do mark_regions
+        backwards (
+
+            do Step_1a
+
+            exception2 or (
+
+                do Step_1b
+                do Step_1c
+
+                do Step_2
+                do Step_3
+                do Step_4
+
+                do Step_5
+            )
+        )
+        do postlude
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/english/stemmer.tt b/algorithms/english/stemmer.tt new file mode 100644 index 0000000..8b35c7a --- /dev/null +++ b/algorithms/english/stemmer.tt @@ -0,0 +1,583 @@ +[% header('The English (Porter2) stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'consign', 'knack']) %] + +

Developing the English stemmer

+ +

+(Revised slightly, December 2001)
+(Further revised, September 2002) +

+ +

+I have made more than one attempt to improve the structure of the Porter +algorithm by making it follow the pattern of ending removal of the Romance +language stemmers. It is not hard to see why one should want to do this: +step 1b of the Porter stemmer removes ed and ing, which are +i-suffixes (*) attached to verbs. If these suffixes are removed, there +should be no need to remove d-suffixes which are not verbal, although +it will try to do so. This seems to be a deficiency in the Porter stemmer, +not shared by the Romance stemmers. Again, the divisions between steps +2, 3 and 4 seem rather arbitrary, and are not found in the Romance stemmers. +

+ +

+Nevertheless, these attempts at improvement have been abandoned. They seem +to lead to a more complicated algorithm with no very obvious improvements. +A reason for not taking note of the outcome of step 1b may be that +English endings do not determine word categories quite as strongly as +endings in the Romance languages. For example, condition and +position in French have to be nouns, but in English they can be verbs +as well as nouns, +

+ We are all conditioned by advertising
+ They are positioning themselves differently today +
+A possible reason for having separate steps 2, 3 and 4 is that +d-suffix combinations in English are quite complex, a point which has +been made +elsewhere. +

+ +

+But it is hardly surprising that after twenty years of use of the Porter +stemmer, certain improvements did suggest themselves, and a new algorithm +for English is therefore offered here. (It could be called the ‘Porter2’ +stemmer to distinguish it from the Porter stemmer, from which it derives.) +The changes are not so very extensive: (1) terminating y is changed to +i rather less often, (2) suffix us does not lose its s, (3) a +few additional suffixes are included for removal, including (4) suffix +ly. In addition, a small list of exceptional forms is included. In +December 2001 there were two further adjustments: (5) Steps 5a and 5b +of the old Porter stemmer were combined into a single step. This means +that undoubling final ll is not done with removal of final e. (6) +In Step 3 ative is removed only when in region R2. +(7) +In July +2005 a small adjustment was made (including a new step 0) to handle +apostrophe. +

+ +

+To begin with, here is the basic algorithm without reference to the +exceptional forms. An exact comparison with the Porter algorithm needs to +be done quite carefully if done at all. Here we indicate by * points +of departure, and by + additional features. In the sample vocabulary, +Porter and Porter2 stem slightly under 5% of words to different forms. +

+ +

Definition of the English stemmer

+ +

+Define a vowel as one of +

+ a   e   i   o   u   y +
+Define a double as one of +
+ bb   dd   ff   gg   mm   nn   pp   rr   tt +
+Define a valid li-ending as one of +
+ c   d   e   g   h   k   m   n   r   t +
+ +R1 is the region after the first non-vowel following a vowel, or the end of +the word if there is no such non-vowel. (This definition may be modified for certain exceptional +words — see below.) +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or the +end of the word if there is no such non-vowel. +(See note on R1 and R2.) +

+ +

+Define a short syllable in a word as either (a) a vowel followed by a +non-vowel other than w, x or Y and preceded by a non-vowel, or +* +(b) a vowel at the beginning of the word followed by a non-vowel. +

+ +

+So rap, +trap, entrap end with a short syllable, and ow, on, at are +classed as short syllables. But uproot, bestow, disturb do not end with a +short syllable. +

+ +

+A word is called short if it ends in a short syllable, and if R1 is null. +

+ +

+So bed, shed and shred are short words, bead, embed, beds are +not short words. +

+ +

+An apostrophe (') may be regarded as a letter. +(See note on apostrophes in English.) +

+ +

+If the word has two letters or less, leave it as it is. +

+ +

+Otherwise, do each of the following operations, +

+ +

+Remove initial ', if present. + Then, +

+ +

+Set initial y, or y after a vowel, to Y, and then establish the regions +R1 and R2. +(See note on vowel marking.) +

+ +

+Step 0: + +

+

+ Search for the longest among the suffixes, +

+
+
' +
's +
's' +
and remove if found. +
+
+Step 1a: +
+

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
sses +
replace by ss +
ied+   ies* +
replace by i if preceded by more than one letter, otherwise by ie + (so tiestie, criescri) +
s +
delete if the preceding word part contains a vowel not immediately before the +s (so gas and this retain the s, gaps and kiwis lose it) +
us+   ss +
do nothing +
+
+ +Step 1b: +
+

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
eed   eedly+ +
replace by ee if in R1 +
ed   edly+   ing   ingly+ +
delete if the preceding word part contains a vowel, and after the deletion: +
if the word ends at, bl or iz add e (so luxuriatluxuriate), or +
if the word ends with a double + remove the last letter (so hopphop), or +
if the word is short, add e (so hophope) +
+
+ +Step 1c: * +
+ replace suffix y or Y by i if preceded by a non-vowel which is not the + first letter of the word (so crycri, byby, saysay) +
+ +Step 2: +
+

+ Search for the longest among the following suffixes, and, if + found and in R1, perform the action indicated. +

+
+
tional:   replace by tion +
enci:   replace by ence +
anci:   replace by ance +
abli:   replace by able +
entli:   replace by ent +
izer   ization:   replace by ize +
ational   ation   ator:   replace by ate +
alism   aliti   alli:   replace by al +
fulness:   replace by ful +
ousli   ousness:   replace by ous +
iveness   iviti:   replace by ive +
biliti   bli+:   replace by ble +
ogi+:   replace by og if preceded by l +
fulli+:   replace by ful +
lessli+:   replace by less +
li+:   delete if preceded by a valid li-ending +
+
+ +Step 3: +
+

+ Search for the longest among the following suffixes, and, if + found and in R1, perform the action indicated. +

+
+
tional+:   replace by tion +
ational+:   replace by ate +
alize:   replace by al +
icate   iciti   ical:   replace by ic +
ful   ness:   delete +
ative*:   delete if in R2 +
+
+ +Step 4: +
+

+ Search for the longest among the following suffixes, and, if + found and in R2, perform the action indicated. +

+
+
al   ance   ence   er   ic   able   ible   ant   ement   + ment   ent   ism   ate   iti   ous   ive   ize +
delete +
ion +
delete if preceded by s or t +
+
+ +Step 5: * +
+

+ Search for the following suffixes, and, if + found, perform the action indicated. +

+
+
e +
delete if in R2, or in R1 and not preceded by a short + syllable +
l +
delete if in R2 and preceded by l +
+
+

+Finally, turn any remaining Y letters in the word back into lower case. +

+ +

Exceptional forms in general

+ +

+It is quite easy to expand a Snowball script so that certain exceptional +word forms get special treatment. The standard case is that certain words +W1,  W2  ..., instead of passing through the stemming process, are +mapped to the forms  X1,  X2  ... respectively. If the script does +the stemming by means of the call +

+[% highlight(" + define stem as C +") %] +

+where  C  is a command, the exceptional cases can be dealt with by extending this to +

+[% highlight(" + define stem as ( exception or C ) +") %] +

+and putting in a routine  exception: +

+
+    define exception as (
+        [substring] atlimit among(
+            'W1'  ( <- 'X1' )
+            'W2'  ( <- 'X2' )
+            ...
+        )
+    )
+
+

+atlimit  causes the whole string to be tested for equality with one of +the  Wi, and if a match is found, the string is replaced with +Xi. +

+ +

+More precisely we might have a group of words  W11,  W12  ... +that need to be mapped to  X1, another group  W21,  W22 +... that need to be mapped to  X2, and so on, and a list of words +V1,  V2  ...  Vk  that are to remain invariant. The +exception  routine may then be written as follows: + +

+    among( 'W11' 'W12' ... (<- 'X1')
+           'W21' 'W22' ... (<- 'X2')
+           ...
+           'Wn1' 'Wn2' ... (<- 'Xn')
+           'V1' 'V2' ... 'Vk'
+         )
+
+

+And indeed the  exception1  routine for the English stemmer has just that +shape: +

+[% highlight(" + define exception1 as ( + + [substring] atlimit among( + + /* special changes: */ + + 'skis' (<-'ski') + 'skies' (<-'sky') + 'dying' (<-'die') + 'lying' (<-'lie') + 'tying' (<-'tie') + + /* special -LY cases */ + + 'idly' (<-'idl') + 'gently' (<-'gentl') + 'ugly' (<-'ugli') + 'early' (<-'earli') + 'only' (<-'onli') + 'singly' (<-'singl') + + // ... extensions possible here ... + + /* invariant forms: */ + + 'sky' + 'news' + 'howe' + + 'atlas' 'cosmos' 'bias' 'andes' // not plural forms + + // ... extensions possible here ... + ) + ) +") %] + +

+(More will be said about the words that appear here shortly.) +

+ +

+Here we see words being treated exceptionally before stemming is done, but equally we could +treat stems exceptionally after stemming is done, and so, if we wish, map absorpt to +absorb, reduct to reduc etc., as in the +Lovins stemmer. +But more generally, throughout the algorithm, each significant step may have recognised +exceptions, and a suitably placed  among  will take care of them. For example, a point made +at least twice in the literature is that words beginning gener are overstemmed by the +Porter stemmer: +

+ +
+
generate
+ generates
+ generated
+ generating
+ general
+ generally
+ generic
+ generically
+ generous
+ generously
  →   gener + +
+ +

+To fix this over-stemming, we make an exception to the usual setting of p1, +the left point of R1, and therefore replace +

+ +[% highlight(" + gopast v gopast non-v setmark p1 +") %] + +

+with +

+
+[% highlight("
+    among (
+        'gener'
+        // ... and other stems may be included here ...
+    ) or (gopast v  gopast non-v)
+    setmark p1
+") %]
+

+after which the words beginning gener stem as follows: +

+ +
+
generate
+ generates
+ generated
+ generating
+
  →   generat + +
general
+ generally
+
  →   general +
generic
+ generically
+
  →   generic +
generous
+ generously
+
  →   generous +
+ +

+Another example is given by the  exception2  routine, which is similar to  exception1, +but placed after the call of  Step_1a, which may have removed terminal s, +

+
+[% highlight("
+    define exception2 as (
+
+        [substring] atlimit among(
+            'inning' 'outing' 'canning' 'herring' 'earring'
+            'proceed' 'exceed' 'succeed'
+
+            // ... extensions possible here ...
+
+        )
+    )
+") %]
+

+Snowball makes it easy therefore to add in lists of exceptions. But deciding what the lists of +exceptions should be is far from easy. Essentially there are two lines of attack, the +systematic and the piecemeal. One might systematically treat as exceptions the stem changes of +irregular verbs, for example. The piecemeal approach is to add in exceptions as people notice +them — like gener above. The problem with the systematic approach is that it should be +done by investigating the entire language vocabulary, and that is more than most people are +prepared to do. The problem with the piecemeal approach is that it is arbitrary, and usually +yields little. +

+ +

+The exception lists in the English stemmer are meant to be illustrative (‘this is how it is done if you +want to do it’), and were derived piecemeal. +

+ +

+a) +The new stemmer improves on the Porter stemmer in handling short words ending e and +y. There is however a mishandling of the four forms sky, skies, ski, +skis, which is easily corrected by treating three of these words as +special cases. +

+ +

+b) +Similarly there is a problem with the ing form of three letter verbs ending ie. There +are only three such verbs: die, lie and tie, so a special case is made for +dying, lying and tying. +

+ +

+c)
+One has to be a little careful of certain ing forms:
+inning, outing, canning, which one does not wish
+to be stemmed to
+in, out, can.
+

+ +

+d) +The removal of suffix ly, which is not in the Porter stemmer, has a number of exceptions. +Certain short-word exceptions are idly, gently, ugly, early, only, singly. +Rarer words (bristly, burly, curly, surly ...) are not included. +

+ +

+e) +The remaining words were included following complaints from users of the Porter algorithm. +news is not the plural of new (noticed when IR systems were being set up for +Reuters). Howe is a surname, and needs to be separated from how (noticed when +doing a search for ‘Sir Geoffrey Howe’ in a demonstration at the House of Commons). +succeed etc are not past participles, so the ed should not be removed (pointed out +to me in an email from India). herring should not stem to her (another email from +Russia). +

+ +

+f) +Finally, a few non-plural words ending s have been added. +

+ +

+Incidentally, this illustrates how much feedback to expect from the real users of a stemming +algorithm: seven or eight words in twenty years! +

+ +

+The definition of the English stemmer above is therefore supplemented by the following: +

+ +

Exceptional forms in the English stemmer

+ +
+

+ If the word begins gener, commun or arsen, set R1 to be the remainder of the
+ word.
+

+ +

+ Stem certain special words as follows, +

+ +
+
skis   →   ski +
skies   →   sky + +
dying
lying
tying
+
→ + die
lie
tie
+ + +
idly
gently
ugly
early
only
singly
+
→ + idl
gentl
ugli
earli
onli
singl
+ +
+ +

+ If one of the following is found, leave it invariant, +

+ +
+
sky
news
howe
+
atlas     cosmos     bias     andes +
+ +

+ Following step 1a, leave the following invariant, +

+ +
+
inning     outing     canning     herring     earring +
proceed     exceed     succeed +
+
+ +

The full algorithm in Snowball

+ +[% highlight_file('english') %] + +[% footer %] diff --git a/algorithms/english/stop.txt b/algorithms/english/stop.txt new file mode 100644 index 0000000..aee35c5 --- /dev/null +++ b/algorithms/english/stop.txt @@ -0,0 +1,312 @@ + + | An English stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | Many of the forms below are quite rare (e.g. "yourselves") but included for + | completeness. + + | PRONOUNS FORMS + | 1st person sing + +i | subject, always in upper case of course + +me | object +my | possessive adjective + | the possessive pronoun `mine' is best suppressed, because of the + | sense of coal-mine etc. +myself | reflexive + | 1st person plural +we | subject + +| us | object + | care is required here because US = United States. It is usually + | safe to remove it if it is in lower case. +our | possessive adjective +ours | possessive pronoun +ourselves | reflexive + | second person (archaic `thou' forms not included) +you | subject and object +your | possessive adjective +yours | possessive pronoun +yourself | reflexive (singular) +yourselves | reflexive (plural) + | third person singular +he | subject +him | object +his | possessive adjective and pronoun +himself | reflexive + +she | subject +her | object and possessive adjective +hers | possessive pronoun +herself | reflexive + +it | subject and object +its | possessive adjective +itself | reflexive + | third person plural +they | subject +them | object +their | possessive adjective +theirs | possessive pronoun +themselves | reflexive + | other forms (demonstratives, interrogatives) +what +which +who +whom +this +that +these +those + + | VERB FORMS (using F.R. 
Palmer's nomenclature) + | BE +am | 1st person, present +is | -s form (3rd person, present) +are | present +was | 1st person, past +were | past +be | infinitive +been | past participle +being | -ing form + | HAVE +have | simple +has | -s form +had | past +having | -ing form + | DO +do | simple +does | -s form +did | past +doing | -ing form + + | The forms below are, I believe, best omitted, because of the significant + | homonym forms: + + | He made a WILL + | old tin CAN + | merry month of MAY + | a smell of MUST + | fight the good fight with all thy MIGHT + + | would, could, should, ought might however be included + + | | AUXILIARIES + | | WILL + |will + +would + + | | SHALL + |shall + +should + + | | CAN + |can + +could + + | | MAY + |may + |might + | | MUST + |must + | | OUGHT + +ought + + | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing + | pronoun + verb + +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll + + | verb + negation + +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't + + | auxiliary + negation + +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't + + | miscellaneous forms + +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's + + | rarer forms + + | daren't needn't + + | doubtful forms + + | oughtn't mightn't + + | ARTICLES +a +an +the + + | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so + | high, that classification is pointless.) 
+and +but +if +or +because +as +until +while + +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under + +again +further +then +once + +here +there +when +where +why +how + +all +any +both +each +few +more +most +other +some +such + +no +nor +not +only +own +same +so +than +too +very + + | Just for the record, the following words are among the commonest in English + + | one + | every + | least + | less + | many + | now + | ever + | never + | say + | says + | said + | also + | get + | go + | goes + | just + | made + | make + | put + | see + | seen + | whether + | like + | well + | back + | even + | still + | way + | take + | since + | another + | however + | two + | three + | four + | five + | first + | second + | new + | old + | high + | long + diff --git a/algorithms/estonian/stemmer.html b/algorithms/estonian/stemmer.html new file mode 100644 index 0000000..62f5154 --- /dev/null +++ b/algorithms/estonian/stemmer.html @@ -0,0 +1,657 @@ + + + + + + + + + + Estonian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Estonian stemming algorithm

+ + +

Links to resources

+ + + +
+ +

+Here is a sample of vocabulary, with the stemmed forms that will
+be generated by the algorithm.

+ +

+Here is a sample of Estonian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+raamat
+raamatu
+raamatut
+raamatule
+raamatud
+raamatute
+raamatuid
+raamatutele
+raamatutestki
+hele
+heleda
+heledat
+heledale
+heledad
+heledate
+heledaid
+heledatele
+heledam
+heledama
+heledamat
+heledamad
+heledamate
+heledamaid
+heledamatelegi
+heledaim
+heledaima
+heledaimat
+heledaimale
+heledaimad
+heledaimate
+heledaimaid
+heledaimatelt
+hobune
+hobuse
+hobust
+hobusele
+hobused
+hobuste
+hobuseid
+hobustele
+
+raama
+raama
+raama
+raama
+raama
+raama
+raama
+raama
+raama
+hele
+hele
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+heleda
+hobune
+hobuse
+hobu
+hobuse
+hobuse
+hobus
+hobuse
+hobus
+
+hüpata
+hüppa
+hüppaksin
+hüppaksid
+hüppaks
+hüppaksime
+hüppaksite
+hüppan
+hüppad
+hüppab
+hüppame
+hüppate
+hüppavad
+hüppasin
+hüppasid
+hüppas
+hüppasime
+hüppasite
+hüpanuksite
+hüpatakse
+hüpati
+hüpanud
+hüpanutest
+hüpates
+hüppavat
+hüppavatele
+hüppamata
+hüppamast
+hüljes
+hülge
+hüljest
+hülgesse
+hüljeste
+hülgeid
+hüljestesse
+hülgeisse
+ohutule
+ohutud
+ohutuid
+ohututele
+
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpati
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hüpa
+hülje
+hülge
+hülje
+hülge
+hüljes
+hülge
+hüljes
+hülge
+ohu
+ohu
+ohu
+ohu
+
+
+
+ +

+This algorithm was written in collaboration with the Estonian text analytics enterprise Texta.

+ +

+Letters in Estonian include the following accented forms, +

+ +
+ ä   ö   õ   ü   š   ž +
+ +

+The following letters are vowels (V1): +

+ +
+ a   e   i   o   u   õ   ä   ö   ü +
+ +

+RV is defined as one of the following: +

+ +
+ a   e   i   u   o +
+ +

+KI is defined as one of the following (letters possible before -ki emphasis): +

+ +
+ k   p   t   g   b   d   s   h   f   š   z   ž +
+ +

+GI is defined as one of the following (letters possible before -gi emphasis): +

+ +
+ c   j   l   m   n   q   r   v   w   x   a   e   i   o   u   õ   ä   ö   ü +
+ +

+R1 in this algorithm is defined as a region after the first consonant preceded by a vowel (laul[nud], mõt[teid], kar[tuleid], saab[as]). If there’s no such region, then R1 is empty (laul[Ø], saun[Ø]). Limitations in steps (such as "if preceded by RV") are not restricted to the R1 region. +

+ +

+LONGV is defined as one of the following: +

+ +
+ aa   ee   ii   oo   uu   ää   öö   üü   õõ +
+ +

+Do step 0. If nothing was changed in step 0, continue with the steps, otherwise stop. Do step 1 and step 2. If nothing was changed in step 2, do steps 3, 4, 5, 6, 7, 8 and 9. If something was changed in step 2, do step 9. +

+ +

+Step 0: verb_exceptions +

+ +
+ Search for some frequent irregular short verbs which wouldn’t have been found otherwise and give them a chosen stem. +

+
+
joon   jood   joob   joote   joome   joovad +
replace by joo +
jõin   jõid   jõi   jõime   jõite +
replace by joo +
joomata   juuakse   joodakse   juua   jooma +
replace by joo +
saan   saad   saab   saate   saame   saavad +
replace by saa +
saaksin   saaksid   saaks   saaksite   saaksime +
replace by saa +
sain   said   sai   saite   saime +
replace by saa +
saamata   saadakse   saadi   saama   saada +
replace by saa +
viin   viid   viib   viite   viime   viivad +
replace by viima +
viiksin   viiksid   viiks   viiksite   viiksime +
replace by viima +
viisin   viisite   viisime +
replace by viima +
viimata   viiakse   viidi   viima   viia +
replace by viima +
keen   keeb   keed   kees   keeme   keete   keevad +
replace by keesi +
keeksin   keeks   keeksid   keeksime   keeksite +
replace by keesi +
keemata   keema   keeta   keedakse +
replace by keesi +
löön   lööd   lööb   lööme   lööte   löövad +
replace by löö +
lööksin   lööksid   lööks   lööksime   lööksite +
replace by löö +
löömata   lüüakse   löödakse   löödi   lööma   lüüa +
replace by löö +
lõin   lõid   lõi   lõime   lõite +
replace by lõi +
loon   lood   loob   loome   loote   loovad +
replace by loo +
looksin   looksid   looks   looksime   looksite +
replace by loo +
loomata   luuakse   loodi   luua   looma +
replace by loo +
käin   käib   käid   käis   käime   käite   käivad +
replace by käisi +
käiksin   käiks   käiksid   käiksime   käiksite +
replace by käisi +
käimata   käiakse   käidi   käia   käima +
replace by käisi +
söön   sööb   sööd   sööme   sööte   söövad +
replace by söö +
sööksin   sööks   sööksid   sööksime   sööksite +
replace by söö +
sõin   sõi   sõid   sõime   sõite +
replace by söö +
söömata   süüakse   söödakse   söödi   sööma   süüa +
replace by söö +
toon   tood   toob   toote   toome   toovad +
replace by too +
tooksin   tooksid   tooks   tooksite   tooksime +
replace by too +
tõin   tõid   tõi   tõime   tõite +
replace by too +
toomata   tuuakse   toodi   tooma   tuua +
replace by too +
võin   võid   võib   võime   võis   võite   võivad +
replace by võisi +
võiksin   võiksid   võiks   võiksime   võiksite +
replace by võisi +
võimata   võidakse   võidi   võida   võima +
replace by võisi +
jään   jääd   jääb   jääme   jääte   jäävad +
replace by jääma +
jääksin   jääksid   jääks   jääksime   jääksite +
replace by jääma +
jäime   jäite   jäin   jäid   jäi +
replace by jääma +
jäämata   jäädakse   jääda   jääma   jäädi +
replace by jääma +
müün   müüd   müüb   müüs   müüme   müüte   müüvad +
replace by müüsi +
müüksin   müüksid   müüks   müüksime   müüksite +
replace by müüsi +
müümata   müüakse   müüdi   müüa   müüma +
replace by müüsi +
loeb   loen   loed   loeme   loete   loevad +
replace by luge +
loeks   loeksin   loeksid   loeksime   loeksite +
replace by luge +
põen   põeb   põed   põeme   põete   põevad +
replace by põde +
põeksin   põeks   põeksid   põeksime   põeksite +
replace by põde +
laon   laob   laod   laome   laote   laovad +
replace by ladu +
laoksin   laoks   laoksid   laoksime   laoksite +
replace by ladu +
teeksin   teeks   teeksid   teeksime   teeksite +
replace by tegi +
teen   teeb   teed   teeme   teete   teevad +
replace by tegi +
tegemata   tehakse   tehti   tegema   teha +
replace by tegi +
näen   näeb   näed   näeme   näete   näevad +
replace by nägi +
näeksin   näeks   näeksid   näeksime   näeksite +
replace by nägi +
nägemata   nähakse   nähti   näha   nägema +
replace by nägi +
+
+ + +

+Step 1: emphasis +

+ +
+ Search for the longest among the following suffixes in R1, and perform the action indicated +

+
+
Test if there are at least 4 characters before the R1 region. If so, continue this step
+
gi +
if preceded by a character from GI which is not the second character of a long vowel as defined by LONGV, delete +
ki +
if preceded by KI, delete +
+
+ +

+Step 2: verb +

+
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+ +
+
nuksin   nuksime   nuksid   nuksite +
delete +
ksin   ksid   ksime   ksite +
delete +
mata +
delete +
takse   dakse +
delete +
taks   daks +
delete +
akse +
replace with a +
sime +
delete +
site +
delete +
sin +
delete +
me +
if preceded by V1, delete +
da +
if preceded by V1, delete +
n +
if preceded by V1, delete +
b +
if preceded by V1, delete +
+
+ +

+Step 3: special_noun_endings +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
lasse +
replace by lase +
last +
replace by lase +
lane +
replace by lase +
lasi +
replace by lase +
misse +
replace by mise +
mist +
replace by mise +
mine +
replace by mise +
misi +
replace by mise +
lisse +
replace by lise +
list +
replace by lise +
line +
replace by lise +
lisi +
replace by lise +
+
+ +

+Step 4: case_ending +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
sse if preceded by RV or LONGV +
st if preceded by RV or LONGV +
le if preceded by RV or LONGV +
lt if preceded by RV or LONGV +
ga if preceded by RV or LONGV +
ks if preceded by RV or LONGV +
ta if preceded by RV or LONGV +
t if preceded by at least 4 characters +
s if preceded by RV or LONGV +
l if preceded by RV or LONGV +
delete +
+
+ +

+Step 5: plural_three_first_cases +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
ikkude +
replace by iku +
ikke +
replace by iku +
ike +
replace by iku +
sid +
if it is not preceded by LONGV, delete +
te +
if it doesn't have at least 4 characters before it, replace by t. +
Otherwise: +
a) if it is preceded by mis, replace with e, +
b) if it is preceded by las, replace with e, +
c) if it is preceded by lis, replace with e, +
if it wasn't replaced with e in steps a)-c) and it isn't preceded by t, delete +
de if preceded by RV or LONGV +
delete +
d if preceded by RV or LONGV +
delete +
+
+ +

+Step 6: degrees +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
mai if preceded by RV +
ma +
m if preceded by RV +
delete +
+
+ +

+Step 7: i_plural +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
i if preceded by RV +
delete +
+
+ +

+Step 8: nu +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
nu +
tu +
du +
va +
delete +
+
+ +

+Step 9: undouble_kpt +

+ +
+ Undouble consonant if word ending is kk+V1, tt+V1, pp+V1, + provided the vowel is in R1. +
+ +

The full algorithm in Snowball

+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/estonian/stemmer.tt b/algorithms/estonian/stemmer.tt new file mode 100644 index 0000000..dec4b74 --- /dev/null +++ b/algorithms/estonian/stemmer.tt @@ -0,0 +1,485 @@ +[% header('Estonian stemming algorithm') %] + +

Links to resources

+ + + +
+ +

+Here is a sample of vocabulary, with the stemmed forms that will
+be generated by the algorithm.

+ +[% algorithm_vocab([ +'raamat', +'raamatu', +'raamatut', +'raamatule', +'raamatud', +'raamatute', +'raamatuid', +'raamatutele', +'raamatutestki', +'hele', +'heleda', +'heledat', +'heledale', +'heledad', +'heledate', +'heledaid', +'heledatele', +'heledam', +'heledama', +'heledamat', +'heledamad', +'heledamate', +'heledamaid', +'heledamatelegi', +'heledaim', +'heledaima', +'heledaimat', +'heledaimale', +'heledaimad', +'heledaimate', +'heledaimaid', +'heledaimatelt', +'hobune', +'hobuse', +'hobust', +'hobusele', +'hobused', +'hobuste', +'hobuseid', +'hobustele', +], [ +'hüpata', +'hüppa', +'hüppaksin', +'hüppaksid', +'hüppaks', +'hüppaksime', +'hüppaksite', +'hüppan', +'hüppad', +'hüppab', +'hüppame', +'hüppate', +'hüppavad', +'hüppasin', +'hüppasid', +'hüppas', +'hüppasime', +'hüppasite', +'hüpanuksite', +'hüpatakse', +'hüpati', +'hüpanud', +'hüpanutest', +'hüpates', +'hüppavat', +'hüppavatele', +'hüppamata', +'hüppamast', +'hüljes', +'hülge', +'hüljest', +'hülgesse', +'hüljeste', +'hülgeid', +'hüljestesse', +'hülgeisse', +'ohutule', +'ohutud', +'ohutuid', +'ohututele', +]) %] +
+ +

+This algorithm was written in collaboration with the Estonian text analytics enterprise Texta.

+ +

+Letters in Estonian include the following accented forms, +

+ +
+ ä   ö   õ   ü   š   ž +
+ +

+The following letters are vowels (V1): +

+ +
+ a   e   i   o   u   õ   ä   ö   ü +
+ +

+RV is defined as one of the following: +

+ +
+ a   e   i   u   o +
+ +

+KI is defined as one of the following (letters possible before -ki emphasis): +

+ +
+ k   p   t   g   b   d   s   h   f   š   z   ž +
+ +

+GI is defined as one of the following (letters possible before -gi emphasis): +

+ +
+ c   j   l   m   n   q   r   v   w   x   a   e   i   o   u   õ   ä   ö   ü +
+ +

+R1 in this algorithm is defined as a region after the first consonant preceded by a vowel (laul[nud], mõt[teid], kar[tuleid], saab[as]). If there’s no such region, then R1 is empty (laul[Ø], saun[Ø]). Limitations in steps (such as "if preceded by RV") are not restricted to the R1 region. +

+ +

+LONGV is defined as one of the following: +

+ +
+ aa   ee   ii   oo   uu   ää   öö   üü   õõ +
+ +

+Do step 0. If nothing was changed in step 0, continue with the steps, otherwise stop. Do step 1 and step 2. If nothing was changed in step 2, do steps 3, 4, 5, 6, 7, 8 and 9. If something was changed in step 2, do step 9. +

+ +

+Step 0: verb_exceptions +

+ +
+ Search for some frequent irregular short verbs which wouldn’t have been found otherwise and give them a chosen stem. +

+
+
joon   jood   joob   joote   joome   joovad +
replace by joo +
jõin   jõid   jõi   jõime   jõite +
replace by joo +
joomata   juuakse   joodakse   juua   jooma +
replace by joo +
saan   saad   saab   saate   saame   saavad +
replace by saa +
saaksin   saaksid   saaks   saaksite   saaksime +
replace by saa +
sain   said   sai   saite   saime +
replace by saa +
saamata   saadakse   saadi   saama   saada +
replace by saa +
viin   viid   viib   viite   viime   viivad +
replace by viima +
viiksin   viiksid   viiks   viiksite   viiksime +
replace by viima +
viisin   viisite   viisime +
replace by viima +
viimata   viiakse   viidi   viima   viia +
replace by viima +
keen   keeb   keed   kees   keeme   keete   keevad +
replace by keesi +
keeksin   keeks   keeksid   keeksime   keeksite +
replace by keesi +
keemata   keema   keeta   keedakse +
replace by keesi +
löön   lööd   lööb   lööme   lööte   löövad +
replace by löö +
lööksin   lööksid   lööks   lööksime   lööksite +
replace by löö +
löömata   lüüakse   löödakse   löödi   lööma   lüüa +
replace by löö +
lõin   lõid   lõi   lõime   lõite +
replace by lõi +
loon   lood   loob   loome   loote   loovad +
replace by loo +
looksin   looksid   looks   looksime   looksite +
replace by loo +
loomata   luuakse   loodi   luua   looma +
replace by loo +
käin   käib   käid   käis   käime   käite   käivad +
replace by käisi +
käiksin   käiks   käiksid   käiksime   käiksite +
replace by käisi +
käimata   käiakse   käidi   käia   käima +
replace by käisi +
söön   sööb   sööd   sööme   sööte   söövad +
replace by söö +
sööksin   sööks   sööksid   sööksime   sööksite +
replace by söö +
sõin   sõi   sõid   sõime   sõite +
replace by söö +
söömata   süüakse   söödakse   söödi   sööma   süüa +
replace by söö +
toon   tood   toob   toote   toome   toovad +
replace by too +
tooksin   tooksid   tooks   tooksite   tooksime +
replace by too +
tõin   tõid   tõi   tõime   tõite +
replace by too +
toomata   tuuakse   toodi   tooma   tuua +
replace by too +
võin   võid   võib   võime   võis   võite   võivad +
replace by võisi +
võiksin   võiksid   võiks   võiksime   võiksite +
replace by võisi +
võimata   võidakse   võidi   võida   võima +
replace by võisi +
jään   jääd   jääb   jääme   jääte   jäävad +
replace by jääma +
jääksin   jääksid   jääks   jääksime   jääksite +
replace by jääma +
jäime   jäite   jäin   jäid   jäi +
replace by jääma +
jäämata   jäädakse   jääda   jääma   jäädi +
replace by jääma +
müün   müüd   müüb   müüs   müüme   müüte   müüvad +
replace by müüsi +
müüksin   müüksid   müüks   müüksime   müüksite +
replace by müüsi +
müümata   müüakse   müüdi   müüa   müüma +
replace by müüsi +
loeb   loen   loed   loeme   loete   loevad +
replace by luge +
loeks   loeksin   loeksid   loeksime   loeksite +
replace by luge +
põen   põeb   põed   põeme   põete   põevad +
replace by põde +
põeksin   põeks   põeksid   põeksime   põeksite +
replace by põde +
laon   laob   laod   laome   laote   laovad +
replace by ladu +
laoksin   laoks   laoksid   laoksime   laoksite +
replace by ladu +
teeksin   teeks   teeksid   teeksime   teeksite +
replace by tegi +
teen   teeb   teed   teeme   teete   teevad +
replace by tegi +
tegemata   tehakse   tehti   tegema   teha +
replace by tegi +
näen   näeb   näed   näeme   näete   näevad +
replace by nägi +
näeksin   näeks   näeksid   näeksime   näeksite +
replace by nägi +
nägemata   nähakse   nähti   näha   nägema +
replace by nägi +
+
+ + +

+Step 1: emphasis +

+ +
+ Search for the longest among the following suffixes in R1, and perform the action indicated +

+
+
Test if there are at least 4 characters before the R1 region. If so, continue this step
+
gi +
if preceded by a character from GI which is not the second character of a long vowel as defined by LONGV, delete +
ki +
if preceded by KI, delete +
+
+ +

+Step 2: verb +

+
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+ +
+
nuksin   nuksime   nuksid   nuksite +
delete +
ksin   ksid   ksime   ksite +
delete +
mata +
delete +
takse   dakse +
delete +
taks   daks +
delete +
akse +
replace with a +
sime +
delete +
site +
delete +
sin +
delete +
me +
if preceded by V1, delete +
da +
if preceded by V1, delete +
n +
if preceded by V1, delete +
b +
if preceded by V1, delete +
+
+ +

+Step 3: special_noun_endings +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
lasse +
replace by lase +
last +
replace by lase +
lane +
replace by lase +
lasi +
replace by lase +
misse +
replace by mise +
mist +
replace by mise +
mine +
replace by mise +
misi +
replace by mise +
lisse +
replace by lise +
list +
replace by lise +
line +
replace by lise +
lisi +
replace by lise +
+
+ +

+Step 4: case_ending +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
sse if preceded by RV or LONGV +
st if preceded by RV or LONGV +
le if preceded by RV or LONGV +
lt if preceded by RV or LONGV +
ga if preceded by RV or LONGV +
ks if preceded by RV or LONGV +
ta if preceded by RV or LONGV +
t if preceded by at least 4 characters +
s if preceded by RV or LONGV +
l if preceded by RV or LONGV +
delete +
+
+ +

+Step 5: plural_three_first_cases +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
ikkude +
replace by iku +
ikke +
replace by iku +
ike +
replace by iku +
sid +
if it is not preceded by LONGV, delete +
te +
if it doesn't have at least 4 characters before it, replace by t. +
Otherwise: +
a) if it is preceded by mis, replace with e, +
b) if it is preceded by las, replace with e, +
c) if it is preceded by lis, replace with e, +
if it wasn't replaced with e in steps a)-c) and it isn't preceded by t, delete +
de if preceded by RV or LONGV +
delete +
d if preceded by RV or LONGV +
delete +
+
+ +

+Step 6: degrees +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
mai if preceded by RV +
ma +
m if preceded by RV +
delete +
+
+ +

+Step 7: i_plural +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
i if preceded by RV +
delete +
+
+ +

+Step 8: nu +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
nu +
tu +
du +
va +
delete +
+
+ +

+Step 9: undouble_kpt +

+ +
+ Undouble consonant if word ending is kk+V1, tt+V1, pp+V1, + provided the vowel is in R1. +
+ +

The full algorithm in Snowball

+ +[% highlight_file('estonian') %] + +[% footer %] diff --git a/algorithms/finnish/stemmer.html b/algorithms/finnish/stemmer.html new file mode 100644 index 0000000..ebd5b2a --- /dev/null +++ b/algorithms/finnish/stemmer.html @@ -0,0 +1,664 @@ + + + + + + + + + + Finnish stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Finnish stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Finnish vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+edeltäjien
+edeltäjiensä
+edeltäjiinsä
+edeltäjistään
+edeltäjiä
+edeltäjiään
+edeltäjä
+edeltäjälleen
+edeltäjän
+edeltäjäni
+edeltäjänsä
+edeltäjänä
+edeltäjässä
+edeltäjästä
+edeltäjästään
+edeltäjät
+edeltäjää
+edeltäjään
+edeltäjäänsä
+edeltäneelle
+edeltäneellä
+edeltäneeltä
+edeltäneen
+edeltäneenä
+edeltäneeseen
+edeltäneessä
+edeltäneestä
+edeltäneet
+edeltäneiden
+edeltäneissä
+edeltäneitä
+edeltänyt
+edeltänyttä
+edeltävien
+edeltäviin
+edeltävinä
+edeltävissä
+edeltävä
+edeltävälle
+edeltävällä
+
+edeltäj
+edeltäjie
+edeltäj
+edeltäj
+edeltäj
+edeltäjiä
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltäj
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltän
+edeltäny
+edeltänyt
+edeltäv
+edeltäv
+edeltäv
+edeltäv
+edeltäv
+edeltäv
+edeltäv
+
+innostu
+innostua
+innostuessaan
+innostui
+innostuimme
+innostuin
+innostuisi
+innostuisivat
+innostuivat
+innostukseen
+innostuksella
+innostuksen
+innostuksensa
+innostuksessa
+innostuksessaan
+innostuksesta
+innostuksissaan
+innostumaan
+innostuminen
+innostun
+innostuneelle
+innostuneempia
+innostuneen
+innostuneena
+innostuneesta
+innostuneesti
+innostuneet
+innostuneiden
+innostuneiksi
+innostunein
+innostuneina
+innostuneissa
+innostuneisuus
+innostuneita
+innostunut
+innostunutta
+innostus
+innostusta
+innostustaan
+innostutaan
+
+innostu
+innostu
+innostue
+innostui
+innostui
+innostu
+innostui
+innostuisiv
+innostuiv
+innostuks
+innostuks
+innostuks
+innostuks
+innostuks
+innostuks
+innostuks
+innostuks
+innostum
+innostumin
+innostu
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostun
+innostuneisuus
+innostun
+innostunu
+innostunut
+innostus
+innostu
+innostu
+innostu
+
+
+ +

+Finnish is not an Indo-European language, but belongs to the Finno-Ugric +group, which again belongs to the Uralic group (*). Distinctions between +a-, i- and d-suffixes can be made in Finnish, but they are much +less sharply separated than in an Indo-European language. The system of +endings is extremely elaborate, but strictly defined, and applies equally to +all nominals, that is, to nouns, adjectives and pronouns. Verb endings have a +close similarity to nominal endings, which again makes Finnish very different +from any Indo-European language. +

+ +

+More problematical than the endings themselves is the change that can be +effected in a stem as a result of taking a particular ending. A stem typically +has two forms, strong and weak, where one class of ending follows the +strong form and the complementary class the weak. Normalising strong and weak +forms after ending removal is not generally possible, although the common case +where strong and weak forms only differ in the single or double form of a +final consonant can be dealt with. +

+ +

+Finnish includes the following accented forms, +

+ +
+ ä   ö +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   ä   ö +
+ +

+R1 and +R2 are then defined in the usual way +(see the note on R1 and R2). +

+ +

+Do each of steps 1, 2, 3, 4, 5 and 6. +

+ +

+Step 1: particles etc +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
(a) kin   kaan   kään   ko   kö   han   hän   pa   pä +
delete if preceded by n, t or a vowel +
(b) sti +
delete if in R2 +
+
+ +

+(Of course, the n, t or vowel of 1(a) need not be in R1: only
+the suffix removed must be in R1. And similarly below.)
+

+ +

+Step 2: possessives +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
si +
delete if not preceded by k +
ni +
delete +
if preceded by kse, replace with ksi +
nsa   nsä   mme   nne +
delete +
an +
delete if preceded by one of   ta   ssa   sta   lla   lta   na +
än +
delete if preceded by one of   tä   ssä   stä   llä   ltä   nä +
en +
delete if preceded by one of   lle   ine +
+
+ +

+The remaining steps require a few definitions. +

+ +

+Define a v (vowel) as one of   a   e   i   o   u   y   ä   ö. +
+Define a V (restricted vowel) as one of   a   e   i   o   u   ä   ö. +
+So Vi means a V followed by letter i. +
+Define LV (long vowel) as one of   aa   ee   ii   oo   uu   ää   öö. +
+Define a c (consonant) as a character from ASCII a-z which isn't in
+v (originally this was "a character other than a v", but since
+2018-04-11 we've changed this definition to prevent the stemmer from altering
+sequences of digits).
+So cv means a c followed by a v. +

+ +

+Step 3: cases +

+
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+ +
+
hXn   preceded by X, where X is a V other than u (a/han, e/hen etc) +
siin   den   tten   preceded by Vi +
seen   preceded by LV +
a   ä   preceded by cv +
tta   ttä   preceded by e +
ta   tä   ssa   ssä   sta   stä   lla   llä   lta   ltä   lle   na   nä   ksi   ine +
delete +
n +
delete, and if preceded by LV or ie, delete the last vowel +
+
+ +

+So aarteisiinaartei, the longest matching suffix being siin, +preceded as it is by Vi. But adressiinadressi. The longest +matching suffix is not siin, because there is no preceding Vi, but n, +and then the last vowel of the preceding LV is removed. +

+ +

+Step 4: other endings +

+ +
+ Search for the longest among the following suffixes in R2, and perform the + action indicated +

+
+
mpi   mpa   mpä   mmi   mma   mmä +
delete if not preceded by po +
impi   impa   impä   immi   imma   immä   eja   ejä +
delete +
+
+ +

+Step 5: plurals +

+ +
+If an ending was removed in step 3, delete a final i or j if in R1; +otherwise, if an ending was not removed in step 3, delete a final t in +R1 if it follows a vowel, and, if a t is removed, delete a final mma or +imma in R2, unless the mma is preceded by po. +
+ +

+Step 6: tidying up +

+ +
+Do in turn steps (a), (b), (c), (d), restricting all tests to the region +R1. +

+ +

+a) If R1 ends LV delete the last letter
+b) If R1 ends cX, c a consonant and X one of   a   ä   e   i, delete the last +letter
+c) If R1 ends oj or uj delete the last letter
+d) If R1 ends jo delete the last letter +

+ +

+Do step (e), which is not restricted to R1. +

+ +

+e) If the word ends with a double consonant followed by zero or more vowels,
+remove the last consonant (so eläkk → eläk, aatonaatto →
+aatonaato)
+

+
+ +

The full algorithm in Snowball

+ +
/* Finnish stemmer.
+
+   Numbers in square brackets refer to the sections in
+   Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
+   ISBN 0-415-20705-3
+
+*/
+
+routines (
+           mark_regions
+           R2
+           particle_etc possessive
+           LONG VI
+           case_ending
+           i_plural
+           t_plural
+           other_endings
+           tidy
+)
+
+externals ( stem )
+
+integers ( p1 p2 )
+strings ( x )
+booleans ( ending_removed )
+groupings ( AEI C V1 V2 particle_end )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a"   '{U+00E4}'
+stringdef o"   '{U+00F6}'
+
+define AEI 'a{a"}ei'
+define C 'bcdfghjklmnpqrstvwxz'
+define V1 'aeiouy{a"}{o"}'
+define V2 'aeiou{a"}{o"}'
+define particle_end V1 + 'nt'
+
+define mark_regions as (
+
+    $p1 = limit
+    $p2 = limit
+
+    goto V1  gopast non-V1  setmark p1
+    goto V1  gopast non-V1  setmark p2
+)
+
+backwardmode (
+
+    define R2 as $p2 <= cursor
+
+    define particle_etc as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'kin'
+            'kaan' 'k{a"}{a"}n'
+            'ko'   'k{o"}'
+            'han'  'h{a"}n'
+            'pa'   'p{a"}'    // Particles [91]
+                (particle_end)
+            'sti'             // Adverb [87]
+                (R2)
+        )
+        delete
+    )
+    define possessive as (    // [36]
+        setlimit tomark p1 for ([substring])
+        among(
+            'si'
+                (not 'k' delete)  // take 'ksi' as the Comitative case
+            'ni'
+                (delete ['kse'] <- 'ksi') // kseni = ksi + ni
+            'nsa' 'ns{a"}'
+            'mme'
+            'nne'
+                (delete)
+            /* Now for Vn possessives after case endings: [36] */
+            'an'
+                (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete)
+            '{a"}n'
+                (among('t{a"}' 'ss{a"}' 'st{a"}'
+                       'll{a"}' 'lt{a"}' 'n{a"}') delete)
+            'en'
+                (among('lle' 'ine') delete)
+        )
+    )
+
+    define LONG as
+        among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
+
+    define VI as ('i' V2)
+
+    define case_ending as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'han'    ('a')          //-.
+            'hen'    ('e')          // |
+            'hin'    ('i')          // |
+            'hon'    ('o')          // |
+            'h{a"}n' ('{a"}')       // Illative   [43]
+            'h{o"}n' ('{o"}')       // |
+            'siin'   VI             // |
+            'seen'   LONG           //-'
+
+            'den'    VI
+            'tten'   VI             // Genitive plurals [34]
+                     ()
+            'n'                     // Genitive or Illative
+                ( try ( LONG // Illative
+                        or 'ie' // Genitive
+                          and next ]
+                      )
+                  /* otherwise Genitive */
+                )
+
+            'a' '{a"}'              //-.
+                     (V1 C)         // |
+            'tta' 'tt{a"}'          // Partitive  [32]
+                     ('e')          // |
+            'ta' 't{a"}'            //-'
+
+            'ssa' 'ss{a"}'          // Inessive   [41]
+            'sta' 'st{a"}'          // Elative    [42]
+
+            'lla' 'll{a"}'          // Adessive   [44]
+            'lta' 'lt{a"}'          // Ablative   [51]
+            'lle'                   // Allative   [46]
+            'na' 'n{a"}'            // Essive     [49]
+            'ksi'                   // Translative[50]
+            'ine'                   // Comitative [51]
+
+            /* Abessive and Instructive are too rare for
+               inclusion [51] */
+
+        )
+        delete
+        set ending_removed
+    )
+    define other_endings as (
+        setlimit tomark p2 for ([substring])
+        among(
+            'mpi' 'mpa' 'mp{a"}'
+            'mmi' 'mma' 'mm{a"}'    // Comparative forms [85]
+                (not 'po')          //-improves things
+            'impi' 'impa' 'imp{a"}'
+            'immi' 'imma' 'imm{a"}' // Superlative forms [86]
+            'eja' 'ej{a"}'          // indicates agent [93.1B]
+        )
+        delete
+    )
+    define i_plural as (            // [26]
+        setlimit tomark p1 for ([substring])
+        among(
+            'i'  'j'
+        )
+        delete
+    )
+    define t_plural as (            // [26]
+        setlimit tomark p1 for (
+            ['t'] test V1
+            delete
+        )
+        setlimit tomark p2 for ([substring])
+        among(
+            'mma' (not 'po') //-mmat endings
+            'imma'           //-immat endings
+        )
+        delete
+    )
+    define tidy as (
+        setlimit tomark p1 for (
+            do ( LONG and ([next] delete ) ) // undouble vowel
+            do ( [AEI] C delete ) // remove trailing a, a", e, i
+            do ( ['j'] 'o' or 'u' delete )
+            do ( ['o'] 'j' delete )
+        )
+        goto non-V1 [C] -> x  x delete // undouble consonant
+    )
+)
+
+define stem as (
+
+    do mark_regions
+    unset ending_removed
+    backwards (
+        do particle_etc
+        do possessive
+        do case_ending
+        do other_endings
+        (ending_removed do i_plural) or do t_plural
+        do tidy
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/finnish/stemmer.tt b/algorithms/finnish/stemmer.tt new file mode 100644 index 0000000..dd2c0bf --- /dev/null +++ b/algorithms/finnish/stemmer.tt @@ -0,0 +1,212 @@ +[% header('Finnish stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'edeltäjien', 'innostu']) %] + +

+Finnish is not an Indo-European language, but belongs to the Finno-Ugric +group, which again belongs to the Uralic group (*). Distinctions between +a-, i- and d-suffixes can be made in Finnish, but they are much +less sharply separated than in an Indo-European language. The system of +endings is extremely elaborate, but strictly defined, and applies equally to +all nominals, that is, to nouns, adjectives and pronouns. Verb endings have a +close similarity to nominal endings, which again makes Finnish very different +from any Indo-European language. +

+ +

+More problematical than the endings themselves is the change that can be +effected in a stem as a result of taking a particular ending. A stem typically +has two forms, strong and weak, where one class of ending follows the +strong form and the complementary class the weak. Normalising strong and weak +forms after ending removal is not generally possible, although the common case +where strong and weak forms only differ in the single or double form of a +final consonant can be dealt with. +

+ +

+Finnish includes the following accented forms, +

+ +
+ ä   ö +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   ä   ö +
+ +

+R1 and +R2 are then defined in the usual way +(see the note on R1 and R2). +

+ +

+Do each of steps 1, 2, 3, 4, 5 and 6. +

+ +

+Step 1: particles etc +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
(a) kin   kaan   kään   ko   kö   han   hän   pa   pä +
delete if preceded by n, t or a vowel +
(b) sti +
delete if in R2 +
+
+ +

+(Of course, the n, t or vowel of 1(a) need not be in R1: only
+the suffix removed must be in R1. And similarly below.)

+ +

+Step 2: possessives +

+ +
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+
+
si +
delete if not preceded by k +
ni +
delete +
if preceded by kse, replace with ksi +
nsa   nsä   mme   nne +
delete +
an +
delete if preceded by one of   ta   ssa   sta   lla   lta   na +
än +
delete if preceded by one of   tä   ssä   stä   llä   ltä   nä +
en +
delete if preceded by one of   lle   ine +
+
+ +

+The remaining steps require a few definitions. +

+ +

+Define a v (vowel) as one of   a   e   i   o   u   y   ä   ö. +
+Define a V (restricted vowel) as one of   a   e   i   o   u   ä   ö. +
+So Vi means a V followed by letter i. +
+Define LV (long vowel) as one of   aa   ee   ii   oo   uu   ää   öö. +
+Define a c (consonant) as a character from ASCII a-z which isn't in
+v (originally this was "a character other than a v", but since
+2018-04-11 we've changed this definition to avoid the stemmer altering
+sequences of digits).
+So cv means a c followed by a v. +

+ +

+Step 3: cases +

+
+ Search for the longest among the following suffixes in R1, and perform the + action indicated +

+ +
+
hXn   preceded by X, where X is a V other than u (a/han, e/hen etc) +
siin   den   tten   preceded by Vi +
seen   preceded by LV +
a   ä   preceded by cv +
tta   ttä   preceded by e +
ta   tä   ssa   ssä   sta   stä   lla   llä   lta   ltä   lle   na   nä   ksi   ine +
delete +
n +
delete, and if preceded by LV or ie, delete the last vowel +
+
+ +

+So aarteisiinaartei, the longest matching suffix being siin, +preceded as it is by Vi. But adressiinadressi. The longest +matching suffix is not siin, because there is no preceding Vi, but n, +and then the last vowel of the preceding LV is removed. +

+ +

+Step 4: other endings +

+ +
+ Search for the longest among the following suffixes in R2, and perform the + action indicated +

+
+
mpi   mpa   mpä   mmi   mma   mmä +
delete if not preceded by po +
impi   impa   impä   immi   imma   immä   eja   ejä +
delete +
+
+ +

+Step 5: plurals +

+ +
+If an ending was removed in step 3, delete a final i or j if in R1; +otherwise, if an ending was not removed in step 3, delete a final t in +R1 if it follows a vowel, and, if a t is removed, delete a final mma or +imma in R2, unless the mma is preceded by po. +
+ +

+Step 6: tidying up +

+ +
+Do in turn steps (a), (b), (c), (d), restricting all tests to the region +R1. +

+ +

+a) If R1 ends LV delete the last letter
+b) If R1 ends cX, c a consonant and X one of   a   ä   e   i, delete the last +letter
+c) If R1 ends oj or uj delete the last letter
+d) If R1 ends jo delete the last letter +

+ +

+Do step (e), which is not restricted to R1. +

+ +

+e) If the word ends with a double consonant followed by zero or more vowels, +remove the last consonant (so eläkkeläk, aatonaatto → +aatonaato) +

+
+ +

The full algorithm in Snowball

+ +[% highlight_file('finnish') %] + +[% footer %] diff --git a/algorithms/finnish/stop.txt b/algorithms/finnish/stop.txt new file mode 100644 index 0000000..2be66c0 --- /dev/null +++ b/algorithms/finnish/stop.txt @@ -0,0 +1,88 @@ + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tällä tältä tälle tänä täksi | this +tuo tuon tuota tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and 
+sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about +poikki | across +yli | over, across + +| other + +kun | when +nyt | now +itse | self + diff --git a/algorithms/french/stemmer.html b/algorithms/french/stemmer.html new file mode 100644 index 0000000..8b361cc --- /dev/null +++ b/algorithms/french/stemmer.html @@ -0,0 +1,811 @@ + + + + + + + + + + French stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

French stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of French vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+continu
+continua
+continuait
+continuant
+continuation
+continue
+continué
+continuel
+continuelle
+continuellement
+continuelles
+continuels
+continuer
+continuera
+continuerait
+continueront
+continuez
+continuité
+continuons
+contorsions
+contour
+contournait
+contournant
+contourne
+contours
+contractait
+contracté
+contractée
+contracter
+contractés
+contractions
+contradictoirement
+contradictoires
+contraindre
+contraint
+contrainte
+contraintes
+contraire
+contraires
+contraria
+
+continu
+continu
+continu
+continu
+continu
+continu
+continu
+continuel
+continuel
+continuel
+continuel
+continuel
+continu
+continu
+continu
+continu
+continu
+continu
+continuon
+contors
+contour
+contourn
+contourn
+contourn
+contour
+contract
+contract
+contract
+contract
+contract
+contract
+contradictoir
+contradictoir
+contraindr
+contraint
+contraint
+contraint
+contrair
+contrair
+contrari
+
+main
+mains
+maintenaient
+maintenait
+maintenant
+maintenir
+maintenue
+maintien
+maintint
+maire
+maires
+mairie
+mais
+maïs
+maison
+maisons
+maistre
+maitre
+maître
+maîtres
+maîtresse
+maîtresses
+majesté
+majestueuse
+majestueusement
+majestueux
+majeur
+majeure
+major
+majordome
+majordomes
+majorité
+majorités
+mal
+malacca
+malade
+malades
+maladie
+maladies
+maladive
+
+main
+main
+mainten
+mainten
+mainten
+mainten
+maintenu
+maintien
+maintint
+mair
+mair
+mair
+mais
+maï
+maison
+maison
+maistr
+maitr
+maîtr
+maîtr
+maîtress
+maîtress
+majest
+majestu
+majestu
+majestu
+majeur
+majeur
+major
+majordom
+majordom
+major
+major
+mal
+malacc
+malad
+malad
+malad
+malad
+malad
+
+
+ +

The stemming algorithm

+ +

+Letters in French include the following accented forms, +

+ +
+ â   à   ç   ë   é   ê   è   ï   î   ô   û   ù +
+The following letters are vowels: +
+ a   e   i   o   u   y   â   à   ë   é   ê   è   ï   î   ô   û   ù +
+Assume the word is in lower case. Then, taking the letters in turn from the +beginning to end of the word, put u or i into upper +case when it is both preceded and followed by a vowel; put y into +upper case when it is either preceded or followed by a vowel; and put u into upper case when it follows q. For example, +
+
jouer joUer +
ennuie ennuIe +
yeux Yeux +
quand qUand +
croyiez croYiez +
+ +

+In the last example, y becomes Y because it is +between two vowels, but i does not become I because +it is between Y and e, and Y is not +defined as a vowel above. +

+ +

+(The upper case forms are not then classed as vowels — see note on vowel +marking.) +

+ +

+Replace ë and ï with He and Hi. The H +marks the vowel as having originally had a diaeresis, while the vowel itself, lacking an accent, is able to +match suffixes beginning in e or i. +

+ +

+If the word begins with two vowels, RV is the region after the third +letter, otherwise the region after the first vowel not at the beginning of +the word, or the end of the word if these positions cannot be found. (Exceptionally, +par, col or tap, at the beginning of a word is also taken to define +RV as the region to their right.) +

+ +

+For example, +

+ +
+    a i m e r     a d o r e r     v o l e r    t a p i s
+         |...|         |.....|       |.....|        |...|
+
+ +

+R1 is the region after the first non-vowel following a vowel, or the end of +the word if there is no such non-vowel. +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or the +end of the word if there is no such non-vowel. +(See note on R1 and R2.) +

+ +

+For example: +

+ +
+    f a m e u s e m e n t
+         |......R1.......|
+               |...R2....|
+
+ +

+Note that R1 can contain RV (adorer), and RV can contain R1 (voler). +

+ +

+Below, ‘delete if in R2’ means that a found suffix should be removed if it +lies entirely in R2, but not if it overlaps R2 and the rest of the word. +‘delete if in R1 and preceded by X’ means that X itself does not have to +come in R1, while ‘delete if preceded by X in R1’ means that X, like the +suffix, must be entirely in R1. +

+ +

+Start with step 1 +

+ +

+Step 1: Standard suffix removal +

+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
ance   iqUe   isme   able   iste   eux   ances   iqUes   ismes   ables   istes +
delete if in R2 +
atrice   ateur   ation   atrices   ateurs   ations +
delete if in R2 +
if preceded by ic, delete if in R2, else replace by iqU +
logie   logies +
replace with log if in R2 +
usion   ution   usions   utions +
replace with u if in R2 +
ence   ences +
replace with ent if in R2 +
ement   ements +
delete if in RV +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by eus, delete if in R2, else replace by eux + if in R1, otherwise, +
if preceded by abl or iqU, delete if in R2, otherwise, +
if preceded by ièr or Ièr, replace by i if in RV +
ité   ités +
delete if in R2 +
if preceded by abil, delete if in R2, else replace by abl, + otherwise, +
if preceded by ic, delete if in R2, else replace by iqU, otherwise, +
if preceded by iv, delete if in R2 +
if   ive   ifs   ives +
delete if in R2 +
if preceded by at, delete if in R2 (and if further preceded by ic, + delete if in R2, else replace by iqU) +
eaux +
replace with eau +
aux +
replace with al if in R1 +
euse   euses +
delete if in R2, else replace by eux if in R1 +
issement   issements +
delete if in R1 and preceded by a non-vowel +
amment +
replace with ant if in RV +
emment +
replace with ent if in RV +
ment   ments +
delete if preceded by a vowel in RV +
+
+ +

+In steps 2a and 2b all tests are confined to the RV region. +

+ +

+Do step 2a if either no ending was removed by step 1, or if one of endings +amment, emment, ment, ments was found. +

+ +

+Step 2a: Verb suffixes beginning i +

+ +
+ Search for the longest among the following suffixes and if found, + delete if the preceding character is neither a vowel nor H. +
+ îmes   ît   îtes   i   ie   ies   ir   ira   irai   iraIent   irais   irait   iras +   irent   irez   iriez   irions   irons   iront   is   issaIent   issais   issait +   issant   issante   issantes   issants   isse   issent   isses   issez   issiez +   issions   issons   it +
+
+ (Note that the preceding character itself must also be in RV.) +
+ +

+Do step 2b if step 2a was done, but failed to remove a suffix. +

+ +

+Step 2b: Other verb suffixes +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
ions +
delete if in R2 +
é   ée   ées   és   èrent   er   era   erai   eraIent   erais   erait   eras   erez +   eriez   erions   erons   eront   ez   iez +
delete +
âmes   ât   âtes   a   ai   aIent   ais   ait   ant   ante   antes   ants   as   asse +   assent   asses   assiez   assions +
delete +
if preceded by e, delete +
+
+ (Note that the e that may be deleted in this last step must also be in + RV.) +
+ +

+If the last step to be obeyed — either step 1, 2a or 2b — altered the word, +do step 3 +

+ +

+Step 3 +

+
+ Replace final Y with i or final ç with c +
+

+Alternatively, if the last step to be obeyed did not alter the word, do +step 4 +

+ +

+Step 4: Residual suffix +

+ +
+

+ If the word ends s, not preceded by a, i (unless itself preceded by H), o, u, è or s, delete it. +

+ +

+ In the rest of step 4, all tests are confined to the RV region. +

+ +

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+ +
+
ion +
delete if in R2 and preceded by s or t +
ier   ière   Ier   Ière +
replace with i +
e +
delete +
+
+ (So note that ion is removed only when it is in R2 — as well as being + in RV — and preceded by s or t which must be in RV.) +
+ +

+Always do steps 5 and 6. +

+ +

+Step 5: Undouble +

+ +
+ If the word ends enn, onn, ett, ell or eill, delete the last letter +
+ +

+Step 6: Un-accent +

+ +
+ If the word ends é or è followed by at least one non-vowel, remove
+ the accent from the e.
+ +

+And finally: +

+ +
+

+ Turn any remaining I, U and Y letters in the word back into lower case. +

+ +

+ Turn He and Hi back into ë and ï, and remove any + remaining H. +

+
+ +

The same algorithm in Snowball

+ +
routines (
+           prelude postlude mark_regions
+           RV R1 R2
+           standard_suffix
+           i_verb_suffix
+           verb_suffix
+           residual_suffix
+           un_double
+           un_accent
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v keep_with_s )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a^   '{U+00E2}'  // a-circumflex
+stringdef a`   '{U+00E0}'  // a-grave
+stringdef cc   '{U+00E7}'  // c-cedilla
+
+stringdef e"   '{U+00EB}'  // e-diaeresis (rare)
+stringdef e'   '{U+00E9}'  // e-acute
+stringdef e^   '{U+00EA}'  // e-circumflex
+stringdef e`   '{U+00E8}'  // e-grave
+stringdef i"   '{U+00EF}'  // i-diaeresis
+stringdef i^   '{U+00EE}'  // i-circumflex
+stringdef o^   '{U+00F4}'  // o-circumflex
+stringdef u^   '{U+00FB}'  // u-circumflex
+stringdef u`   '{U+00F9}'  // u-grave
+
+define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
+
+define prelude as repeat goto (
+
+    (  v [ ('u' ] v <- 'U') or
+           ('i' ] v <- 'I') or
+           ('y' ] <- 'Y')
+    )
+    or
+    (  [ '{e"}' ] <- 'He' )
+    or
+    (  [ '{i"}' ] <- 'Hi' )
+    or
+    (  ['y'] v <- 'Y' )
+    or
+    (  'q' ['u'] <- 'U' )
+)
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit  // defaults
+
+    do (
+        ( v v next )
+        or
+        among ( // this exception list begun Nov 2006
+            'par'  // paris, parie, pari
+            'col'  // colis
+            'tap'  // tapis
+            // extensions possible here
+        )
+        or
+        ( next gopast v )
+        setmark pV
+    )
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define postlude as repeat (
+
+    [substring] among(
+        'I' (<- 'i')
+        'U' (<- 'u')
+        'Y' (<- 'y')
+        'He' (<- '{e"}')
+        'Hi' (<- '{i"}')
+        'H' (delete)
+        ''  (next)
+    )
+)
+
+backwardmode (
+
+    define RV as $pV <= cursor
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define standard_suffix as (
+        [substring] among(
+
+            'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
+            'ances' 'iqUes' 'ismes' 'ables' 'istes'
+               ( R2 delete )
+            'atrice' 'ateur' 'ation'
+            'atrices' 'ateurs' 'ations'
+               ( R2 delete
+                 try ( ['ic'] (R2 delete) or <-'iqU' )
+               )
+            'logie'
+            'logies'
+               ( R2 <- 'log' )
+            'usion' 'ution'
+            'usions' 'utions'
+               ( R2 <- 'u' )
+            'ence'
+            'ences'
+               ( R2 <- 'ent' )
+            'ement'
+            'ements'
+            (
+                RV delete
+                try (
+                    [substring] among(
+                        'iv'   (R2 delete ['at'] R2 delete)
+                        'eus'  ((R2 delete) or (R1<-'eux'))
+                        'abl' 'iqU'
+                               (R2 delete)
+                        'i{e`}r' 'I{e`}r'      //)
+                               (RV <-'i')      //)--new 2 Sept 02
+                    )
+                )
+            )
+            'it{e'}'
+            'it{e'}s'
+            (
+                R2 delete
+                try (
+                    [substring] among(
+                        'abil' ((R2 delete) or <-'abl')
+                        'ic'   ((R2 delete) or <-'iqU')
+                        'iv'   (R2 delete)
+                    )
+                )
+            )
+            'if' 'ive'
+            'ifs' 'ives'
+            (
+                R2 delete
+                try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
+            )
+            'eaux' (<- 'eau')
+            'aux'  (R1 <- 'al')
+            'euse'
+            'euses'((R2 delete) or (R1<-'eux'))
+
+            'issement'
+            'issements'(R1 non-v delete) // verbal
+
+            // fail(...) below forces entry to verb_suffix. -ment typically
+            // follows the p.p., e.g 'confus{e'}ment'.
+
+            'amment'   (RV fail(<- 'ant'))
+            'emment'   (RV fail(<- 'ent'))
+            'ment'
+            'ments'    (test(v RV) fail(delete))
+                       // v is e,i,u,{e'},I or U
+        )
+    )
+
+    define i_verb_suffix as setlimit tomark pV for (
+        [substring] among (
+            '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
+            'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
+            'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
+            'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
+            'issez' 'issiez' 'issions' 'issons' 'it'
+                (not 'H' non-v delete)
+        )
+    )
+
+    define verb_suffix as setlimit tomark pV for (
+        [substring] among (
+            'ions'
+                (R2 delete)
+
+            '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
+            'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
+            'erons' 'eront' 'ez' 'iez'
+
+            // 'ons' //-best omitted
+
+                (delete)
+
+            '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
+            'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
+            'assions'
+                (delete
+                 try(['e'] delete)
+                )
+        )
+    )
+
+    define keep_with_s 'aiou{e`}s'
+
+    define residual_suffix as (
+        try(['s'] test ('Hi' or non-keep_with_s) delete)
+        setlimit tomark pV for (
+            [substring] among(
+                'ion'           (R2 's' or 't' delete)
+                'ier' 'i{e`}re'
+                'Ier' 'I{e`}re' (<-'i')
+                'e'             (delete)
+            )
+        )
+    )
+
+    define un_double as (
+        test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
+    )
+
+    define un_accent as (
+        atleast 1 non-v
+        [ '{e'}' or '{e`}' ] <-'e'
+    )
+)
+
+define stem as (
+
+    do prelude
+    do mark_regions
+    backwards (
+
+        do (
+            (
+                 ( standard_suffix or
+                   i_verb_suffix or
+                   verb_suffix
+                 )
+                 and
+                 try( [ ('Y'   ] <- 'i' ) or
+                        ('{cc}'] <- 'c' )
+                 )
+            ) or
+            residual_suffix
+        )
+
+        // try(['ent'] RV delete) // is best omitted
+
+        do un_double
+        do un_accent
+    )
+    do postlude
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/french/stemmer.tt b/algorithms/french/stemmer.tt new file mode 100644 index 0000000..4b23e44 --- /dev/null +++ b/algorithms/french/stemmer.tt @@ -0,0 +1,301 @@ +[% header('French stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'continu', 'main']) %] + +

The stemming algorithm

+ +

+Letters in French include the following accented forms, +

+ +
+ â   à   ç   ë   é   ê   è   ï   î   ô   û   ù +
+The following letters are vowels: +
+ a   e   i   o   u   y   â   à   ë   é   ê   è   ï   î   ô   û   ù +
+Assume the word is in lower case. Then, taking the letters in turn from the +beginning to end of the word, put u or i into upper +case when it is both preceded and followed by a vowel; put y into +upper case when it is either preceded or followed by a vowel; and put u into upper case when it follows q. For example, +
+
jouer joUer +
ennuie ennuIe +
yeux Yeux +
quand qUand +
croyiez croYiez +
+ +

+In the last example, y becomes Y because it is +between two vowels, but i does not become I because +it is between Y and e, and Y is not +defined as a vowel above. +

+ +

+(The upper case forms are not then classed as vowels — see note on vowel +marking.) +

+ +

+Replace ë and ï with He and Hi. The H +marks the vowel as having originally had a diaeresis, while the vowel itself, lacking an accent, is able to +match suffixes beginning in e or i. +

+ +

+If the word begins with two vowels, RV is the region after the third +letter, otherwise the region after the first vowel not at the beginning of +the word, or the end of the word if these positions cannot be found. (Exceptionally, +par, col or tap, at the beginning of a word is also taken to define +RV as the region to their right.) +

+ +

+For example, +

+ +
+    a i m e r     a d o r e r     v o l e r    t a p i s
+         |...|         |.....|       |.....|        |...|
+
+ +

+R1 is the region after the first non-vowel following a vowel, or the end of +the word if there is no such non-vowel. +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or the +end of the word if there is no such non-vowel. +(See note on R1 and R2.) +

+ +

+For example: +

+ +
+    f a m e u s e m e n t
+         |......R1.......|
+               |...R2....|
+
+ +

+Note that R1 can contain RV (adorer), and RV can contain R1 (voler). +

+ +

+Below, ‘delete if in R2’ means that a found suffix should be removed if it +lies entirely in R2, but not if it overlaps R2 and the rest of the word. +‘delete if in R1 and preceded by X’ means that X itself does not have to +come in R1, while ‘delete if preceded by X in R1’ means that X, like the +suffix, must be entirely in R1. +

+ +

+Start with step 1 +

+ +

+Step 1: Standard suffix removal +

+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
ance   iqUe   isme   able   iste   eux   ances   iqUes   ismes   ables   istes +
delete if in R2 +
atrice   ateur   ation   atrices   ateurs   ations +
delete if in R2 +
if preceded by ic, delete if in R2, else replace by iqU +
logie   logies +
replace with log if in R2 +
usion   ution   usions   utions +
replace with u if in R2 +
ence   ences +
replace with ent if in R2 +
ement   ements +
delete if in RV +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by eus, delete if in R2, else replace by eux + if in R1, otherwise, +
if preceded by abl or iqU, delete if in R2, otherwise, +
if preceded by ièr or Ièr, replace by i if in RV +
ité   ités +
delete if in R2 +
if preceded by abil, delete if in R2, else replace by abl, + otherwise, +
if preceded by ic, delete if in R2, else replace by iqU, otherwise, +
if preceded by iv, delete if in R2 +
if   ive   ifs   ives +
delete if in R2 +
if preceded by at, delete if in R2 (and if further preceded by ic, + delete if in R2, else replace by iqU) +
eaux +
replace with eau +
aux +
replace with al if in R1 +
euse   euses +
delete if in R2, else replace by eux if in R1 +
issement   issements +
delete if in R1 and preceded by a non-vowel +
amment +
replace with ant if in RV +
emment +
replace with ent if in RV +
ment   ments +
delete if preceded by a vowel in RV +
+
+ +

+In steps 2a and 2b all tests are confined to the RV region. +

+ +

+Do step 2a if either no ending was removed by step 1, or if one of endings +amment, emment, ment, ments was found. +

+ +

+Step 2a: Verb suffixes beginning i +

+ +
+ Search for the longest among the following suffixes and if found, + delete if the preceding character is neither a vowel nor H. +
+ îmes   ît   îtes   i   ie   ies   ir   ira   irai   iraIent   irais   irait   iras +   irent   irez   iriez   irions   irons   iront   is   issaIent   issais   issait +   issant   issante   issantes   issants   isse   issent   isses   issez   issiez +   issions   issons   it +
+
+ (Note that the preceding character itself must also be in RV.) +
+ +

+Do step 2b if step 2a was done, but failed to remove a suffix. +

+ +

+Step 2b: Other verb suffixes +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
ions +
delete if in R2 +
é   ée   ées   és   èrent   er   era   erai   eraIent   erais   erait   eras   erez +   eriez   erions   erons   eront   ez   iez +
delete +
âmes   ât   âtes   a   ai   aIent   ais   ait   ant   ante   antes   ants   as   asse +   assent   asses   assiez   assions +
delete +
if preceded by e, delete +
+
+ (Note that the e that may be deleted in this last step must also be in + RV.) +
+ +

+If the last step to be obeyed — either step 1, 2a or 2b — altered the word, +do step 3 +

+ +

+Step 3 +

+
+ Replace final Y with i or final ç with c +
+

+Alternatively, if the last step to be obeyed did not alter the word, do +step 4 +

+ +

+Step 4: Residual suffix +

+ +
+

+ If the word ends s, not preceded by a, i (unless itself preceded by H), o, u, è or s, delete it. +

+ +

+ In the rest of step 4, all tests are confined to the RV region. +

+ +

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+ +
+
ion +
delete if in R2 and preceded by s or t +
ier   ière   Ier   Ière +
replace with i +
e +
delete +
+
+ (So note that ion is removed only when it is in R2 — as well as being + in RV — and preceded by s or t which must be in RV.) +
+ +

+Always do steps 5 and 6. +

+ +

+Step 5: Undouble +

+ +
+ If the word ends enn, onn, ett, ell or eill, delete the last letter +
+ +

+Step 6: Un-accent +

+ +
+ If the word ends é or è followed by at least one non-vowel, remove
+ the accent from the e.
+ +

+And finally: +

+ +
+

+ Turn any remaining I, U and Y letters in the word back into lower case. +

+ +

+ Turn He and Hi back into ë and ï, and remove any + remaining H. +

+
+ +

The same algorithm in Snowball

+ +[% highlight_file('french') %] + +[% footer %] diff --git a/algorithms/french/stop.txt b/algorithms/french/stop.txt new file mode 100644 index 0000000..d525c99 --- /dev/null +++ b/algorithms/french/stop.txt @@ -0,0 +1,178 @@ + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) + | son | his, her (masc). Omitted because it is homonym of "sound" +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): + | été - Omitted because it is homonym of "summer" +étée +étées + | étés - Omitted because it is homonym of "summers" +étant +suis +es + | est - Omitted because it is homonym of "east" + | sommes - Omitted because it is homonym of "sums" +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses + | fût - Omitted because it is homonym of "tap", like in "beer on tap" +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai + | as - Omitted because it is homonym of "ace" +avons +avez +ont +aurai 
+ | auras - Omitted because it is also the name of a kind of wind + | aura - Omitted because it is also the name of a kind of wind and homonym of "aura" +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait + | avions - Omitted because it is homonym of "planes" +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that (added 11 Apr 2012. Omission reported by Adrien Grand) +celà | that (incorrect, though common) +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/algorithms/german/stemmer.html b/algorithms/german/stemmer.html new file mode 100644 index 0000000..140d616 --- /dev/null +++ b/algorithms/german/stemmer.html @@ -0,0 +1,547 @@ + + + + + + + + + + German stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

German stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of German vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+aufeinander
+aufeinanderbiss
+aufeinanderfolge
+aufeinanderfolgen
+aufeinanderfolgend
+aufeinanderfolgende
+aufeinanderfolgenden
+aufeinanderfolgender
+aufeinanderfolgt
+aufeinanderfolgten
+aufeinanderschlügen
+aufenthalt
+aufenthalten
+aufenthaltes
+auferlegen
+auferlegt
+auferlegten
+auferstand
+auferstanden
+auferstehen
+aufersteht
+auferstehung
+auferstünde
+auferwecken
+auferweckt
+auferzogen
+aufessen
+auffa
+auffallen
+auffallend
+auffallenden
+auffallender
+auffällig
+auffälligen
+auffälliges
+auffassen
+auffasst
+auffaßt
+auffassung
+auffassungsvermögen
+
+aufeinand
+aufeinanderbiss
+aufeinanderfolg
+aufeinanderfolg
+aufeinanderfolg
+aufeinanderfolg
+aufeinanderfolg
+aufeinanderfolg
+aufeinanderfolgt
+aufeinanderfolgt
+aufeinanderschlug
+aufenthalt
+aufenthalt
+aufenthalt
+auferleg
+auferlegt
+auferlegt
+auferstand
+auferstand
+aufersteh
+aufersteht
+aufersteh
+auferstund
+auferweck
+auferweckt
+auferzog
+aufess
+auffa
+auffall
+auffall
+auffall
+auffall
+auffall
+auffall
+auffall
+auffass
+auffasst
+auffasst
+auffass
+auffassungsvermog
+
+kategorie
+kategorien
+kategorisch
+kategorische
+kategorischen
+kategorischer
+kater
+katerliede
+katern
+katers
+käthchen
+kathedrale
+kathinka
+katholik
+katholische
+katholischen
+katholischer
+kattun
+kattunhalstücher
+katz
+kätzchen
+kätzchens
+katze
+katzen
+katzenschmer
+katzensprung
+katzenwürde
+kätzin
+kätzlein
+katzmann
+kauen
+kauerte
+kauf
+kaufe
+kaufen
+käufer
+kauffahrer
+kaufherr
+kaufleute
+käuflich
+
+kategori
+kategori
+kategor
+kategor
+kategor
+kategor
+kat
+katerlied
+kat
+kat
+kathch
+kathedral
+kathinka
+kathol
+kathol
+kathol
+kathol
+kattun
+kattunhalstuch
+katz
+katzch
+katzch
+katz
+katz
+katzenschm
+katzenspr
+katzenwurd
+katzin
+katzlein
+katzmann
+kau
+kauert
+kauf
+kauf
+kauf
+kauf
+kauffahr
+kaufherr
+kaufleut
+kauflich
+
+
+ +

The stemming algorithm

+ +

+German includes the following accented forms, +

+ +
+ ä   ö   ü +
+ +

+and a special letter, ß, equivalent to double s. +

+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   ä   ö   ü +
+ +

+First put u and y between vowels into +upper case, and then do the following mappings, +

+
+ (a) replace ß with ss,
+ (a) replace ae with ä,
+ (a) replace oe with ö,
+ (a) replace ue with ü unless preceded by q. +
+ +

+(The rules here for ae, oe and ue were +added in Snowball 2.3.0, but were previously present as a variant of the +algorithm termed "german2". The condition +on the replacement of ue prevents the unwanted changing of +quelle. Also note that feuer is not modified because the first +part of the rule changes it to feUer, so ue is not +found.) +

+ +

+R1 and R2 are first set up in the standard way +(see the note on R1 and R2), +but then R1 is adjusted so that the region before it contains at least 3 letters. +

+ +

+Define a valid s-ending as one of b, d, f, g, h, k, l, m, n, r or t. +

+ +

+Define a valid st-ending as the same list, excluding letter r. +

+ +

+Do each of steps 1, 2 and 3. +

+ +

+Step 1: +

+ Search for the longest among the following suffixes, +
+ (a) em   ern   er
+ (b) e   en   es
+ (c) s (preceded by a valid s-ending) +
+

+ and delete if in R1. (Of course the letter of the valid s-ending is + not necessarily in R1.) If an ending of group (b) is deleted, and the ending + is preceded by niss, delete the final s. +

+ +

+ (For example, äckernäck, ackersacker, + armesarm, bedürfnissenbedürfnis) +

+
+Step 2: +
+

+ Search for the longest among the following suffixes, +

+
+ (a) en   er   est
+ (b) st (preceded by a valid st-ending, itself preceded by at least 3 + letters) +
+

+ and delete if in R1. +

+ +

+ (For example, derbstenderbst by step 1, and derbstderb by step + 2, since b is a valid st-ending, and is preceded by just 3 letters) +

+
+Step 3: d-suffixes (*) +
+

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
end   ung +
delete if in R2 +
if preceded by ig, delete if in R2 and not preceded by e +
ig   ik   isch +
delete if in R2 and not preceded by e +
lich   heit +
delete if in R2 +
if preceded by er or en, delete if in R1 +
keit +
delete if in R2 +
if preceded by lich or ig, delete if in R2 +
+
+ +

+Finally, +

+ +
+ turn U and Y back into lower case, and remove the umlaut accent from a, + o and u. +
+ +

The same algorithm in Snowball

+ +
/*
+    Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+           prelude postlude
+           mark_regions
+           R1 R2
+           standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a"   '{U+00E4}'
+stringdef o"   '{U+00F6}'
+stringdef u"   '{U+00FC}'
+stringdef ss   '{U+00DF}'
+
+define v 'aeiouy{a"}{o"}{u"}'
+
+define s_ending  'bdfghklmnrt'
+define st_ending s_ending - 'r'
+
+define prelude as (
+
+    test repeat goto (
+        v [('u'] v <- 'U') or
+           ('y'] v <- 'Y')
+    )
+
+    repeat (
+        [substring] among(
+            '{ss}' (<- 'ss')
+            'ae'   (<- '{a"}')
+            'oe'   (<- '{o"}')
+            'ue'   (<- '{u"}')
+            'qu'   ()
+            ''     (next)
+        )
+    )
+
+)
+
+define mark_regions as (
+
+    $p1 = limit
+    $p2 = limit
+
+    test(hop 3 setmark x)
+
+    gopast v  gopast non-v  setmark p1
+    try($p1 < x  $p1 = x)  // at least 3
+    gopast v  gopast non-v  setmark p2
+
+)
+
+define postlude as repeat (
+
+    [substring] among(
+        'Y'    (<- 'y')
+        'U'    (<- 'u')
+        '{a"}' (<- 'a')
+        '{o"}' (<- 'o')
+        '{u"}' (<- 'u')
+        ''     (next)
+    )
+
+)
+
+backwardmode (
+
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define standard_suffix as (
+        do (
+            [substring] R1 among(
+                'em' 'ern' 'er'
+                (   delete
+                )
+                'e' 'en' 'es'
+                (   delete
+                    try (['s'] 'nis' delete)
+                )
+                's'
+                (   s_ending delete
+                )
+            )
+        )
+        do (
+            [substring] R1 among(
+                'en' 'er' 'est'
+                (   delete
+                )
+                'st'
+                (   st_ending hop 3 delete
+                )
+            )
+        )
+        do (
+            [substring] R2 among(
+                'end' 'ung'
+                (   delete
+                    try (['ig'] not 'e' R2 delete)
+                )
+                'ig' 'ik' 'isch'
+                (   not 'e' delete
+                )
+                'lich' 'heit'
+                (   delete
+                    try (
+                        ['er' or 'en'] R1 delete
+                    )
+                )
+                'keit'
+                (   delete
+                    try (
+                        [substring] R2 among(
+                            'lich' 'ig'
+                            (   delete
+                            )
+                        )
+                    )
+                )
+            )
+        )
+    )
+)
+
+define stem as (
+    do prelude
+    do mark_regions
+    backwards
+        do standard_suffix
+    do postlude
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/german/stemmer.tt b/algorithms/german/stemmer.tt new file mode 100644 index 0000000..7220e24 --- /dev/null +++ b/algorithms/german/stemmer.tt @@ -0,0 +1,146 @@ +[% header('German stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'aufeinander', 'kategorie']) %] + +

The stemming algorithm

+ +

+German includes the following accented forms, +

+ +
+ ä   ö   ü +
+ +

+and a special letter, ß, equivalent to double s. +

+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   ä   ö   ü +
+ +

+First put u and y between vowels into +upper case, and then do the following mappings, +

+
+ (a) replace ß with ss,
+ (a) replace ae with ä,
+ (a) replace oe with ö,
+ (a) replace ue with ü unless preceded by q. +
+ +

+(The rules here for ae, oe and ue were +added in Snowball 2.3.0, but were previously present as a variant of the +algorithm termed "german2". The condition +on the replacement of ue prevents the unwanted changing of +quelle. Also note that feuer is not modified because the first +part of the rule changes it to feUer, so ue is not +found.) +

+ +

+R1 and R2 are first set up in the standard way +(see the note on R1 and R2), +but then R1 is adjusted so that the region before it contains at least 3 letters. +

+ +

+Define a valid s-ending as one of b, d, f, g, h, k, l, m, n, r or t. +

+ +

+Define a valid st-ending as the same list, excluding letter r. +

+ +

+Do each of steps 1, 2 and 3. +

+ +

+Step 1: +

+ Search for the longest among the following suffixes, +
+ (a) em   ern   er
+ (b) e   en   es
+ (c) s (preceded by a valid s-ending) +
+

+ and delete if in R1. (Of course the letter of the valid s-ending is + not necessarily in R1.) If an ending of group (b) is deleted, and the ending + is preceded by niss, delete the final s. +

+ +

+ (For example, äckernäck, ackersacker, + armesarm, bedürfnissenbedürfnis) +

+
+Step 2: +
+

+ Search for the longest among the following suffixes, +

+
+ (a) en   er   est
+ (b) st (preceded by a valid st-ending, itself preceded by at least 3 + letters) +
+

+ and delete if in R1. +

+ +

+ (For example, derbstenderbst by step 1, and derbstderb by step + 2, since b is a valid st-ending, and is preceded by just 3 letters) +

+
+Step 3: d-suffixes (*) +
+

+ Search for the longest among the following suffixes, and perform the + action indicated. +

+
+
end   ung +
delete if in R2 +
if preceded by ig, delete if in R2 and not preceded by e +
ig   ik   isch +
delete if in R2 and not preceded by e +
lich   heit +
delete if in R2 +
if preceded by er or en, delete if in R1 +
keit +
delete if in R2 +
if preceded by lich or ig, delete if in R2 +
+
+ +

+Finally, +

+ +
+ turn U and Y back into lower case, and remove the umlaut accent from a, + o and u. +
+ +

The same algorithm in Snowball

+ +[% highlight_file('german') %] + +[% footer %] diff --git a/algorithms/german/stop.txt b/algorithms/german/stop.txt new file mode 100644 index 0000000..5c45a51 --- /dev/null +++ b/algorithms/german/stop.txt @@ -0,0 +1,286 @@ + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. + + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/algorithms/german2/stemmer.html b/algorithms/german2/stemmer.html new file mode 100644 index 0000000..d57bdd5 --- /dev/null +++ b/algorithms/german2/stemmer.html @@ -0,0 +1,86 @@ + + + + + + + + + + German stemming algorithm variant - Snowball + + + + + + + + + + +
+
+
+ +
+
+

German stemming algorithm variant

+ + +

Links to resources

+ + + +

+We used to present a variant of the main German stemmer, termed "german2" which +was the same as the German stemmer but adjusted the first step to improve +handling of input text where the German letters ä, +ö and ü, were written as ae, +oe and ue respectively. +

+ +

+Snowball 2.3.0 added these adjustments to the main German stemmer, so there +is no longer a "german2" variant - just use the "german" stemmer. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/german2/stemmer.tt b/algorithms/german2/stemmer.tt new file mode 100644 index 0000000..6751bd6 --- /dev/null +++ b/algorithms/german2/stemmer.tt @@ -0,0 +1,22 @@ +[% header('German stemming algorithm variant') %] + +

Links to resources

+ + + +

+We used to present a variant of the main German stemmer, termed "german2" which +was the same as the German stemmer but adjusted the first step to improve +handling of input text where the German letters ä, +ö and ü, were written as ae, +oe and ue respectively. +

+ +

+Snowball 2.3.0 added these adjustments to the main German stemmer, so there +is no longer a "german2" variant - just use the "german" stemmer. +

+ +[% footer %] diff --git a/algorithms/germanic.html b/algorithms/germanic.html new file mode 100644 index 0000000..3dbb121 --- /dev/null +++ b/algorithms/germanic.html @@ -0,0 +1,177 @@ + + + + + + + + + + Germanic language stemmers - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Germanic language stemmers

+ + +

Links to resources

+ + + +

+Despite its inflexional complexities, German has quite a simple suffix +structure, so that, if one ignores the almost intractable problems of +compound words, separable verb prefixes, and prefixed and infixed ge, an +algorithmic stemmer can be made quite short. (Infixed zu can be removed +algorithmically, but this minor feature is not shown here.) The umlaut in +German is a regular feature of plural formation, so its removal is a +natural feature of stemming, but this leads to certain false conflations +(for example, schön, beautiful; schon, already). +

+ +

+By contrast, Dutch is inflexionally simple, but even so, this does not make +for any great difference between the stemmers. A feature of Dutch that +makes it markedly different from German is that the grammar of the written +language has changed, and continues to change, relatively rapidly, and that +it has assimilated a large and mixed foreign vocabulary with some of the +accompanying foreign suffixes. Foreign words may, or may not, be +transliterated into a Dutch style. Naturally these create problems in +stemming. The stemmer here is intended for native words of contemporary +Dutch. +

+ +

+In a Dutch noun, a vowel may double in the singular form (manen = moons, maan += moon). We attempt to solve this by undoubling the double vowel (Kraaij +Pohlman by contrast attempt to double the single vowel). The endings je, +tje, pje etc., although extremely common, are not stemmed. They are +diminutives and can significantly alter word meaning. +

+ +

A note on compound words

+ +

+Famously, German allows for the formation of long compound words, written +without spaces. For retrieval purposes, it is useful to be able to search +on the parts of such words, as well as on the complete words +themselves. This is not just peculiar to German: Dutch, Danish, Norwegian, +Swedish, Icelandic and Finnish have the same property. Splitting up +compound words cannot be done without a dictionary, and the purely +algorithmic stemmers presented here do not attempt it. +

+ +

+We would suggest, however, that the need for compound word splitting in +these languages has been somewhat overstated. In the case of German: +

+ +

+1) There are many English compounds one would see no advantage in +splitting, +

+ +
+
blackberry blackboard rainbow coastguard .... +
+ +

+Many German compounds are like this, +

+ +
+
Bleistift (pencil) = Blei (lead) + Stift (stick) +
Eisenbahn (railway) = Eisen (iron) + Bahn (road) +
Unterseeboot (submarine) = under + sea + boat +
+ +

+2) Other compounds correspond to what in English one would want to do by +phrase searching, so they are ready made for that purpose, +

+ +
+
Gesundheitspflege = ‘health care’ +
Fachhochschule = ‘technical college’ +
Kunstmuseum = ‘museum of fine art’ +
+ +

+3) In any case, longer compounds, especially involving personal names, are +frequently hyphenated, +

+ +
Heinrich-Heine-Universität +
+ +

+4) It is possible to construct participial adjectives of almost any +length, but they are little used in contemporary German, and regarded now +as poor style. As in English, very long words are not always to be taken +too seriously. On the author's last visit to Germany, the longest word he +had to struggle with was +

+ +
Nasenspitzenwurzelentzündung +
+ +

+It means ‘inflammation of the root of the tip of the nose’, and comes from +a cautionary tale for children. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/germanic.tt b/algorithms/germanic.tt new file mode 100644 index 0000000..eb52a74 --- /dev/null +++ b/algorithms/germanic.tt @@ -0,0 +1,113 @@ +[% header('Germanic language stemmers') %] + +

Links to resources

+ + + +

+Despite its inflexional complexities, German has quite a simple suffix +structure, so that, if one ignores the almost intractable problems of +compound words, separable verb prefixes, and prefixed and infixed ge, an +algorithmic stemmer can be made quite short. (Infixed zu can be removed +algorithmically, but this minor feature is not shown here.) The umlaut in +German is a regular feature of plural formation, so its removal is a +natural feature of stemming, but this leads to certain false conflations +(for example, schön, beautiful; schon, already). +

+ +

+By contrast, Dutch is inflexionally simple, but even so, this does not make +for any great difference between the stemmers. A feature of Dutch that +makes it markedly different from German is that the grammar of the written +language has changed, and continues to change, relatively rapidly, and that +it has assimilated a large and mixed foreign vocabulary with some of the +accompanying foreign suffixes. Foreign words may, or may not, be +transliterated into a Dutch style. Naturally these create problems in +stemming. The stemmer here is intended for native words of contemporary +Dutch. +

+ +

+In a Dutch noun, a vowel may double in the singular form (manen = moons, maan += moon). We attempt to solve this by undoubling the double vowel (Kraaij +Pohlman by contrast attempt to double the single vowel). The endings je, +tje, pje etc., although extremely common, are not stemmed. They are +diminutives and can significantly alter word meaning. +

+ +

A note on compound words

+ +

+Famously, German allows for the formation of long compound words, written +without spaces. For retrieval purposes, it is useful to be able to search +on the parts of such words, as well as on the complete words +themselves. This is not just peculiar to German: Dutch, Danish, Norwegian, +Swedish, Icelandic and Finnish have the same property. Splitting up +compound words cannot be done without a dictionary, and the purely +algorithmic stemmers presented here do not attempt it. +

+ +

+We would suggest, however, that the need for compound word splitting in +these languages has been somewhat overstated. In the case of German: +

+ +

+1) There are many English compounds one would see no advantage in +splitting, +

+ +
+
blackberry blackboard rainbow coastguard .... +
+ +

+Many German compounds are like this, +

+ +
+
Bleistift (pencil) = Blei (lead) + Stift (stick) +
Eisenbahn (railway) = Eisen (iron) + Bahn (road) +
Unterseeboot (submarine) = under + sea + boat +
+ +

+2) Other compounds correspond to what in English one would want to do by +phrase searching, so they are ready made for that purpose, +

+ +
+
Gesundheitspflege = ‘health care’ +
Fachhochschule = ‘technical college’ +
Kunstmuseum = ‘museum of fine art’ +
+ +

+3) In any case, longer compounds, especially involving personal names, are +frequently hyphenated, +

+ +
Heinrich-Heine-Universität +
+ +

+4) It is possible to construct participial adjectives of almost any +length, but they are little used in contemporary German, and regarded now +as poor style. As in English, very long words are not always to be taken +too seriously. On the author's last visit to Germany, the longest word he +had to struggle with was +

+ +
Nasenspitzenwurzelentzündung +
+ +

+It means ‘inflammation of the root of the tip of the nose’, and comes from +a cautionary tale for children. +

+ +[% footer %] diff --git a/algorithms/greek/stemmer.html b/algorithms/greek/stemmer.html new file mode 100644 index 0000000..844d1d7 --- /dev/null +++ b/algorithms/greek/stemmer.html @@ -0,0 +1,796 @@ + + + + + + + + + + Greek stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Greek stemming algorithm

+ + +

Links to resources

+ + + +

This is an implementation of the stemmer described in:

+ +
+ Ntais, Georgios. Development of a Stemmer for the Greek Language. Diss. Royal Institute of Technology, 2006. +
+ +

with additional improvements from:

+ +
+ Saroukos, Spyridon. Enhancing a Greek language stemmer. University of Tampere, 2008. +
+ +

The full algorithm in Snowball

+ +
// A stemmer for Modern Greek language, based on:
+//
+// Ntais, Georgios. Development of a Stemmer for the Greek
+// Language. Diss. Royal Institute of Technology, 2006.
+// https://sais.se/mthprize/2007/ntais2007.pdf
+//
+// Saroukos, Spyridon. Enhancing a Greek language stemmer.
+// University of Tampere, 2008.
+// https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf
+
+stringescapes {}
+
+stringdef a    '{U+03B1}' // alpha
+stringdef v    '{U+03B2}' // beta
+stringdef g    '{U+03B3}' // gamma
+stringdef d    '{U+03B4}' // delta
+stringdef e    '{U+03B5}' // epsilon
+stringdef z    '{U+03B6}' // zeta
+stringdef i    '{U+03B7}' // eta
+stringdef th   '{U+03B8}' // theta
+stringdef y    '{U+03B9}' // iota
+stringdef k    '{U+03BA}' // kappa
+stringdef l    '{U+03BB}' // lamda
+stringdef m    '{U+03BC}' // mu
+stringdef n    '{U+03BD}' // nu
+stringdef x    '{U+03BE}' // xi
+stringdef o    '{U+03BF}' // omicron
+stringdef p    '{U+03C0}' // pi
+stringdef r    '{U+03C1}' // rho
+stringdef ss   '{U+03C2}' // sigma final
+stringdef s    '{U+03C3}' // sigma
+stringdef t    '{U+03C4}' // tau
+stringdef u    '{U+03C5}' // upsilon
+stringdef f    '{U+03C6}' // phi
+stringdef ch   '{U+03C7}' // chi
+stringdef ps   '{U+03C8}' // psi
+stringdef oo   '{U+03C9}' // omega
+
+stringdef A    '{U+0391}' // Alpha
+stringdef V    '{U+0392}' // Beta
+stringdef G    '{U+0393}' // Gamma
+stringdef D    '{U+0394}' // Delta
+stringdef E    '{U+0395}' // Epsilon
+stringdef Z    '{U+0396}' // Zeta
+stringdef I    '{U+0397}' // Eta
+stringdef Th   '{U+0398}' // Theta
+stringdef Y    '{U+0399}' // Iota
+stringdef K    '{U+039A}' // Kappa
+stringdef L    '{U+039B}' // Lamda
+stringdef M    '{U+039C}' // Mu
+stringdef N    '{U+039D}' // Nu
+stringdef X    '{U+039E}' // Xi
+stringdef O    '{U+039F}' // Omicron
+stringdef P    '{U+03A0}' // Pi
+stringdef R    '{U+03A1}' // Rho
+stringdef S    '{U+03A3}' // Sigma
+stringdef T    '{U+03A4}' // Tau
+stringdef U    '{U+03A5}' // Upsilon
+stringdef F    '{U+03A6}' // Phi
+stringdef Ch   '{U+03A7}' // Chi
+stringdef Ps   '{U+03A8}' // Psi
+stringdef Oo   '{U+03A9}' // Omega
+
+stringdef Y:   '{U+03AA}' // Iota with dialytika
+stringdef U:   '{U+03AB}' // Upsilon with dialytika
+
+stringdef a'   '{U+03AC}' // alpha with tonos
+stringdef e'   '{U+03AD}' // epsilon with tonos
+stringdef i'   '{U+03AE}' // eta with tonos
+stringdef y'   '{U+03AF}' // iota with tonos
+stringdef o'   '{U+03CC}' // omicron with tonos
+stringdef u'   '{U+03CD}' // upsilon with tonos
+stringdef oo'  '{U+03CE}' // omega with tonos
+
+stringdef i:'  '{U+0390}' // iota with dialytika and tonos
+stringdef u:'  '{U+03B0}' // upsilon with dialytika and tonos
+
+stringdef i:   '{U+03CA}' // iota with dialytika
+stringdef u:   '{U+03CB}' // upsilon with dialytika
+
+stringdef A'   '{U+0386}' // Alpha with tonos
+stringdef E'   '{U+0388}' // Epsilon with tonos
+stringdef I'   '{U+0389}' // Eta with tonos
+stringdef Y'   '{U+038A}' // Iota with tonos
+stringdef O'   '{U+038C}' // Omicron with tonos
+stringdef U'   '{U+038E}' // Upsilon with tonos
+stringdef OO'  '{U+038F}' // Omega with tonos
+
+externals ( stem )
+
+booleans ( test1 )
+
+groupings ( v v2 )
+
+routines ( tolower has_min_length
+           steps1 steps2 steps3 steps4 steps5 steps6 steps7
+           steps8 steps9 steps10
+           step1 step2a step2b step2c step2d step3 step4
+           step5a step5b step5c step5d step5e step5f
+           step5g step5h step5i
+           step5j step5k step5l step5m
+           step6 step7 )
+
+define v '{a}{e}{i}{y}{o}{u}{oo}'
+define v2 '{a}{e}{i}{y}{o}{oo}'
+
+backwardmode (
+  define has_min_length as (
+    $(len >= 3)
+  )
+
+  define tolower as (
+    repeat (
+      [substring] among (
+        '{A}' (<- '{a}')
+        '{V}' (<- '{v}')
+        '{G}' (<- '{g}')
+        '{D}' (<- '{d}')
+        '{E}' (<- '{e}')
+        '{Z}' (<- '{z}')
+        '{I}' (<- '{i}')
+        '{Th}' (<- '{th}')
+        '{Y}' (<- '{y}')
+        '{K}' (<- '{k}')
+        '{L}' (<- '{l}')
+        '{M}' (<- '{m}')
+        '{N}' (<- '{n}')
+        '{X}' (<- '{x}')
+        '{O}' (<- '{o}')
+        '{P}' (<- '{p}')
+        '{R}' (<- '{r}')
+        '{S}' (<- '{s}')
+        '{T}' (<- '{t}')
+        '{U}' (<- '{u}')
+        '{F}' (<- '{f}')
+        '{Ch}' (<- '{ch}')
+        '{Ps}' (<- '{ps}')
+        '{Oo}' (<- '{oo}')
+        '{Y:}' (<- '{y}')
+        '{U:}' (<- '{u}')
+        '{a'}' (<- '{a}')
+        '{e'}' (<- '{e}')
+        '{i'}' (<- '{i}')
+        '{y'}' (<- '{y}')
+        '{o'}' (<- '{o}')
+        '{u'}' (<- '{u}')
+        '{oo'}' (<- '{oo}')
+        '{i:'}' (<- '{i}')
+        '{u:'}' (<- '{u}')
+        '{i:}' (<- '{i}')
+        '{u:}' (<- '{u}')
+        '{A'}' (<- '{a}')
+        '{E'}' (<- '{e}')
+        '{I'}' (<- '{i}')
+        '{Y'}' (<- '{y}')
+        '{O'}' (<- '{o}')
+        '{U'}' (<- '{u}')
+        '{OO'}' (<- '{oo}')
+        '{ss}' (<- '{s}')
+         '' (next)
+      )
+    )
+  )
+
+  define step1 as (
+    [substring] among (
+      '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}')
+      '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}')
+      '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}')
+      '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}')
+      '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}')
+      '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}')
+      '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}')
+      '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}')
+      '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}')
+      '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}')
+      '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}')
+    )
+    unset test1
+  )
+
+  define steps1 as (
+    [substring] among (
+      '{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}'
+      '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' (
+        delete
+        unset test1
+        ([] substring atlimit among (
+          '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}'
+          (<- '{y}')
+          '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}'
+          '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}'
+          '{r}' '{p}{y}{p}{e}{r}{o}{r}'
+          (<- '{y}{z}')
+        ))
+      )
+    )
+  )
+
+  define steps2 as (
+    [substring] among (
+      '{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' (
+        delete
+        unset test1
+        [] substring atlimit among (
+          '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}')
+        )
+      )
+    )
+  )
+
+  define steps3 as (
+    [substring] among (
+      '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' (
+        delete
+        unset test1
+        ('{y}{s}{a}' atlimit <- '{y}{s}') or
+        ([] substring atlimit among (
+          '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}'
+          '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}'
+          '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}'
+          (<- '{y}')
+          '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}'
+          '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}'
+          (<- '{y}{s}')
+        ))
+      )
+    )
+  )
+
+  define steps4 as (
+    [substring] among (
+      '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' (
+        delete
+        unset test1
+        [] substring atlimit among (
+          '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}'
+          '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}'
+          '{e}{k}{l}{e}' '{p}{e}'
+          (<- '{y}')
+        )
+      )
+    )
+  )
+
+  define steps5 as (
+    [substring] among (
+      '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}'
+      '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' (
+        delete
+        unset test1
+        ([] substring atlimit among (
+          '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}'
+          (<- '{y}')
+          '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}'
+          '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}'
+          '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}'
+          (<- '{y}{s}{t}')
+        ))
+      )
+    )
+  )
+
+  define steps6 as (
+    [substring] among (
+      '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' (
+        delete
+        unset test1
+        ([] substring atlimit among (
+          '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}'
+          (<- '{y}{s}{m}')
+          '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}'
+          (<- '{y}')
+        )) or
+        ([substring] among (
+          '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}')
+          '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}')
+          '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}')
+          '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}')
+          '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}')
+          '{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}')
+          '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}')
+          '{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}')
+          '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}')
+          '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}')
+        ))
+      )
+    )
+  )
+
+  define steps7 as (
+    [substring] among (
+      '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' (
+        delete
+        unset test1
+        [] substring atlimit among (
+         '{s}' '{ch}'
+         (<- '{a}{r}{a}{k}')
+        )
+      )
+    )
+  )
+
+  define steps8 as (
+    [substring] among (
+      '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' (
+        delete
+        unset test1
+        ([] substring atlimit among (
+          '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}'
+          '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}'
+          '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}'
+          '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}'
+           (<- '{a}{k}')
+          '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}'
+          '{p}{a}{t}{e}{r}' '{p}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}'
+          // We're implementing the revised algorithm from the Saroukos paper
+          // which also lists '{k}{o}{n}' and '{s}{k}' here, but these are
+          // also listed just above in the `Add {a}{k} in the end` exception.
+          // It seems they're redundant here, so we omit them (otherwise the
+          // Snowball compiler would report an error).
+          (<- '{y}{t}{s}')
+        )) or
+        ([] '{k}{o}{r}' <- '{y}{t}{s}')
+      )
+    )
+  )
+
+  define steps9 as (
+    [substring] among (
+      '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' (
+        delete
+        unset test1
+        ([] substring atlimit among (
+          '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}')
+        )) or
+        ([] substring among (
+          '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}')
+        ))
+      )
+    )
+  )
+
+  define steps10 as (
+    [substring] among (
+      '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' (
+        delete
+        unset test1
+        [] substring atlimit among (
+         '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}'
+         (<- '{y}{s}{k}')
+        )
+      )
+    )
+  )
+
+  define step2a as (
+    [substring] among (
+      '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete)
+    )
+    not (substring among (
+      '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}'
+    ))
+    insert '{a}{d}'
+  )
+
+  define step2b as (
+    [substring] among (
+      '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete)
+    )
+    [] substring among (
+      '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}')
+    )
+  )
+
+  define step2c as (
+    [substring] among (
+      '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete)
+    )
+    [] substring among (
+      '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}'
+      '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}')
+    )
+  )
+
+  define step2d as (
+    [substring] among (
+      '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1)
+    )
+    [] substring atlimit among (
+      '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}')
+    )
+  )
+
+  define step3 as (
+    [substring] among (
+      '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1)
+    )
+    ([] v <- '{y}')
+  )
+
+  define step4 as (
+    [substring] among (
+       '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1)
+    )
+    ([] v <- '{y}{k}') or
+    [] substring atlimit among (
+      '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}'
+      '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}'
+      '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}'
+      '{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}'
+      (<- '{y}{k}')
+    )
+  )
+
+  define step5a as (
+    do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}')
+    do (
+      [substring] among (
+        '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1)
+      )
+    )
+    ['{a}{m}{e}']
+    delete
+    unset test1
+    [] substring atlimit among (
+      '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}'
+      (<- '{a}{m}')
+    )
+  )
+
+  define step5b as (
+    do (
+      [substring] among (
+        '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}'
+        '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' (
+          delete
+          unset test1
+          [] substring atlimit among (
+            '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}')
+          )
+        )
+      )
+    )
+    ['{a}{n}{e}']
+    delete
+    unset test1
+    ([] v2 <- '{a}{n}') or
+    [] substring atlimit among (
+      '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}'
+      '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}'
+      '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}'
+      '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}'
+      '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}'
+      '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}'
+      '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}'
+      '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}'
+      '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}'
+      '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}'
+      '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}'
+      '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}'
+      '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}'
+      '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}'
+      '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}'
+      '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}'
+      (<- '{a}{n}')
+    )
+  )
+
+  define step5c as (
+    do (
+      [substring] among (
+        '{i}{s}{e}{t}{e}' (delete unset test1)
+      )
+    )
+    ['{e}{t}{e}']
+    delete
+    unset test1
+    ([] v2 <- '{e}{t}') or
+    ([] substring among (
+      '{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}'
+      '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}'
+      '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}'
+      (<- '{e}{t}')
+    )) or
+    [] substring atlimit among (
+      '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}'
+      '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}'
+      '{th}{a}{r}{r}' '{th}'
+      (<- '{e}{t}')
+    )
+  )
+
+  define step5d as (
+    [substring] among (
+      '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' (
+        delete
+        unset test1
+        ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or
+        ([] '{k}{r}{e}' <- '{oo}{n}{t}')
+      )
+    )
+  )
+
+  define step5e as (
+    [substring] among (
+      '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' (
+        delete
+        unset test1
+        ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}')
+      )
+    )
+  )
+
+  define step5f as (
+    do (
+      ['{y}{e}{s}{t}{e}']
+      delete
+      unset test1
+      [] substring atlimit among (
+        '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}')
+      )
+    )
+    ['{e}{s}{t}{e}']
+    delete
+    unset test1
+    [] substring atlimit among (
+      '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}'
+      (<- '{y}{e}{s}{t}')
+    )
+  )
+
+  define step5g as (
+    do (
+      [substring] among (
+        '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1)
+      )
+    )
+    [substring] among (
+      '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' (
+        delete
+        unset test1
+        ([] substring among (
+           '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}')
+        )) or
+        ([] substring atlimit among (
+           '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}')
+        ))
+      )
+    )
+  )
+
+  define step5h as (
+    [substring] among (
+      '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' (
+        delete
+        unset test1
+        ([] substring among (
+          '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}'
+          '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}')
+
+        )) or
+        ([] substring atlimit among (
+          '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}'
+          '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}'
+          '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}'
+          (<- '{o}{u}{s}')
+        ))
+      )
+    )
+  )
+
+  define step5i as (
+    [substring] among (
+      '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' (
+        delete
+        unset test1
+        ([] '{k}{o}{l}{l}' <- '{a}{g}') or (
+          ([] substring among (
+            '{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}' ()
+            '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}'
+            (<- '{a}{g}')
+          )) or
+          ([] substring atlimit among (
+            '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}'
+            '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}'
+            '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}'
+            '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}'
+            '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}'
+            (<- '{a}{g}')
+          ))
+        )
+      )
+    )
+  )
+
+  define step5j as (
+    [substring] among (
+      '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1)
+    )
+    [] substring atlimit among (
+      '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}')
+    )
+  )
+
+  define step5k as (
+    [substring] among (
+      '{i}{s}{t}{e}' (delete unset test1)
+    )
+    [] substring atlimit among (
+      '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}'
+      (<- '{i}{s}{t}')
+    )
+  )
+
+  define step5l as (
+    [substring] among (
+      '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1)
+    )
+    [] substring atlimit among (
+      '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}')
+    )
+  )
+
+  define step5m as (
+    [substring] among (
+      '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1)
+    )
+    [] substring atlimit among (
+      '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}'
+      (<- '{o}{u}{m}')
+    )
+  )
+
+  define step6 as (
+    do (
+      [substring] among (
+        '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}')
+      )
+    )
+    test1
+    [substring] among (
+      '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}'
+      '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}'
+      '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}'
+      '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}'
+      '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}'
+      '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}'
+      '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}'
+      '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}'
+      '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}'
+      '{oo}{n}' (delete)
+    )
+  )
+
+  define step7 as (
+    [substring] among (
+      '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete)
+    )
+  )
+)
+
+define stem as (
+    backwards (
+      do tolower
+      has_min_length
+      set test1
+      do step1
+      do steps1
+      do steps2
+      do steps3
+      do steps4
+      do steps5
+      do steps6
+      do steps7
+      do steps8
+      do steps9
+      do steps10
+      do step2a
+      do step2b
+      do step2c
+      do step2d
+      do step3
+      do step4
+      do step5a
+      do step5b
+      do step5c
+      do step5d
+      do step5e
+      do step5f
+      do step5g
+      do step5h
+      do step5j
+      do step5i
+      do step5k
+      do step5l
+      do step5m
+      do step6
+      do step7
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/greek/stemmer.tt b/algorithms/greek/stemmer.tt new file mode 100644 index 0000000..2327149 --- /dev/null +++ b/algorithms/greek/stemmer.tt @@ -0,0 +1,27 @@ +[% header('Greek stemming algorithm') %] + +

Links to resources

+ + + +

This is an implementation of the stemmer described in:

+ +
+ Ntais, Georgios. Development of a Stemmer for the Greek Language. Diss. Royal Institute of Technology, 2006. +
+ +

with additional improvements from:

+ +
+ Saroukos, Spyridon. Enhancing a Greek language stemmer. University of Tampere, 2008. +
+ +

The full algorithm in Snowball

+ +[% highlight_file('greek') %] + +[% footer %] diff --git a/algorithms/hindi/stemmer.html b/algorithms/hindi/stemmer.html new file mode 100644 index 0000000..fefc6fb --- /dev/null +++ b/algorithms/hindi/stemmer.html @@ -0,0 +1,474 @@ + + + + + + + + + + Hindi stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Hindi stemming algorithm

+ + +

Links to resources

+ + + +

+This is an implementation of the "Lightweight Stemmer for Hindi" described in: +

+ +
+ A. Ramanathan and D. Rao (2003) A Lightweight Stemmer for Hindi +
+ +

+The major difference in our implementation is that rather than transliterating +to the Latin alphabet we instead work in the original Devanagari script. We +have modified the suffixes in the list by converting them back to Devanagari +like so: +

+ +
    +
  • within the suffixes, "a" after a consonant is dropped since +consonants have an implicit "a". +
  • within the suffixes, a vowel other than "a" after a consonant +is a dependent vowel (vowel sign); a vowel (including "a") after a +non-consonant is an independent vowel. +
  • to allow for the vowel at the start of each suffix being dependent or +independent, we include each suffix twice. For the dependent version, a +leading "a" is dropped and we check that the suffix is preceded by a +consonant (which will have an implicit "a"). +
+ +

+The transliterations of our stems would end with "a" when our +stems end in a consonant, so we also include the character virama in the +list of suffixes to remove (this affects 222 words from our sample vocabulary). +

+ +

+Aside from this, our implementation attempts to be faithful to the algorithm +described in the paper, though in a few places we've had to resolve ambiguities +in the paper: +

+ +
    + +
  • +We assume that the whole word doesn't count as a valid suffix to remove, so we +remove the longest suffix from the list which leaves at least one character. +The paper doesn't seem to clearly state either way which is intended, but producing +an empty stem seems unhelpful in general. If we instead allowed an empty stem +to be produced this would result in a different stem for 47 words out of the +65,140 in our sample vocabulary from Hindi wikipedia. +
  • + +
  • +We add a to the list of suffixes to remove in figure 3. This is needed for +the example given right at the end of section 5 to work (conflating BarawIya +and BarawIyawA), and which §3.1 a.v strongly suggests should be in the list: +"Thus, the following suffix deletions (longest possible match) are required +to reduce inflected forms of masculine nouns to a common stem: a A i [...]" +Adding a only affects 2 words out of the 65,140 in our sample vocabulary.
  • + +
  • +We've also assumed that Mh in the suffix list isn't meant to match +M followed by h. Only one of the 65,140 words in the +sample vocabulary stems differently due to this (and that word +seems to be a typo). +
  • + +
+ +

The full algorithm in Snowball

+ +
// An implementation of "A Lightweight Stemmer for Hindi":
+// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf
+
+externals ( stem )
+
+stringescapes {}
+
+// The transliteration scheme used for our stringdefs matches that used in the
+// paper, as documented in the appendix.  It appears to match the WX notation
+// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently
+// uses 'z' for Anunasika whereas the paper uses Mh.
+//
+// We discriminate dependent vowels by adding a leading "_" to their stringdef
+// names (mnemonic: the _ signifies removing the implicit a from the preceding
+// character).
+
+// Vowels and sonorants:
+stringdef a  '{U+0905}'
+stringdef A  '{U+0906}'
+stringdef i  '{U+0907}'
+stringdef I  '{U+0908}'
+stringdef u  '{U+0909}'
+stringdef U  '{U+090A}'
+stringdef q  '{U+090B}'
+stringdef e  '{U+090F}'
+stringdef E  '{U+0910}'
+stringdef o  '{U+0913}'
+stringdef O  '{U+0914}'
+
+// Vowel signs:
+stringdef _A '{U+093E}'
+stringdef _i '{U+093F}'
+stringdef _I '{U+0940}'
+stringdef _u '{U+0941}'
+stringdef _U '{U+0942}'
+stringdef _q '{U+0943}'
+stringdef _e '{U+0947}'
+stringdef _E '{U+0948}'
+stringdef _o '{U+094B}'
+stringdef _O '{U+094C}'
+
+// Diacritics:
+stringdef M  '{U+0902}'
+stringdef H  '{U+0903}'
+stringdef Mh '{U+0901}'
+stringdef Z  '{U+093C}' // Nukta
+stringdef virama '{U+094D}'
+
+// Velar consonants:
+stringdef k  '{U+0915}'
+stringdef K  '{U+0916}'
+stringdef g  '{U+0917}'
+stringdef G  '{U+0918}'
+stringdef f  '{U+0919}'
+
+// Palatal consonants:
+stringdef c  '{U+091A}'
+stringdef C  '{U+091B}'
+stringdef j  '{U+091C}'
+stringdef J  '{U+091D}'
+stringdef F  '{U+091E}'
+
+// Retroflex consonants:
+stringdef t  '{U+091F}'
+stringdef T  '{U+0920}'
+stringdef d  '{U+0921}'
+stringdef D  '{U+0922}'
+stringdef N  '{U+0923}'
+
+// Dental consonants:
+stringdef w  '{U+0924}'
+stringdef W  '{U+0925}'
+stringdef x  '{U+0926}'
+stringdef X  '{U+0927}'
+stringdef n  '{U+0928}'
+
+// Labial consonants:
+stringdef p  '{U+092A}'
+stringdef P  '{U+092B}'
+stringdef b  '{U+092C}'
+stringdef B  '{U+092D}'
+stringdef m  '{U+092E}'
+
+// Semi-vowels:
+stringdef y  '{U+092F}'
+stringdef r  '{U+0930}'
+stringdef l  '{U+0932}'
+stringdef v  '{U+0935}'
+
+// Fricatives:
+stringdef S  '{U+0936}'
+stringdef R  '{U+0937}'
+stringdef s  '{U+0938}'
+stringdef h  '{U+0939}'
+
+stringdef lY '{U+0933}'
+
+// Precomposed characters - letters + nukta:
+stringdef nZ '{U+0929}' // ≡ {n}{Z}
+stringdef rZ '{U+0931}' // ≡ {r}{Z}
+stringdef lYZ '{U+0934}' // ≡ {lY}{Z}
+stringdef kZ '{U+0958}' // ≡ {k}{Z}
+stringdef KZ '{U+0959}' // ≡ {K}{Z}
+stringdef gZ '{U+095A}' // ≡ {g}{Z}
+stringdef jZ '{U+095B}' // ≡ {j}{Z}
+stringdef dZ '{U+095C}' // ≡ {d}{Z}
+stringdef DZ '{U+095D}' // ≡ {D}{Z}
+stringdef PZ '{U+095E}' // ≡ {P}{Z}
+stringdef yZ '{U+095F}' // ≡ {y}{Z}
+
+groupings ( consonant )
+
+routines ( CONSONANT )
+
+define consonant '{k}{K}{g}{G}{f}' +
+                 '{c}{C}{j}{J}{F}' +
+                 '{t}{T}{d}{D}{N}' +
+                 '{w}{W}{x}{X}{n}' +
+                 '{p}{P}{b}{B}{m}' +
+                 '{y}{r}{l}{v}' +
+                 '{S}{R}{s}{h}' +
+                 '{lY}' +
+                 '{Z}' + // Nukta
+                 // Precomposed characters - letter and nukta:
+                 '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}'
+
+backwardmode ( define CONSONANT as ( consonant ) )
+
+define stem as (
+    // We assume in this implementation that the whole word doesn't count
+    // as a valid suffix to remove, so we remove the longest suffix from
+    // the list which leaves at least one character.  This change affects
+    // 47 words out of the 65,140 in the sample vocabulary from Hindi
+    // wikipedia.
+    //
+    // The trick here is we use `next` in forward mode to advance the cursor
+    // to the second character, then `backwards` swaps the cursor and limit.
+    next
+    backwards (
+        [substring] among (
+            // The list below is derived from figure 3 in the paper.
+            //
+            // We perform the stemming on the Devanagari characters rather than
+            // transliterating to Latin, so we have adapted the list below to
+            // reflect this by converting suffixes back to Devanagari as
+            // follows:
+            //
+            // * within the suffixes, "a" after a consonant is dropped since
+            //   consonants have an implicit "a".
+            //
+            // * within the suffixes, a vowel other than "a" after a consonant
+            //   is a dependent vowel (vowel sign); a vowel (including "a")
+            //   after a non-consonant is an independent vowel.
+            //
+            // * to allow the vowel at the start of each suffix being dependent
+            //   or independent, we include each suffix twice.  For the
+            //   dependent version, a leading "a" is dropped and we check that
+            //   the suffix is preceded by a consonant (which will have an
+            //   implicit "a").
+            //
+            // * we add '{a}', which is needed for the example given right at
+            //   the end of section 5 to work (conflating BarawIya and
+            //   BarawIyawA), and which 3.1 a.v strongly suggests should be in
+            //   the list:
+            //
+            //     Thus, the following suffix deletions (longest possible
+            //     match) are required to reduce inflected forms of masculine
+            //     nouns to a common stem:
+            //     a A i [...]
+            //
+            //   Adding '{a}' only affects 2 words out of the 65,140 in the
+            //   sample vocabulary.
+            //
+            // * The transliterations of our stems would end with "a" when our
+            //   stems end in a consonant, so we also include {virama} in the
+            //   list of suffixes to remove (this affects 222 words from the
+            //   sample vocabulary).
+            //
+            // We've also assumed that Mh in the suffix list always means {Mh}
+            // and never {M}{h}{virama}.  Only one of the 65,140 words in the
+            // sample vocabulary stems differently due to this (and that word
+            // seems to be a typo).
+
+            '{virama}'
+
+            '{a}'
+            '{A}'
+            '{i}'
+            '{I}'
+            '{u}'
+            '{U}'
+            '{e}'
+            '{o}'
+            '{e}{M}'
+            '{o}{M}'
+            '{A}{M}'
+            '{u}{A}{M}'
+            '{u}{e}{M}'
+            '{u}{o}{M}'
+            '{A}{e}{M}'
+            '{A}{o}{M}'
+            '{i}{y}{_A}{M}'
+            '{i}{y}{_o}{M}'
+            '{A}{i}{y}{_A}{M}'
+            '{A}{i}{y}{_o}{M}'
+            '{A}{Mh}'
+            '{i}{y}{_A}{Mh}'
+            '{A}{i}{y}{_A}{Mh}'
+            '{a}{w}{_A}{e}{M}'
+            '{a}{w}{_A}{o}{M}'
+            '{a}{n}{_A}{e}{M}'
+            '{a}{n}{_A}{o}{M}'
+            '{a}{w}{_A}'
+            '{a}{w}{_I}'
+            '{I}{M}'
+            '{a}{w}{_I}{M}'
+            '{a}{w}{_e}'
+            '{A}{w}{_A}'
+            '{A}{w}{_I}'
+            '{A}{w}{_I}{M}'
+            '{A}{w}{_e}'
+            '{a}{n}{_A}'
+            '{a}{n}{_I}'
+            '{a}{n}{_e}'
+            '{A}{n}{_A}'
+            '{A}{n}{_e}'
+            '{U}{M}{g}{_A}'
+            '{U}{M}{g}{_I}'
+            '{A}{U}{M}{g}{_A}'
+            '{A}{U}{M}{g}{_I}'
+            '{e}{M}{g}{_e}'
+            '{e}{M}{g}{_I}'
+            '{A}{e}{M}{g}{_e}'
+            '{A}{e}{M}{g}{_I}'
+            '{o}{g}{_e}'
+            '{o}{g}{_I}'
+            '{A}{o}{g}{_e}'
+            '{A}{o}{g}{_I}'
+            '{e}{g}{_A}'
+            '{e}{g}{_I}'
+            '{A}{e}{g}{_A}'
+            '{A}{e}{g}{_I}'
+            '{A}{y}{_A}'
+            '{A}{e}'
+            '{A}{I}'
+            '{A}{I}{M}'
+            '{i}{e}'
+            '{A}{o}'
+            '{A}{i}{e}'
+            '{a}{k}{r}'
+            '{A}{k}{r}'
+
+            '{_A}'
+            '{_i}'
+            '{_I}'
+            '{_u}'
+            '{_U}'
+            '{_e}'
+            '{_o}'
+            '{_e}{M}'
+            '{_o}{M}'
+            '{_A}{M}'
+            '{_u}{A}{M}'
+            '{_u}{e}{M}'
+            '{_u}{o}{M}'
+            '{_A}{e}{M}'
+            '{_A}{o}{M}'
+            '{_i}{y}{_A}{M}'
+            '{_i}{y}{_o}{M}'
+            '{_A}{i}{y}{_A}{M}'
+            '{_A}{i}{y}{_o}{M}'
+            '{_A}{Mh}'
+            '{_i}{y}{_A}{Mh}'
+            '{_A}{i}{y}{_A}{Mh}'
+            '{_I}{M}'
+            '{_A}{w}{_A}'
+            '{_A}{w}{_I}'
+            '{_A}{w}{_I}{M}'
+            '{_A}{w}{_e}'
+            '{_A}{n}{_A}'
+            '{_A}{n}{_e}'
+            '{_U}{M}{g}{_A}'
+            '{_U}{M}{g}{_I}'
+            '{_A}{U}{M}{g}{_A}'
+            '{_A}{U}{M}{g}{_I}'
+            '{_e}{M}{g}{_e}'
+            '{_e}{M}{g}{_I}'
+            '{_A}{e}{M}{g}{_e}'
+            '{_A}{e}{M}{g}{_I}'
+            '{_o}{g}{_e}'
+            '{_o}{g}{_I}'
+            '{_A}{o}{g}{_e}'
+            '{_A}{o}{g}{_I}'
+            '{_e}{g}{_A}'
+            '{_e}{g}{_I}'
+            '{_A}{e}{g}{_A}'
+            '{_A}{e}{g}{_I}'
+            '{_A}{y}{_A}'
+            '{_A}{e}'
+            '{_A}{I}'
+            '{_A}{I}{M}'
+            '{_i}{e}'
+            '{_A}{o}'
+            '{_A}{i}{e}'
+            '{_A}{k}{r}'
+
+            /* Suffixes with a leading implicit a: */
+            '{w}{_A}{e}{M}' CONSONANT
+            '{w}{_A}{o}{M}' CONSONANT
+            '{n}{_A}{e}{M}' CONSONANT
+            '{n}{_A}{o}{M}' CONSONANT
+            '{w}{_A}' CONSONANT
+            '{w}{_I}' CONSONANT
+            '{w}{_I}{M}' CONSONANT
+            '{w}{_e}' CONSONANT
+            '{n}{_A}' CONSONANT
+            '{n}{_I}' CONSONANT
+            '{n}{_e}' CONSONANT
+            '{k}{r}' CONSONANT
+        )
+        delete
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/hindi/stemmer.tt b/algorithms/hindi/stemmer.tt new file mode 100644 index 0000000..8300a82 --- /dev/null +++ b/algorithms/hindi/stemmer.tt @@ -0,0 +1,83 @@ +[% header('Hindi stemming algorithm') %] + +

Links to resources

+ + + +

+This is an implementation of the "Lightweight Stemmer for Hindi" described in: +

+ +
+ A. Ramanathan and D. Rao (2003) A Lightweight Stemmer for Hindi +
+ +

+The major difference in our implementation is that rather than transliterating +to the Latin alphabet we instead work in the original Devanagari script. We +have modified the suffixes in the list by converting them back to Devanagari +like so: +

+ + + +

+The transliterations of our stems would end with "a" when our +stems end in a consonant, so we also include the character virama in the +list of suffixes to remove (this affects 222 words from our sample vocabulary). +

+ +

+Aside from this, our implementation attempts to be faithful to the algorithm +described in the paper, though in a few places we've had to resolve ambiguities +in the paper: +

+ + + +

The full algorithm in Snowball

+ +[% highlight_file('hindi') %] + +[% footer %] diff --git a/algorithms/hungarian/stemmer.html b/algorithms/hungarian/stemmer.html new file mode 100644 index 0000000..fd38c93 --- /dev/null +++ b/algorithms/hungarian/stemmer.html @@ -0,0 +1,741 @@ + + + + + + + + + + Hungarian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Hungarian stemming algorithm

+ + +

Contributed by Anna Tordai, University of Amsterdam

+ +

Links to resources

+ + + +

+Here is a sample of Hungarian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+babaháznak
+babakocsi
+babakocsijáért
+babakocsit
+babakocsiért
+babból
+bab
+babgulyás
+babgulyást
+babona
+babonákkal
+babonás
+babrálgatta
+babrálni
+babrál
+babrált
+babrálva
+babusgatnak
+baba
+babái
+babák
+babákkal
+babázni
+babérfa
+babérokat
+babért
+bacchánsnők
+badacsonyi
+badarság
+badarságok
+baedeker
+baglyokat
+bagolyszemüveges
+bagót
+bajbajutott
+bajbajutottak
+bajbajutottakat
+bajbajutottakon
+bajlódjanak
+bajlódni
+
+babaház
+babakocs
+babakocs
+babakocs
+babakocs
+bab
+bab
+babgulyás
+babgulyás
+babon
+babona
+babonás
+babrálgatt
+babráln
+babrál
+babrál
+babrálv
+babusgat
+ba
+baba
+baba
+baba
+babázn
+babérf
+babér
+bab
+bacchánsnő
+badacsony
+badarság
+badarság
+baedeker
+bagly
+bagolyszemüveges
+bagó
+bajbajutot
+bajbajutott
+bajbajutott
+bajbajutott
+bajlód
+bajlódn
+
+muattta
+mukkot
+mulandóság
+mulandóságot
+mulasszátok
+mulasztanak
+mulasztotta
+mulasztottam
+mulasztották
+mulaszt
+mulaszthatom
+mulasztás
+mulasztásban
+mulasztásból
+mulasztásnál
+mulasztással
+mulasztásának
+mulasztásánál
+mulasztásáért
+mulasztási
+mulasztásos
+mulasztó
+mulathatnánk
+mulathattunk
+mulatna
+mulat
+mulatnak
+mulatni
+mulattak
+mulattat
+mulattatta
+mulatott
+mulatozott
+mulatozáshoz
+mulatozást
+mulatság
+mulatságnak
+mulatságot
+mulatságos
+mulatt
+
+muattt
+muk
+mulandóság
+mulandóság
+mulasszát
+mulaszt
+mulasztott
+mulasztott
+mulasztotta
+mulasz
+mulaszthat
+mulasztás
+mulasztás
+mulasztás
+mulasztás
+mulasztás
+mulasztás
+mulasztás
+mulasztás
+mulasztás
+mulasztásos
+mulasztó
+mulathatna
+mulathatt
+mulatn
+mul
+mulat
+mulatn
+mulatt
+mulatt
+mulattatt
+mulatot
+mulatozot
+mulatozás
+mulatozás
+mulatság
+mulatság
+mulatság
+mulatságos
+mulat
+
+
+ +

+This stemming algorithm removes the inflectional suffixes of nouns. Nouns are +inflected for case, person/possession and number. +

+ +

+Letters in Hungarian include the following accented forms, +

+ +
+ á   é   í   ó   ö   ő   ú   ü   ű +
+ +

+The following letters are vowels: +

+ +
+ a   á   e   é   i   í   o   ó   ö   ő   u   ú   + ü   ű +
+ +

+The following letters are digraphs: +

+ +
+ cs   dz   dzs   gy   ly   ny   ty   zs +
+ +

+A double consonant is defined as: +

+ +
+ bb   cc   ccs   dd   ff   gg   ggy   jj   kk   ll   lly   mm +   nn   nny   pp   rr   ss   ssz   tt   tty   vv   zz   zzs +
+ +

+If the word begins with a vowel, R1 is defined as the region after the +first consonant or digraph in the word. If the word begins with a consonant, it +is defined as the region after the first vowel in the word. If the word does +not contain both a vowel and consonant, R1 is the null region at the end of +the word. +

+ +

+For example: +

+ +
+    t ó b a n           consonant-vowel
+       |.....|          R1 is 'a b a n'
+
+    a b l a k a n       vowel-consonant
+       |.........|      R1 is 'l a k a n'
+
+    a c s o n y         vowel-digraph
+         |.....|        R1 is 'o n y'
+
+    c v s
+     --->|<---          null R1 region
+
+ +

+‘Delete if in R1’ means that the suffix should be removed if it is in +region R1 but not if it is outside. +

+ +

+Do steps 1 to 9 in turn +

+ +

+Step 1: Remove instrumental case +

+ +
+ Search for one of the following suffixes and perform the action indicated. +
+
al   el +
delete if in R1 and preceded by a double consonant, and + remove one of the double consonants. (In the case of consonant plus digraph, such as ccs, remove a c). +
+
+ +

+Step 2: Remove frequent cases +

+ +
+ Search for the longest among the following suffixes and perform the action indicated. +
+
ban   ben   ba   be   ra   re   nak   nek   val   vel   tól +   től   ról   ről   ból   ből   hoz   hez   höz   + nál   nél   ig   at   et   ot   öt   ért   képp   + képpen   kor   ul   ül   vá   vé   onként   enként   + anként   ként   en   on   an   ön   n   t + + +
delete if in R1 +
if the remaining word ends á replace by a +
if the remaining word ends é replace by e +
+
+ +

+Step 3: Remove special cases: +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
án   ánként +
replace by a if in R1 +
én +
replace by e if in R1 +
+
+ +

+Step 4: Remove other cases: +

+ +
+ Search for the longest among the following suffixes and perform the action indicated +
+
astul   estül   stul   stül +
delete if in R1 +
ástul +
replace with a if in R1 +
éstül +
replace with e if in R1 +
+
+ +

+Step 5: Remove factive case +

+ +
+ Search for one of the following suffixes and perform the action indicated. +
+
á   é +
delete if in R1 and preceded by a double consonant, and + remove one of the double consonants (as in step 1). +
+
+ +

+Step 6: Remove owned +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
oké   öké   aké   eké   ké   éi   é +
delete if in R1 +
áké   áéi +
replace with a if in R1 +
éké   ééi   éé +
replace with e if in R1 +
+
+ +

+Step 7: Remove singular owner suffixes +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
ünk &#160; unk &#160; nk &#160; juk &#160; jük &#160; uk &#160; ük &#160; em &#160; om &#160; am &#160; m + &#160; od &#160; ed &#160; ad &#160; öd &#160; d &#160; ja &#160; je &#160; a &#160; e &#160; o
delete if in R1 +
ánk ájuk ám ád á +
replace with a if in R1 +
énk éjük ém éd é +
replace with e if in R1 +
+
+ +

+Step 8: Remove plural owner suffixes +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
jaim   jeim   aim   eim   im   jaid   jeid   aid   eid   id   + jai   jei   ai   ei   i   jaink   jeink   eink   aink   ink   + jaitok   jeitek   aitok   eitek   itek   jeik   jaik   aik   eik   + ik + +
delete if in R1 +
áim   áid   ái   áink   áitok   áik +
replace with a if in R1 +
éim   éid     éi   éink   éitek   éik +
replace with e if in R1 +
+
+ +

+Step 9: Remove plural suffixes +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
ák +
replace with a if in R1 +
ék
replace with e if in R1
ök   ok   ek   ak   k +
delete if in R1 +
+
+ +

The full algorithm in Snowball

+ +
/*
+Hungarian Stemmer
+Removes noun inflections
+*/
+
+routines (
+    mark_regions
+    R1
+    v_ending
+    case
+    case_special
+    case_other
+    plural
+    owned
+    sing_owner
+    plur_owner
+    instrum
+    factive
+    undouble
+    double
+)
+
+externals ( stem )
+
+integers ( p1 )
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a'  '{U+00E1}'  //a-acute
+stringdef e'  '{U+00E9}'  //e-acute
+stringdef i'  '{U+00ED}'  //i-acute
+stringdef o'  '{U+00F3}'  //o-acute
+stringdef o"  '{U+00F6}'  //o-umlaut
+stringdef oq  '{U+0151}' //o-double acute
+stringdef u'  '{U+00FA}'  //u-acute
+stringdef u"  '{U+00FC}'  //u-umlaut
+stringdef uq  '{U+0171}' //u-double acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
+
+define mark_regions as (
+
+    $p1 = limit
+
+    (v goto non-v
+     among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
+     setmark p1)
+    or
+
+    (non-v gopast v setmark p1)
+)
+
+backwardmode (
+
+    define R1 as $p1 <= cursor
+
+    define v_ending as (
+        [substring] R1 among(
+            '{a'}' (<- 'a')
+            '{e'}' (<- 'e')
+        )
+    )
+
+    define double as (
+        test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
+        'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
+    )
+
+    define undouble as (
+        next [hop 1] delete
+    )
+
+    define instrum as(
+        [substring] R1 among(
+            'al' (double)
+            'el' (double)
+        )
+        delete
+        undouble
+    )
+
+
+    define case as (
+        [substring] R1 among(
+            'ban' 'ben'
+            'ba' 'be'
+            'ra' 're'
+            'nak' 'nek'
+            'val' 'vel'
+            't{o'}l' 't{oq}l'
+            'r{o'}l' 'r{oq}l'
+            'b{o'}l' 'b{oq}l'
+            'hoz' 'hez' 'h{o"}z'
+            'n{a'}l' 'n{e'}l'
+            'ig'
+            'at' 'et' 'ot' '{o"}t'
+            '{e'}rt'
+            'k{e'}pp' 'k{e'}ppen'
+            'kor'
+            'ul' '{u"}l'
+            'v{a'}' 'v{e'}'
+            'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
+            'k{e'}nt'
+            'en' 'on' 'an' '{o"}n'
+            'n'
+            't'
+        )
+        delete
+        v_ending
+    )
+
+    define case_special as(
+        [substring] R1 among(
+            '{e'}n' (<- 'e')
+            '{a'}n' (<- 'a')
+            '{a'}nk{e'}nt' (<- 'a')
+        )
+    )
+
+    define case_other as(
+        [substring] R1 among(
+            'astul' 'est{u"}l' (delete)
+            'stul' 'st{u"}l' (delete)
+            '{a'}stul' (<- 'a')
+            '{e'}st{u"}l' (<- 'e')
+        )
+    )
+
+    define factive as(
+        [substring] R1 among(
+            '{a'}' (double)
+            '{e'}' (double)
+        )
+        delete
+        undouble
+    )
+
+    define plural as (
+        [substring] R1 among(
+            '{a'}k' (<- 'a')
+            '{e'}k' (<- 'e')
+            '{o"}k' (delete)
+            'ak' (delete)
+            'ok' (delete)
+            'ek' (delete)
+            'k' (delete)
+        )
+    )
+
+    define owned as (
+        [substring] R1 among (
+            'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
+            '{e'}k{e'}' (<- 'e')
+            '{a'}k{e'}' (<- 'a')
+            'k{e'}' (delete)
+            '{e'}{e'}i' (<- 'e')
+            '{a'}{e'}i' (<- 'a')
+            '{e'}i'  (delete)
+            '{e'}{e'}' (<- 'e')
+            '{e'}' (delete)
+        )
+    )
+
+    define sing_owner as (
+        [substring] R1 among(
+            '{u"}nk' 'unk' (delete)
+            '{a'}nk' (<- 'a')
+            '{e'}nk' (<- 'e')
+            'nk' (delete)
+            '{a'}juk' (<- 'a')
+            '{e'}j{u"}k' (<- 'e')
+            'juk' 'j{u"}k' (delete)
+            'uk' '{u"}k' (delete)
+            'em' 'om' 'am' (delete)
+            '{a'}m' (<- 'a')
+            '{e'}m' (<- 'e')
+            'm' (delete)
+            'od' 'ed' 'ad' '{o"}d' (delete)
+            '{a'}d' (<- 'a')
+            '{e'}d' (<- 'e')
+            'd' (delete)
+            'ja' 'je' (delete)
+            'a' 'e' 'o' (delete)
+            '{a'}' (<- 'a')
+            '{e'}' (<- 'e')
+        )
+    )
+
+    define plur_owner as (
+        [substring] R1 among(
+            'jaim' 'jeim' (delete)
+            '{a'}im' (<- 'a')
+            '{e'}im' (<- 'e')
+            'aim' 'eim' (delete)
+            'im' (delete)
+            'jaid' 'jeid' (delete)
+            '{a'}id' (<- 'a')
+            '{e'}id' (<- 'e')
+            'aid' 'eid' (delete)
+            'id' (delete)
+            'jai' 'jei' (delete)
+            '{a'}i' (<- 'a')
+            '{e'}i' (<- 'e')
+            'ai' 'ei' (delete)
+            'i' (delete)
+            'jaink' 'jeink' (delete)
+            'eink' 'aink' (delete)
+            '{a'}ink' (<- 'a')
+            '{e'}ink' (<- 'e')
+            'ink'
+            'jaitok' 'jeitek' (delete)
+            'aitok' 'eitek' (delete)
+            '{a'}itok' (<- 'a')
+            '{e'}itek' (<- 'e')
+            'itek' (delete)
+            'jeik' 'jaik' (delete)
+            'aik' 'eik' (delete)
+            '{a'}ik' (<- 'a')
+            '{e'}ik' (<- 'e')
+            'ik' (delete)
+        )
+    )
+)
+
+define stem as (
+    do mark_regions
+    backwards (
+      do instrum
+        do case
+        do case_special
+        do case_other
+        do factive
+        do owned
+        do sing_owner
+        do plur_owner
+        do plural
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/hungarian/stemmer.tt b/algorithms/hungarian/stemmer.tt new file mode 100644 index 0000000..18e64e4 --- /dev/null +++ b/algorithms/hungarian/stemmer.tt @@ -0,0 +1,243 @@ +[% header('Hungarian stemming algorithm') %] + +

Contributed by Anna Tordai, University of Amsterdam

+ +

Links to resources

+ + + +[% algorithm_vocab([40, 'babaháznak', 'muattta']) %] + +

+This stemming algorithm removes the inflectional suffixes of nouns. Nouns are +inflected for case, person/possession and number. +

+ +

+Letters in Hungarian include the following accented forms, +

+ +
+ á   é   í   ó   ö   ő   ú   ü   ű +
+ +

+The following letters are vowels: +

+ +
+ a   á   e   é   i   í   o   ó   ö   ő   u   ú   + ü   ű +
+ +

+The following letters are digraphs: +

+ +
+ cs   dz   dzs   gy   ly   ny   ty   zs +
+ +

+A double consonant is defined as: +

+ +
+ bb   cc   ccs   dd   ff   gg   ggy   jj   kk   ll   lly   mm +   nn   nny   pp   rr   ss   ssz   tt   tty   vv   zz   zzs +
+ +

+If the word begins with a vowel, R1 is defined as the region after the +first consonant or digraph in the word. If the word begins with a consonant, it +is defined as the region after the first vowel in the word. If the word does +not contain both a vowel and consonant, R1 is the null region at the end of +the word. +

+ +

+For example: +

+ +
+    t ó b a n           consonant-vowel
+       |.....|          R1 is 'a b a n'
+
+    a b l a k a n       vowel-consonant
+       |.........|      R1 is 'l a k a n'
+
+    a c s o n y         vowel-digraph
+         |.....|        R1 is 'o n y'
+
+    c v s
+     --->|<---          null R1 region
+
+ +

+‘Delete if in R1’ means that the suffix should be removed if it is in +region R1 but not if it is outside. +

+ +

+Do steps 1 to 9 in turn +

+ +

+Step 1: Remove instrumental case +

+ +
+ Search for one of the following suffixes and perform the action indicated. +
+
al   el +
delete if in R1 and preceded by a double consonant, and + remove one of the double consonants. (In the case of consonant plus digraph, such as ccs, remove a c). +
+
+ +

+Step 2: Remove frequent cases +

+ +
+ Search for the longest among the following suffixes and perform the action indicated. +
+
ban   ben   ba   be   ra   re   nak   nek   val   vel   tól +   től   ról   ről   ból   ből   hoz   hez   höz   + nál   nél   ig   at   et   ot   öt   ért   képp   + képpen   kor   ul   ül   vá   vé   onként   enként   + anként   ként   en   on   an   ön   n   t + + +
delete if in R1 +
if the remaining word ends á replace by a +
if the remaining word ends é replace by e +
+
+ +

+Step 3: Remove special cases: +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
án   ánként +
replace by a if in R1 +
én +
replace by e if in R1 +
+
+ +

+Step 4: Remove other cases: +

+ +
+ Search for the longest among the following suffixes and perform the action indicated +
+
astul   estül   stul   stül +
delete if in R1 +
ástul +
replace with a if in R1 +
éstül +
replace with e if in R1 +
+
+ +

+Step 5: Remove factive case +

+ +
+ Search for one of the following suffixes and perform the action indicated. +
+
á   é +
delete if in R1 and preceded by a double consonant, and + remove one of the double consonants (as in step 1). +
+
+ +

+Step 6: Remove owned +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
oké   öké   aké   eké   ké   éi   é +
delete if in R1 +
áké   áéi +
replace with a if in R1 +
éké   ééi   éé +
replace with e if in R1 +
+
+ +

+Step 7: Remove singular owner suffixes +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
ünk &#160; unk &#160; nk &#160; juk &#160; jük &#160; uk &#160; ük &#160; em &#160; om &#160; am &#160; m + &#160; od &#160; ed &#160; ad &#160; öd &#160; d &#160; ja &#160; je &#160; a &#160; e &#160; o
delete if in R1 +
ánk ájuk ám ád á +
replace with a if in R1 +
énk éjük ém éd é +
replace with e if in R1 +
+
+ +

+Step 8: Remove plural owner suffixes +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
jaim   jeim   aim   eim   im   jaid   jeid   aid   eid   id   + jai   jei   ai   ei   i   jaink   jeink   eink   aink   ink   + jaitok   jeitek   aitok   eitek   itek   jeik   jaik   aik   eik   + ik + +
delete if in R1 +
áim   áid   ái   áink   áitok   áik +
replace with a if in R1 +
éim   éid     éi   éink   éitek   éik +
replace with e if in R1 +
+
+ +

+Step 9: Remove plural suffixes +

+ +
+ Search for the longest among the following suffixes and perform the action + indicated. +
+
ák +
replace with a if in R1 +
ék
replace with e if in R1
ök   ok   ek   ak   k +
delete if in R1 +
+
+ +

The full algorithm in Snowball

+ +[% highlight_file('hungarian') %] + +[% footer %] diff --git a/algorithms/hungarian/stop.txt b/algorithms/hungarian/stop.txt new file mode 100644 index 0000000..2599a8d --- /dev/null +++ b/algorithms/hungarian/stop.txt @@ -0,0 +1,203 @@ + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/algorithms/index.html b/algorithms/index.html new file mode 100644 index 0000000..287ec3d --- /dev/null +++ b/algorithms/index.html @@ -0,0 +1,170 @@ + + + + + + + + + + Stemming algorithms - Snowball + + + + + + + + + + +
+
+ +
+

Stemming algorithms

+ + +

Stemming for various European languages

+ +

+We present stemming algorithms (with implementations in Snowball) for the +following languages: +

+ + + +

+There are two English stemmers, the original Porter stemmer, +and an improved stemmer which has been called Porter2. Read the accounts of them to +learn a bit more about using Snowball. +

+ +

+Each formal algorithm should be compared with the corresponding Snowball program. +

+ +

+Surprisingly, among the Indo-European languages (*), the French stemmer turns out to be the most complicated, whereas +the Russian stemmer, despite its large number of suffixes, is very simple. In +fact it is interesting that English, with its minimal use of i-suffixes, +has such a complex stemmer. This is partly due to the delicate nature of +i-suffix removal (undoubling the p after removing ing from hopping etc), +and partly to the wealth of forms of d-suffixes, deriving as they do from +the mixed Romance and Germanic ancestry of the language. +

+ +

+Note that by i-suffix we mean inflexional suffix, and by d-suffix, +derivational suffix (*). +

+ +

Other Stemming Algorithms

+ +

+We also provide Snowball implementations of some algorithms developed by other parties: +

+ + + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/index.tt b/algorithms/index.tt new file mode 100644 index 0000000..58c622e --- /dev/null +++ b/algorithms/index.tt @@ -0,0 +1,92 @@ +[% header('Stemming algorithms') %] + +

Stemming for various European languages

+ +

+We present stemming algorithms (with implementations in Snowball) for the +following languages: +

+ + + +

+There are two English stemmers, the original Porter stemmer, +and an improved stemmer which has been called Porter2. Read the accounts of them to +learn a bit more about using Snowball. +

+ +

+Each formal algorithm should be compared with the corresponding Snowball program. +

+ +

+Surprisingly, among the Indo-European languages (*), the French stemmer turns out to be the most complicated, whereas +the Russian stemmer, despite its large number of suffixes, is very simple. In +fact it is interesting that English, with its minimal use of i-suffixes, +has such a complex stemmer. This is partly due to the delicate nature of +i-suffix removal (undoubling the p after removing ing from hopping etc), +and partly to the wealth of forms of d-suffixes, deriving as they do from +the mixed Romance and Germanic ancestry of the language. +

+ +

+Note that by i-suffix we mean inflexional suffix, and by d-suffix, +derivational suffix (*). +

+ +

Other Stemming Algorithms

+ +

+We also provide Snowball implementations of some algorithms developed by other parties: +

+ + + +[% footer %] diff --git a/algorithms/indonesian/stemmer.html b/algorithms/indonesian/stemmer.html new file mode 100644 index 0000000..e50f4ac --- /dev/null +++ b/algorithms/indonesian/stemmer.html @@ -0,0 +1,385 @@ + + + + + + + + + + Indonesian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Indonesian stemming algorithm

+ + +

Links to resources

+ + + +

+This is an implementation of the "Porter Stemmer for Bahasa Indonesia" described +in: +

+ +
+ Tala F Z (2003) A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia. M.S. thesis, University of Amsterdam. +
+ +

+It would be more accurately described as "Porter-style" or "Porter-inspired" +since Martin Porter wasn't directly involved in its development. +

+ +

+Our implementation attempts to be faithful to the algorithm described in the +paper, but we have had to address some places in the paper which are unclear, +and a case where an example doesn't match the described algorithm. +

+ +
    +
  • +

    +In table 2.7 on page 9, the additional condition on the remaining stem for +removing the suffix "i" reads "V|K...c1c1, c1 +≠ s, c2 ≠ i and prefix ∉ {ber, ke, peng}". +

    + +

    +The meaning of this is unclear in several ways, and none of the +examples given of the stemmer's behaviour in the paper help to +resolve these issues. +

    + +

    +Notice that c2 isn't actually used - the most obvious explanation +seems to be that "c1c1" should read +"c1c2", or maybe "c2c1". +

    + +

    +Elsewhere the paper defines V... as meaning "the stem starts with +a vowel" and K... as meaning "the stem starts with a consonant". +

    + +

    +In other places where it says X|Y... it seems the | binds more +tightly, so it's (V|K)...cicj not +V|(K...cicj). That seems a bit +odd as the first letter must be either a vowel or a consonant, so +that really just means "ends cicj". However, nowhere in +the paper uses or defines a notation such as ...X, which may explain this +seemingly redundant way of specifying this. +

    + +

    +The conditions elsewhere on prefix removal (e.g. V...) are clearly +on the stem left after the prefix is removed. None of the other +rules for suffix removal have conditions on the stem, but for +consistency with the prefix rules we might expect that the +cicj test is on what's left after removing the +"i" suffix. +

    + +

    +However, studying Indonesian wordlists and discussion with a native +speaker leads us to conclude that the purpose of this check is to +protect words of foreign origin (e.g. "televisi", "organisasi", +"komunikasi") from stemming, and the common feature of these is +that the word ends "-si", so we conclude that the condition here +should be read as "word does not end -si", and this is what we +have implemented. +

    +
  • + +
  • +

    +On page 29, the example "kompas Q.31" says "Both Nazief and Porter stemmer +converted the word peledakan (blast, explotion) to ledak (to +blast, to explode)". However, the algorithm as described doesn't behave in +this way - grammatically the prefix pe- occurs as a variation of both the +first-order derivational prefix peng- and the second-order derivational prefix +per-, but table 2.5 doesn't include "pe", only table 2.6 does, so "peledakan" +is handled (incorrectly) as having prefix "per" not "peng", and so we remove +derivational suffix "kan" rather than "an" to give stem leda. +(Porter-style stemmers remove the longest suffix they can amongst those +available, which this paper notes in the last paragraph on page 15). +

    + +

    +We resolve this by amending the condition on suffix "kan" to "prefix ∉ +{ke, peng, per}", which seems to make the stemmer's behaviour match all the +examples in the paper except for one: "perbaikan" is shown in table 3.4 +as stemming to "bai", but with this change it now stems to "baik". The +table notes that "baik" is the actual root so this deviation is an +improvement. In a sample vocabulary derived from the most common words in +id.wikipedia.org, this change only affects 0.12% of words (76 out of 64,587, +including "peledakan" and "perbaikan"). +

    +
  • + +
  • +The paper has the condition on removal of prefix "bel" and "pel" as +just "ajar" not "ajar..." but it seems that the latter must be what +is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". +This change only affects a very small number of words (11 out of +64,587), and only for the better. +
  • +
+ +

The full algorithm in Snowball

+ +
// An implementation of the "Porter Stemmer for Bahasa Indonesia" from:
+// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf
+
+integers (
+    // The paper defines measure as the number of vowels in the word.  We
+    // count this initially, then adjust the count each time we remove a
+    // prefix or suffix.
+    measure
+
+    // Numeric code for the type of prefix removed:
+    //
+    // 0 other/none
+    // 1 'di' or 'meng' or 'ter'
+    // 2 'per'
+    // 3 'ke' or 'peng'
+    // 4 'ber'
+    //
+    // Some of these have variant forms, so e.g. "meng" includes "men", "me",
+    // "meny", "mem".
+    //
+    // Note that the value of prefix is only used in remove_suffix (and
+    // routines it calls) so we don't need to worry about
+    // remove_second_order_prefix overwriting a value of prefix set by
+    // remove_first_order_prefix since remove_suffix gets called between
+    // the two.
+    prefix
+)
+
+groupings ( vowel )
+
+routines (
+    remove_particle
+    remove_possessive_pronoun
+    remove_first_order_prefix
+    remove_second_order_prefix
+    remove_suffix
+    KER
+    SUFFIX_KAN_OK
+    SUFFIX_AN_OK
+    SUFFIX_I_OK
+    VOWEL
+)
+
+externals ( stem )
+
+stringescapes {}
+
+backwardmode (
+
+    define remove_particle as (
+        [substring] among (
+            'kah' 'lah' 'pun' (delete $measure-=1)
+        )
+    )
+
+    define remove_possessive_pronoun as (
+        [substring] among (
+            'ku' 'mu' 'nya' (delete $measure-=1)
+        )
+    )
+
+    // prefix not in {ke, peng, per}
+    define SUFFIX_KAN_OK as (
+        // On page 29, the example "kompas Q.31" says "Both Nazief and Porter
+        // stemmer converted the word peledakan (blast, explotion [sic]) to
+        // ledak (to blast, to explode)".  However, the algorithm as described
+        // doesn't behave in this way - grammatically the prefix pe- occurs as a
+        // variation of both the first-order derivational prefix peng- and the
+        // second-order derivational prefix per-, but table 2.5 doesn't include
+        // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
+        // as having prefix "per" not "peng", and so we remove derivational
+        // suffix "kan" rather than "an" to give stem leda.  (Porter-style
+        // stemmers remove the longest suffix they can amongst those available,
+        // which this paper notes in the last paragraph on page 15).
+        //
+        // We resolve this by amending the condition on suffix "kan" to
+        // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
+        // behaviour match all the examples in the paper except for one:
+        // "perbaikan" is shown in table 3.4 as stemming to "bai", but with
+        // this change it now stems to "baik".  The table notes that "baik" is
+        // the actual root so this deviation is an improvement.  In a sample
+        // vocabulary derived from the most common words in id.wikipedia.org,
+        // this change only affects 0.12% of words (76 out of 64,587, including
+        // "peledakan" and "perbaikan").
+        $prefix != 3 and $prefix != 2
+    )
+
+    // prefix not in {di, meng, ter}
+    define SUFFIX_AN_OK as ( $prefix != 1 )
+
+    define SUFFIX_I_OK as (
+        // prefix not in {ke, peng, ber}
+        $prefix <= 2
+
+        // The rest of the condition from the paper is:
+        //   V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
+        //
+        // The meaning of this is unclear in several ways, and none of the
+        // examples given of the stemmer's behaviour in the paper help to
+        // resolve these issues.
+        //
+        // Notice that c₂ isn't actually used - the most obvious explanation
+        // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
+        //
+        // Elsewhere the paper defines V... as meaning "the stem starts with
+        // a vowel" and K... as meaning "the stem starts with a consonant".
+        //
+        // In other places where it says X|Y... it seems the | binds more
+        // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ).  That seems a bit
+        // odd as the first letter must be either a vowel or a consonant, so
+        // that really just means "ends cᵢcⱼ".  However, nowhere in the paper
+        // uses or defines a notation such as ...X, which may explain this
+        // seemingly redundant way of specifying this.
+        //
+        // The conditions elsewhere on prefix removal (e.g. V...) are clearly
+        // on the stem left after the prefix is removed.  None of the other
+        // rules for suffix removal have conditions on the stem, but for
+        // consistency with the prefix rules we might expect that the cᵢcⱼ
+        // test is on what's left *after* removing the "i" suffix.
+        //
+        // However, studying Indonesian wordlists and discussion with a native
+        // speaker leads us to conclude that the purpose of this check is to
+        // protect words of foreign origin (e.g. "televisi", "organisasi",
+        // "komunikasi") from stemming, and the common feature of these is
+        // that the word ends "-si", so we conclude that the condition here
+        // should be read as "word does not end -si", and this is what we
+        // have implemented.
+        not 's'
+    )
+
+    define remove_suffix as (
+        [substring] among (
+            'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK
+                (delete $measure-=1)
+        )
+    )
+)
+
+define vowel 'aeiou'
+
+define VOWEL as ( vowel )
+
+define KER as ( non-vowel 'er' )
+
+define remove_first_order_prefix as (
+    [substring] among (
+        'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1)
+        'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1)
+        'meny' VOWEL ($prefix=1 <-'s' $measure-=1)
+        'peny' VOWEL ($prefix=3 <-'s' $measure-=1)
+        'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete)
+        'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete)
+    )
+)
+
+define remove_second_order_prefix as (
+    // The paper has the condition on removal of prefix "bel" and "pel" as
+    // just "ajar" not "ajar..." but it seems that the latter must be what
+    // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar".
+    // This change only affects a very small number of words (11 out of
+    // 64,587) and only for the better.
+    [substring] among (
+        'per' 'pe' (delete $prefix=2 $measure-=1)
+        'pelajar' (<-'ajar' $measure-=1)
+        'ber' (delete $prefix=4 $measure-=1)
+        'belajar' (<-'ajar' $prefix=4 $measure-=1)
+        'be' KER (delete $prefix=4 $measure-=1)
+    )
+)
+
+define stem as (
+    $measure = 0
+    do ( repeat ( gopast vowel $measure+=1 ) )
+    $measure > 2
+    $prefix = 0
+    backwards (
+        do remove_particle
+        $measure > 2
+        do remove_possessive_pronoun
+    )
+    $measure > 2
+    test (
+        remove_first_order_prefix
+        do (
+            test ($measure > 2 backwards remove_suffix)
+            $measure > 2 remove_second_order_prefix
+        )
+    ) or (
+        do remove_second_order_prefix
+        do ($measure > 2 backwards remove_suffix)
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/indonesian/stemmer.tt b/algorithms/indonesian/stemmer.tt new file mode 100644 index 0000000..ffffe04 --- /dev/null +++ b/algorithms/indonesian/stemmer.tt @@ -0,0 +1,124 @@ +[% header('Indonesian stemming algorithm') %] + +

Links to resources

+ + + +

+This is an implementation of the "Porter Stemmer for Bahasa Indonesia" described +in: +

+ +
+ Tala F Z (2003) A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia. M.S. thesis, University of Amsterdam. +
+ +

+It would be more accurately described as "Porter-style" or "Porter-inspired"
+since Martin Porter wasn't directly involved in its development.

+ +

+Our implementation attempts to be faithful to the algorithm described in the +paper, but we have had to address some places in the paper which are unclear, +and a case where an example doesn't match the described algorithm. +

+ + + +

The full algorithm in Snowball

+ +[% highlight_file('indonesian') %] + +[% footer %] diff --git a/algorithms/indonesian/stop.txt b/algorithms/indonesian/stop.txt new file mode 100644 index 0000000..c433b01 --- /dev/null +++ b/algorithms/indonesian/stop.txt @@ -0,0 +1,91 @@ +yang | that +dan | and +di | in +dari | from +ini | this +pada kepada | at, to [person] +ada adalah | there is, is +dengan | with +untuk | for +dalam | in the +oleh | by +sebagai | as +juga | also, too +ke | to +atau | or +tidak | not +itu | that +sebuah | a +tersebut | the +dapat | can, may +ia | he/she, yes +telah | already +satu | one +memiliki | have +mereka | they +bahwa | that +lebih | more, more than +karena | because, since +seorang | one person, same +akan | will, about to +seperti | as, like +secara | on +kemudian | later, then +beberapa | some +banyak | many +antara | between +setelah | after +yaitu | that is +hanya | only +hingga | to +serta | along with +sama | same, and +dia | he/she/it (informal) +tetapi | but +namun | however +melalui | through +bisa | can +sehingga | so +ketika | when +suatu | a +sendiri | own (adverb) +bagi | for +semua | all +harus | must +setiap | each, every +maka | then +maupun | as well +tanpa | without +saja | only +jika | if +bukan | not +belum | not yet +sedangkan | while +yakni | i.e. 
+meskipun | although +hampir | almost +kita | we/us (inclusive) +demikian | thereby +daripada | from/than/instead of +apa | what/which/or/eh +ialah | is +sana | there +begitu | so +seseorang | someone +selain | besides +terlalu | too +ataupun | or +saya | me/I (formal) +bila | if/when +bagaimana | how +tapi | but +apabila | when/if +kalau | if +kami | we/us (exclusive) +melainkan | but (rather) +boleh | may,can +aku | I/me (informal) +anda | you (formal) +kamu | you (informal) +beliau | he/she/it (formal) +kalian | you (plural) diff --git a/algorithms/irish/stemmer.html b/algorithms/irish/stemmer.html new file mode 100644 index 0000000..664838e --- /dev/null +++ b/algorithms/irish/stemmer.html @@ -0,0 +1,457 @@ + + + + + + + + + + Irish Gaelic stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Irish Gaelic stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Irish vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+ábharú
+ábhbar
+abhcach
+abhchóide
+abhcóid
+abhcóide
+abhcóideacht
+abhcóidí
+abhcóidíocht
+abhcóidíochta
+abheadh
+ábhéile
+abheimídne
+abhfad
+abhfos
+abhfus
+abhhaile
+abhí
+abhlach
+abhlainn
+abhlainne
+abhlainneach
+abhlaireacht
+abhlann
+abhlóir
+abhlóird
+abhlóirí
+abhlóra
+abhna
+abhóg
+
+ábharú
+ábhbar
+abhcach
+abhchóide
+abhcóid
+abhcóide
+abhcóid
+abhcóidí
+abhcóid
+abhcóid
+abh
+ábhéile
+abheimídne
+abhfad
+abhfos
+abhfus
+abhhaile
+abhí
+abhlach
+abhlainn
+abhlainne
+abhlainn
+abhlair
+abhlann
+abhlóir
+abhlóird
+abhlóirí
+abhlóra
+abhna
+abhóg
+
+pábháil
+pábhaile
+pábhailí
+pábhaillí
+pábháilte
+pábhála
+pábhálaithe
+pabhar
+pabhsae
+pabhsaeir
+pabhsaer
+pabhsaetha
+paca
+páca
+pacaeirí
+pacaí
+pacáil
+pacáilte
+pacáiltear
+pacaire
+pacaireachta
+pacáiste
+pácáiste
+pacaistí
+pacáistí
+pacáistín
+pacáistíocht
+pacáistíochta
+pacáistítear
+pacáistithe
+
+pábh
+pábhaile
+pábhailí
+pábhaillí
+pábháilte
+pábhála
+pábhálaithe
+pabhar
+pabhsae
+pabhsaeir
+pabhsaer
+pabhsaetha
+paca
+páca
+pacaeirí
+pacaí
+pac
+pacáilte
+pacáil
+pacaire
+pacair
+pacáiste
+pácáiste
+pacaistí
+pacáistí
+pacáistín
+pacáist
+pacáist
+pacáistí
+pacáistithe
+
+
+ +

+This basic stemmer for Irish was developed and contributed by Jim
+O’Regan.

+ +

+One thing that should be taken into account with Irish is the initial +mutation (n-eclipsis and h-prothesis) which causes problems if words +are simply folded to lowercase before stemming in the way that is +usually assumed by Snowball stemmers. A Snowball version of an algorithm to +fold to lowercase while taking this into account would look something like: +

+ +
stringescapes {}
+
+stringdef A'   '{U+00C1}'
+stringdef E'   '{U+00C9}'
+stringdef I'   '{U+00CD}'
+stringdef O'   '{U+00D3}'
+stringdef U'   '{U+00DA}'
+stringdef a'   '{U+00E1}'
+stringdef e'   '{U+00E9}'
+stringdef i'   '{U+00ED}'
+stringdef o'   '{U+00F3}'
+stringdef u'   '{U+00FA}'
+
+define tolower_irish as (
+ [substring] among (
+   'nA' (<- 'n-a')
+   'nE' (<- 'n-e')
+   'nI' (<- 'n-i')
+   'nO' (<- 'n-o')
+   'nU' (<- 'n-u')
+   'n{A'}' (<- 'n-{a'}')
+   'n{E'}' (<- 'n-{e'}')
+   'n{I'}' (<- 'n-{i'}')
+   'n{O'}' (<- 'n-{o'}')
+   'n{U'}' (<- 'n-{u'}')
+
+   'tA' (<- 't-a')
+   'tE' (<- 't-e')
+   'tI' (<- 't-i')
+   'tO' (<- 't-o')
+   'tU' (<- 't-u')
+   't{A'}' (<- 't-{a'}')
+   't{E'}' (<- 't-{e'}')
+   't{I'}' (<- 't-{i'}')
+   't{O'}' (<- 't-{o'}')
+   't{U'}' (<- 't-{u'}')
+ )
+)
+
+ + +

+The following characters are vowels for the purposes of this algorithm: + +

+ a e i o u á é í ó ú
+ +

+The algorithm first addresses the initial mutation, then regions are determined +based on the word after this first step: +

+ +
    +
  • RV is the region after the first vowel, or the end of the word +if it contains no vowels.
  • +
  • R1 is the region after the first non-vowel following a vowel, or the +end of the word if there is no such non-vowel.
  • +
  • R2 is the region after the first non-vowel following a vowel in +R1, or the end of the word if there is no such non-vowel.
  • +
+ +

The full algorithm in Snowball

+ +
routines (
+  R1 R2 RV
+  initial_morph
+  mark_regions
+  noun_sfx
+  deriv
+  verb_sfx
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* Accented characters */
+
+stringdef a'   '{U+00E1}'  // a-acute
+stringdef e'   '{U+00E9}'  // e-acute
+stringdef i'   '{U+00ED}'  // i-acute
+stringdef o'   '{U+00F3}'  // o-acute
+stringdef u'   '{U+00FA}'  // u-acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit  // defaults
+
+    do (
+        gopast v setmark pV
+        gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define initial_morph as (
+  [substring] among (
+    'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
+    (delete)
+
+    // verbs
+    'd{'}'
+    (delete)
+    'd{'}fh'
+    (<- 'f')
+    // other contractions
+    'm{'}' 'b{'}'
+    (delete)
+
+    'sh'
+    (<- 's')
+
+    'mb'
+    (<- 'b')
+    'gc'
+    (<- 'c')
+    'nd'
+    (<- 'd')
+    'bhf'
+    (<- 'f')
+    'ng'
+    (<- 'g')
+    'bp'
+    (<- 'p')
+    'ts'
+    (<- 's')
+    'dt'
+    (<- 't')
+
+    // Lenition
+    'bh'
+    (<- 'b')
+    'ch'
+    (<- 'c')
+    'dh'
+    (<- 'd')
+    'fh'
+    (<- 'f')
+    'gh'
+    (<- 'g')
+    'mh'
+    (<- 'm')
+    'ph'
+    (<- 'p')
+    'th'
+    (<- 't')
+  )
+)
+
+backwardmode (
+
+  define RV as $pV <= cursor
+  define R1 as $p1 <= cursor
+  define R2 as $p2 <= cursor
+
+  define noun_sfx as (
+    [substring] among (
+      'amh' 'eamh' 'abh' 'eabh'
+      'aibh' 'ibh' 'aimh' 'imh'
+      'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
+      (R1 delete)
+      'ire' 'ir{i'}' 'aire' 'air{i'}'
+      (R2 delete)
+    )
+  )
+  define deriv as (
+    [substring] among (
+      'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
+      (R2 delete)  //siopadóireacht -> siopadóir but not poblacht -> pobl
+      'arcacht' 'arcachta{i'}' 'arcachta'
+      (<- 'arc') // monarcacht -> monarc
+      'gineach' 'gineas' 'ginis'
+      (<- 'gin')
+      'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
+      (<- 'graf')
+      'paite' 'patach' 'pataigh' 'patacha'
+      (<- 'paite')
+      '{o'}ideach' '{o'}ideacha' '{o'}idigh'
+      (<- '{o'}id')
+    )
+  )
+  define verb_sfx as (
+    [substring] among (
+      'imid' 'aimid' '{i'}mid' 'a{i'}mid'
+      'faidh' 'fidh'
+      (RV delete)
+      'ain'
+      'eadh' 'adh'
+      '{a'}il'
+      'tear' 'tar'
+      (R1 delete)
+    )
+  )
+)
+
+define stem as (
+  do initial_morph
+  do mark_regions
+  backwards (
+    do noun_sfx
+    do deriv
+    do verb_sfx
+  )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/irish/stemmer.tt b/algorithms/irish/stemmer.tt new file mode 100644 index 0000000..1293bc2 --- /dev/null +++ b/algorithms/irish/stemmer.tt @@ -0,0 +1,52 @@ +[% header('Irish Gaelic stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([30, 'ábharú', 'pábháil']) %] + +

+This basic stemmer for Irish was developed and contributed by Jim
+O’Regan.

+ +

+One thing that should be taken into account with Irish is the initial +mutation (n-eclipsis and h-prothesis) which causes problems if words +are simply folded to lowercase before stemming in the way that is +usually assumed by Snowball stemmers. A Snowball version of an algorithm to +fold to lowercase while taking this into account would look something like: +

+ +[% highlight_file('tolower_irish') %] + +

+The following characters are vowels for the purposes of this algorithm: + +

+ a e i o u á é í ó ú
+ +

+The algorithm first addresses the initial mutation, then regions are determined +based on the word after this first step: +

+ + + +

The full algorithm in Snowball

+ +[% highlight_file('irish') %] + +[% footer %] diff --git a/algorithms/irish/stop.txt b/algorithms/irish/stop.txt new file mode 100644 index 0000000..9ff88d7 --- /dev/null +++ b/algorithms/irish/stop.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/algorithms/italian/stemmer.html b/algorithms/italian/stemmer.html new file mode 100644 index 0000000..1751089 --- /dev/null +++ b/algorithms/italian/stemmer.html @@ -0,0 +1,642 @@ + + + + + + + + + + Italian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Italian stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Italian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+abbandonata
+abbandonate
+abbandonati
+abbandonato
+abbandonava
+abbandonerà
+abbandoneranno
+abbandonerò
+abbandono
+abbandonò
+abbaruffato
+abbassamento
+abbassando
+abbassandola
+abbassandole
+abbassar
+abbassare
+abbassarono
+abbassarsi
+abbassassero
+abbassato
+abbassava
+abbassi
+abbassò
+abbastanza
+abbatté
+abbattendo
+abbattere
+abbattersi
+abbattesse
+abbatteva
+abbattevamo
+abbattevano
+abbattimento
+abbattuta
+abbattuti
+abbattuto
+abbellita
+abbenché
+abbi
+
+abbandon
+abbandon
+abbandon
+abbandon
+abbandon
+abbandon
+abbandon
+abbandon
+abband
+abbandon
+abbaruff
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbass
+abbast
+abbatt
+abbatt
+abbatt
+abbatt
+abbattess
+abbatt
+abbatt
+abbatt
+abbatt
+abbatt
+abbatt
+abbatt
+abbell
+abbenc
+abbi
+
+pronto
+pronuncerà
+pronuncia
+pronunciamento
+pronunciare
+pronunciarsi
+pronunciata
+pronunciate
+pronunciato
+pronunzia
+pronunziano
+pronunziare
+pronunziarle
+pronunziato
+pronunzio
+pronunziò
+propaga
+propagamento
+propaganda
+propagare
+propagarla
+propagarsi
+propagasse
+propagata
+propagazione
+propaghino
+propalate
+propende
+propensi
+propensione
+propini
+propio
+propizio
+propone
+proponendo
+proponendosi
+proponenti
+proponeva
+proponevano
+proponga
+
+pront
+pronunc
+pronunc
+pronunc
+pronunc
+pronunc
+pronunc
+pronunc
+pronunc
+pronunz
+pronunz
+pronunz
+pronunz
+pronunz
+pronunz
+pronunz
+propag
+propag
+propagand
+propag
+propag
+propag
+propag
+propag
+propag
+propaghin
+propal
+prop
+propens
+propension
+propin
+prop
+propiz
+propon
+propon
+propon
+proponent
+propon
+propon
+propong
+
+
+ +

The stemming algorithm

+ +

+Italian can include the following accented forms: +

+ +
+ á &nbsp; é &nbsp; í &nbsp; ó &nbsp; ú &nbsp; à &nbsp; è &nbsp; ì &nbsp; ò &nbsp; ù
+ +

+First, replace all acute accents by grave accents. And, as in French, put u after +q, and u, i between vowels into upper case. +(See note on vowel marking.) +

+ +

+The vowels are then +

+ +
+ a &nbsp; e &nbsp; i &nbsp; o &nbsp; u &nbsp; à &nbsp; è &nbsp; ì &nbsp; ò &nbsp; ù
+ +

+R2 +(see the note on R1 and R2) +and RV have the same definition as in the + Spanish stemmer. +

+ +

+First exceptional cases are checked for. These need to match the whole word, and currently are: +

+ +
    +
  • divano: replace with divan (to avoid conflating with diva) [Added 2022-11-16] +
+ +

+If found then handle as described and that's it. +

+ +

+Otherwise always do steps 0 and 1. +

+ +

+Step 0: Attached pronoun +

+ +
+ Search for the longest among the following suffixes +
+ ci   gli   la   le   li   lo   mi   ne   si   ti   vi +   sene   gliela   gliele   glieli   glielo   gliene +   mela   mele   meli   melo   mene +   tela   tele   teli   telo   tene +   cela   cele   celi   celo   cene +   vela   vele   veli   velo   vene +
+

+ following one of +

+
+ (a) ando   endo
+ (b) ar   er   ir +
+

+ in RV. In case of (a) the suffix is deleted, in case (b) it is replaced
+ by e (guardandogli → guardando, accomodarci → accomodare)

+
+ +

+Step 1: Standard suffix removal +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
anza   anze   ico   ici   ica   ice   iche   ichi   ismo   ismi   abile   abili   ibile   ibili +   ista   iste   isti   istà   istè   istì   oso   osi   osa   ose   mente +   atrice   atrici   ante   anti +
delete if in R2 +
azione   azioni   atore   atori + delete if in R2 +
if preceded by ic, delete if in R2 +
logia   logie +
replace with log if in R2 +
uzione   uzioni   usione   usioni +
replace with u if in R2 +
enza   enze +
replace with ente if in R2 +
amento   amenti   imento   imenti +
delete if in RV +
amente +
delete if in R1 +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by os, ic or abil, delete if in R2 +
ità +
delete if in R2 +
if preceded by abil, ic or iv, delete if in R2 +
ivo   ivi   iva   ive +
delete if in R2 +
if preceded by at, delete if in R2 (and if further preceded by ic, + delete if in R2) +
+
+ +

+Do step 2 if no ending was removed by step 1. +

+ +

+Step 2: Verb suffixes +

+ +
+ Search for the longest among the following suffixes in RV, and if found, + delete. +
+ ammo   ando   ano   are   arono   + asse   assero   assi   assimo   ata   ate   + ati   ato   ava   avamo   avano   avate   avi   avo   emmo +   enda   ende   endi   endo   erà   erai   eranno   ere +   erebbe   erebbero   erei   eremmo   eremo   ereste   + eresti   erete   erò   erono   essero   ete   eva   evamo +   evano   evate   evi   evo   Yamo   iamo   immo   irà +   irai   iranno   ire   irebbe   irebbero   irei   iremmo +   iremo   ireste   iresti   irete   irò   irono   isca   + iscano   isce   isci   isco   iscono   issero   ita   ite +   iti   ito   iva   ivamo   ivano   ivate   ivi   ivo   + ono   uta   ute   uti   uto   ar   ir +
+ +

+Always do steps 3a and 3b. +

+ +
+ +

+Step 3a +

+ +
+ Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a
+ preceding i if it is in RV (crocchi → crocch, crocchio → crocch)
+ +

+Step 3b +

+ +
+ Replace final ch (or gh) with c (or g) if in RV (crocch → crocc)
+ +

+Finally, +

+ +
+ turn I and U back into lower case +
+ +

The same algorithm in Snowball

+ +
routines (
+           exceptions
+           prelude postlude mark_regions
+           RV R1 R2
+           attached_pronoun
+           standard_suffix
+           verb_suffix
+           vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v AEIO CG )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a'   '{U+00E1}'
+stringdef a`   '{U+00E0}'
+stringdef e'   '{U+00E9}'
+stringdef e`   '{U+00E8}'
+stringdef i'   '{U+00ED}'
+stringdef i`   '{U+00EC}'
+stringdef o'   '{U+00F3}'
+stringdef o`   '{U+00F2}'
+stringdef u'   '{U+00FA}'
+stringdef u`   '{U+00F9}'
+
+define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
+
+define prelude as (
+    test repeat (
+        [substring] among(
+            '{a'}' (<- '{a`}')
+            '{e'}' (<- '{e`}')
+            '{i'}' (<- '{i`}')
+            '{o'}' (<- '{o`}')
+            '{u'}' (<- '{u`}')
+            'qu'   (<- 'qU')
+            ''     (next)
+        )
+    )
+    repeat goto (
+        v [ ('u' ] v <- 'U') or
+            ('i' ] v <- 'I')
+    )
+)
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit // defaults
+
+    do (
+        ( v (non-v gopast v) or (v gopast non-v) )
+        or
+        ( non-v (non-v gopast v) or (v next) )
+        setmark pV
+    )
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define postlude as repeat (
+
+    [substring] among(
+        'I'  (<- 'i')
+        'U'  (<- 'u')
+        ''   (next)
+    )
+
+)
+
+backwardmode (
+
+    define RV as $pV <= cursor
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define attached_pronoun as (
+        [substring] among(
+            'ci' 'gli' 'la' 'le' 'li' 'lo'
+            'mi' 'ne' 'si'  'ti' 'vi'
+            // the compound forms are:
+            'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
+            'mela' 'mele' 'meli' 'melo' 'mene'
+            'tela' 'tele' 'teli' 'telo' 'tene'
+            'cela' 'cele' 'celi' 'celo' 'cene'
+            'vela' 'vele' 'veli' 'velo' 'vene'
+        )
+        among( (RV)
+            'ando' 'endo'   (delete)
+            'ar' 'er' 'ir'  (<- 'e')
+        )
+    )
+
+    define standard_suffix as (
+        [substring] among(
+
+            'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
+            'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
+            'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
+            'atrice' 'atrici'
+            'ante' 'anti' // Note 1
+               ( R2 delete )
+            'azione' 'azioni' 'atore' 'atori'
+               ( R2 delete
+                 try ( ['ic'] R2 delete )
+               )
+            'logia' 'logie'
+               ( R2 <- 'log' )
+            'uzione' 'uzioni' 'usione' 'usioni'
+               ( R2 <- 'u' )
+            'enza' 'enze'
+               ( R2 <- 'ente' )
+            'amento' 'amenti' 'imento' 'imenti'
+               ( RV delete )
+            'amente' (
+                R1 delete
+                try (
+                    [substring] R2 delete among(
+                        'iv' ( ['at'] R2 delete )
+                        'os' 'ic' 'abil'
+                    )
+                )
+            )
+            'it{a`}' (
+                R2 delete
+                try (
+                    [substring] among(
+                        'abil' 'ic' 'iv' (R2 delete)
+                    )
+                )
+            )
+            'ivo' 'ivi' 'iva' 'ive' (
+                R2 delete
+                try ( ['at'] R2 delete ['ic'] R2 delete )
+            )
+        )
+    )
+
+    define verb_suffix as setlimit tomark pV for (
+        [substring] among(
+            'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
+            'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
+            'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
+            'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
+            'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
+            'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
+            'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
+            'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
+            'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
+            'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
+            'ono' 'uta' 'ute' 'uti' 'uto'
+
+            'ar' 'ir' // but 'er' is problematical
+                (delete)
+        )
+    )
+
+    define AEIO 'aeio{a`}{e`}{i`}{o`}'
+    define CG 'cg'
+
+    define vowel_suffix as (
+        try (
+            [AEIO] RV delete
+            ['i'] RV delete
+        )
+        try (
+            ['h'] CG RV delete
+        )
+    )
+)
+
+define exceptions as (
+    ['divano' atlimit ] <- 'divan' // Otherwise "divano" stems to "div" and collides with "diva"
+)
+
+define stem as (
+    exceptions or (
+        do prelude
+        do mark_regions
+        backwards (
+            do attached_pronoun
+            do (standard_suffix or verb_suffix)
+            do vowel_suffix
+        )
+        do postlude
+    )
+)
+
+/*
+    Note 1: additions of 15 Jun 2005
+*/
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/italian/stemmer.tt b/algorithms/italian/stemmer.tt new file mode 100644 index 0000000..de88f5c --- /dev/null +++ b/algorithms/italian/stemmer.tt @@ -0,0 +1,185 @@ +[% header('Italian stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'abbandonata', 'pronto']) %] + +

The stemming algorithm

+ +

+Italian can include the following accented forms: +

+ +
+ á &nbsp; é &nbsp; í &nbsp; ó &nbsp; ú &nbsp; à &nbsp; è &nbsp; ì &nbsp; ò &nbsp; ù
+ +

+First, replace all acute accents by grave accents. And, as in French, put u after +q, and u, i between vowels into upper case. +(See note on vowel marking.) +

+ +

+The vowels are then +

+ +
+ a &nbsp; e &nbsp; i &nbsp; o &nbsp; u &nbsp; à &nbsp; è &nbsp; ì &nbsp; ò &nbsp; ù
+ +

+R2 +(see the note on R1 and R2) +and RV have the same definition as in the + Spanish stemmer. +

+ +

+First exceptional cases are checked for. These need to match the whole word, and currently are: +

+ + + +

+If found then handle as described and that's it. +

+ +

+Otherwise always do steps 0 and 1. +

+ +

+Step 0: Attached pronoun +

+ +
+ Search for the longest among the following suffixes +
+ ci   gli   la   le   li   lo   mi   ne   si   ti   vi +   sene   gliela   gliele   glieli   glielo   gliene +   mela   mele   meli   melo   mene +   tela   tele   teli   telo   tene +   cela   cele   celi   celo   cene +   vela   vele   veli   velo   vene +
+

+ following one of +

+
+ (a) ando   endo
+ (b) ar   er   ir +
+

+ in RV. In case of (a) the suffix is deleted, in case (b) it is replaced
+ by e (guardandogli → guardando, accomodarci → accomodare)

+
+ +

+Step 1: Standard suffix removal +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
anza   anze   ico   ici   ica   ice   iche   ichi   ismo   ismi   abile   abili   ibile   ibili +   ista   iste   isti   istà   istè   istì   oso   osi   osa   ose   mente +   atrice   atrici   ante   anti +
delete if in R2 +
azione   azioni   atore   atori + delete if in R2 +
if preceded by ic, delete if in R2 +
logia   logie +
replace with log if in R2 +
uzione   uzioni   usione   usioni +
replace with u if in R2 +
enza   enze +
replace with ente if in R2 +
amento   amenti   imento   imenti +
delete if in RV +
amente +
delete if in R1 +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by os, ic or abil, delete if in R2 +
ità +
delete if in R2 +
if preceded by abil, ic or iv, delete if in R2 +
ivo   ivi   iva   ive +
delete if in R2 +
if preceded by at, delete if in R2 (and if further preceded by ic, + delete if in R2) +
+
+ +

+Do step 2 if no ending was removed by step 1. +

+ +

+Step 2: Verb suffixes +

+ +
+ Search for the longest among the following suffixes in RV, and if found, + delete. +
+ ammo   ando   ano   are   arono   + asse   assero   assi   assimo   ata   ate   + ati   ato   ava   avamo   avano   avate   avi   avo   emmo +   enda   ende   endi   endo   erà   erai   eranno   ere +   erebbe   erebbero   erei   eremmo   eremo   ereste   + eresti   erete   erò   erono   essero   ete   eva   evamo +   evano   evate   evi   evo   Yamo   iamo   immo   irà +   irai   iranno   ire   irebbe   irebbero   irei   iremmo +   iremo   ireste   iresti   irete   irò   irono   isca   + iscano   isce   isci   isco   iscono   issero   ita   ite +   iti   ito   iva   ivamo   ivano   ivate   ivi   ivo   + ono   uta   ute   uti   uto   ar   ir +
+ +

+Always do steps 3a and 3b. +

+ +
+ +

+Step 3a +

+ +
+ Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a + preceding i if it is in RV (crocchicrocch, crocchiocrocch) +
+ +

+Step 3b +

+ +
+ Replace final ch (or gh) with c (or g) if in RV (crocchcrocc) +
+ +

+Finally, +

+ +
+ turn I and U back into lower case +
+ +

The same algorithm in Snowball

+ +[% highlight_file('italian') %] + +[% footer %] diff --git a/algorithms/italian/stop.txt b/algorithms/italian/stop.txt new file mode 100644 index 0000000..a20bb95 --- /dev/null +++ b/algorithms/italian/stop.txt @@ -0,0 +1,295 @@ + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all 
+tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/algorithms/kraaij_pohlmann/kraij-pohlmann-uplift-dutch-stemmer.zip b/algorithms/kraaij_pohlmann/kraij-pohlmann-uplift-dutch-stemmer.zip new file mode 100644 index 0000000..db38905 Binary files /dev/null and b/algorithms/kraaij_pohlmann/kraij-pohlmann-uplift-dutch-stemmer.zip differ diff --git a/algorithms/kraaij_pohlmann/stemmer.html b/algorithms/kraaij_pohlmann/stemmer.html new file mode 100644 index 0000000..17ca0d0 
--- /dev/null +++ b/algorithms/kraaij_pohlmann/stemmer.html @@ -0,0 +1,376 @@ + + + + + + + + + + The Kraaij-Pohlmann stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

The Kraaij-Pohlmann stemming algorithm

+ + +

Links to resources

+ + + +

+The Kraaij-Pohlmann stemming algorithm is an ANSI C program for stemming in Dutch. Although +advertised as an algorithm, it is in fact a program without an accompanying +algorithmic description. It is possible to produce a fairly clean Snowball +version, but only by sacrificing exact functional equivalence. But that does not +matter too much, since in the demonstration vocabulary only 32 words out of over +45,000 stem differently. Here they are: +

+ +
+
+source ANSI C stemmer Snowball stemmer +
airways airways airway +
algerije algerije alrije +
assays assays assay +
bruys bruys bruy +
cleanaways cleanaways cleanaway +
creys creys crey +
croyden croyd croy +
edele edel edeel +
essays essays essay +
gedijen gedij dij +
geoff of off +
gevrey gevrey vrey +
geysels ysel gey +
grootmeesteres grootmee grootmeest +
gròotmeesteres gròotmee gròotmeest +
hectares hectaar hect +
huys huys huy +
kayen kayen kaay +
lagerwey lagerwey larwey +
mayen mayen maay +
meesteres meester meest +
oppasseres oppasser oppas +
pays pays pay +
royale royale royaal +
schilderes schilder schild +
summerhayes summerhayes summerhaye +
tyumen tyuum tyum +
verheyen verheyen verheey +
verleideres verleider verleid +
ytsen yts ytsen +
yves yve yves +
zangeres zanger zang +
+ +

+The Kraaij-Pohlmann stemmer can make fairly drastic reductions to a word. For +example, infixed ge is removed, so geluidgevoelige stems to +luidvoel. Often, therefore, the original word cannot be easily guessed from +the stemmed form. +

+ +

+Here then is the Snowball equivalent of the Kraaij-Pohlmann algorithm. +

+ +
strings ( ch )
+integers ( p1 p2 )
+booleans ( Y_found stemmed GE_removed )
+
+routines (
+
+   R1 R2
+   C V VX
+   lengthen_V
+   Step_1 Step_2 Step_3 Step_4 Step_7
+   Step_6 Step_1c
+   Lose_prefix
+   Lose_infix
+   measure
+)
+
+externals ( stem )
+
+groupings ( v v_WX AOU AIOU )
+
+stringescapes {}
+
+define v        'aeiouy'
+define v_WX     v + 'wx'
+define AOU      'aou'
+define AIOU     'aiou'
+
+backwardmode (
+
+    define R1 as ($p1 <= cursor)
+    define R2 as ($p2 <= cursor)
+
+    define V  as test (v or 'ij')
+    define VX as test (next v or 'ij')
+    define C  as test (not 'ij' non-v)
+
+    define lengthen_V as do (
+        non-v_WX [ (AOU] test (non-v or atlimit)) or
+                   ('e'] test (non-v or atlimit
+                               not AIOU
+                               not (next AIOU non-v)))
+        ->ch insert ch
+    )
+
+    define Step_1 as
+    (
+        [substring] among (
+
+            '{'}s' (delete)
+            's'    (R1 not ('t' R1) C delete)
+            'ies'  (R1 <-'ie')
+            'es'
+                   (('ar' R1 C ] delete lengthen_V) or
+                    ('er' R1 C ] delete) or
+                    (R1 C <-'e'))
+
+            'aus'  (R1 V <-'au')
+            'en'   (('hed' R1 ] <-'heid') or
+                    ('nd' delete) or
+                    ('d' R1 C ] delete) or
+                    ('i' or 'j' V delete) or
+                    (R1 C delete lengthen_V))
+            'nde'  (<-'nd')
+        )
+    )
+
+    define Step_2 as
+    (
+        [substring] among (
+            'je'   (('{'}t' ] delete) or
+                    ('et'   ] R1 C delete) or
+                    ('rnt'  ] <-'rn') or
+                    ('t'    ] R1 VX delete) or
+                    ('ink'  ] <-'ing') or
+                    ('mp'   ] <-'m') or
+                    ('{'}'  ] R1 delete) or
+                    (] R1 C delete))
+            'ge'   (R1 <-'g')
+            'lijke'(R1 <-'lijk')
+            'ische'(R1 <-'isch')
+            'de'   (R1 C delete)
+            'te'   (R1 <-'t')
+            'se'   (R1 <-'s')
+            're'   (R1 <-'r')
+            'le'   (R1 delete attach 'l' lengthen_V)
+            'ene'  (R1 C delete attach 'en' lengthen_V)
+            'ieve' (R1 C <-'ief')
+        )
+    )
+
+    define Step_3 as
+    (
+        [substring] among (
+            'atie'  (R1 <-'eer')
+            'iteit' (R1 delete lengthen_V)
+            'heid'
+            'sel'
+            'ster'  (R1 delete)
+            'rder'  (<-'r')
+            'ing'
+            'isme'
+            'erij'  (R1 delete lengthen_V)
+            'arij'  (R1 C <-'aar')
+            'fie'   (R2 delete attach 'f' lengthen_V)
+            'gie'   (R2 delete attach 'g' lengthen_V)
+            'tst'   (R1 C <-'t')
+            'dst'   (R1 C <-'d')
+        )
+    )
+
+    define Step_4 as
+    (
+        (   [substring] among (
+                'ioneel'  (R1 <-'ie')
+                'atief'   (R1 <-'eer')
+                'baar'    (R1 delete)
+                'naar'    (R1 V <-'n')
+                'laar'    (R1 V <-'l')
+                'raar'    (R1 V <-'r')
+                'tant'    (R1 <-'teer')
+                'lijker'
+                'lijkst'  (R1 <-'lijk')
+                'achtig'
+                'achtiger'
+                'achtigst'(R1 delete)
+                'eriger'
+                'erigst'
+                'erig'
+                'end'     (R1 C delete lengthen_V)
+            )
+        )
+        or
+        (   [substring] among (
+                'iger'
+                'igst'
+                'ig'      (R1 C delete lengthen_V)
+            )
+        )
+    )
+
+    define Step_7 as
+    (
+        [substring] among (
+            'kt'   (<-'k')
+            'ft'   (<-'f')
+            'pt'   (<-'p')
+        )
+    )
+
+    define Step_6 as
+    (
+        [substring] among (
+            'bb'   (<-'b')
+            'cc'   (<-'c')
+            'dd'   (<-'d')
+            'ff'   (<-'f')
+            'gg'   (<-'g')
+            'hh'   (<-'h')
+            'jj'   (<-'j')
+            'kk'   (<-'k')
+            'll'   (<-'l')
+            'mm'   (<-'m')
+            'nn'   (<-'n')
+            'pp'   (<-'p')
+            'qq'   (<-'q')
+            'rr'   (<-'r')
+            'ss'   (<-'s')
+            'tt'   (<-'t')
+            'vv'   (<-'v')
+            'ww'   (<-'w')
+            'xx'   (<-'x')
+            'zz'   (<-'z')
+            'v'    (<-'f')
+            'z'    (<-'s')
+        )
+    )
+
+    define Step_1c as
+    (
+        [substring] among ( (R1 C)
+            'd' (not ('n' R1) delete)
+            't' (not ('h' R1) delete)
+        )
+    )
+)
+
+define Lose_prefix as (
+    ['ge'] test hop 3 (goto v goto non-v)
+    set GE_removed
+    delete
+)
+
+define Lose_infix as (
+    next
+    gopast (['ge']) test hop 3 (goto v goto non-v)
+    set GE_removed
+    delete
+)
+
+define measure as (
+    $p1 = limit
+    $p2 = limit
+    do(
+        repeat non-v  atleast 1 ('ij' or v)  non-v  setmark p1
+        repeat non-v  atleast 1 ('ij' or v)  non-v  setmark p2
+    )
+
+)
+define stem as (
+
+    unset Y_found
+    unset stemmed
+    do ( ['y'] <-'Y' set Y_found )
+    do repeat(goto (v  ['y'])<-'Y' set Y_found )
+
+    measure
+
+    backwards (
+            do (Step_1 set stemmed )
+            do (Step_2 set stemmed )
+            do (Step_3 set stemmed )
+            do (Step_4 set stemmed )
+    )
+    unset GE_removed
+    do (Lose_prefix and measure)
+    backwards (
+            do (GE_removed Step_1c)
+        )
+    unset GE_removed
+    do (Lose_infix and measure)
+    backwards (
+            do (GE_removed Step_1c)
+        )
+    backwards (
+            do (Step_7 set stemmed )
+            do (stemmed or GE_removed Step_6)
+        )
+    do(Y_found  repeat(goto (['Y']) <-'y'))
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/kraaij_pohlmann/stemmer.tt b/algorithms/kraaij_pohlmann/stemmer.tt new file mode 100644 index 0000000..4fad378 --- /dev/null +++ b/algorithms/kraaij_pohlmann/stemmer.tt @@ -0,0 +1,69 @@ +[% header('The Kraaij-Pohlmann stemming algorithm') %] + +

Links to resources

+ + + +

+The Kraaij-Pohlmann stemming algorithm is an ANSI C program for stemming in Dutch. Although +advertised as an algorithm, it is in fact a program without an accompanying +algorithmic description. It is possible to produce a fairly clean Snowball +version, but only by sacrificing exact functional equivalence. But that does not +matter too much, since in the demonstration vocabulary only 32 words out of over +45,000 stem differently. Here they are: +

+ +
+
+source ANSI C stemmer Snowball stemmer +
airways airways airway +
algerije algerije alrije +
assays assays assay +
bruys bruys bruy +
cleanaways cleanaways cleanaway +
creys creys crey +
croyden croyd croy +
edele edel edeel +
essays essays essay +
gedijen gedij dij +
geoff of off +
gevrey gevrey vrey +
geysels ysel gey +
grootmeesteres grootmee grootmeest +
gròotmeesteres gròotmee gròotmeest +
hectares hectaar hect +
huys huys huy +
kayen kayen kaay +
lagerwey lagerwey larwey +
mayen mayen maay +
meesteres meester meest +
oppasseres oppasser oppas +
pays pays pay +
royale royale royaal +
schilderes schilder schild +
summerhayes summerhayes summerhaye +
tyumen tyuum tyum +
verheyen verheyen verheey +
verleideres verleider verleid +
ytsen yts ytsen +
yves yve yves +
zangeres zanger zang +
+ +

+The Kraaij-Pohlmann stemmer can make fairly drastic reductions to a word. For +example, infixed ge is removed, so geluidgevoelige stems to +luidvoel. Often, therefore, the original word cannot be easily guessed from +the stemmed form. +

+ +

+Here then is the Snowball equivalent of the Kraaij-Pohlmann algorithm. +

+ +[% highlight_file('kraaij_pohlmann') %] + +[% footer %] diff --git a/algorithms/lithuanian/stemmer.html b/algorithms/lithuanian/stemmer.html new file mode 100644 index 0000000..58aeb9e --- /dev/null +++ b/algorithms/lithuanian/stemmer.html @@ -0,0 +1,462 @@ + + + + + + + + + + Lithuanian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Lithuanian stemming algorithm

+ + +

Links to resources

+ + + +

+This algorithm was contributed by Dainius Jocas. +

+ +

+Its intended domain of use is information retrieval, and so handling of nouns +is considered more important than that of verbs, adjectives, etc. +

+ +

The full algorithm in Snowball

+ +
externals ( stem )
+
+// escape symbols for substituting lithuanian characters
+stringescapes { }
+
+/* Special characters in Unicode Latin Extended-A */
+// ' nosine
+stringdef a'   '{U+0105}'  // ą a + ogonek
+stringdef e'   '{U+0119}'  // ę e + ogonek
+stringdef i'   '{U+012F}'  // į i + ogonek
+stringdef u'   '{U+0173}'  // ų u + ogonek
+
+// . taskas
+stringdef e.   '{U+0117}'  // ė e + dot
+
+// - ilgoji
+stringdef u-   '{U+016B}'  // ū u + macron
+
+// * varnele
+stringdef c*   '{U+010D}'  // č c + caron (haček)
+stringdef s*   '{U+0161}'  // š s + caron (haček)
+stringdef z*   '{U+017E}'  // ž z + caron (haček)
+
+// [C](VC)^m[V|C]
+// definitions of variables for
+// p1 - position of m = 0
+integers ( p1 )
+
+// groupings
+// v - lithuanian vowels
+groupings ( v )
+
+// v - all lithuanian vowels
+define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}'
+
+// all lithuanian stemmer routines: 4 steps
+routines (
+  step2 step1 fix_chdz fix_gd fix_conflicts
+)
+
+backwardmode (
+
+  define step1 as (
+    setlimit tomark p1 for ([substring]) among (
+      // Daiktavardžiai (Nouns)
+      // I linksniuotė (declension I)
+      'as' 'ias' 'is' 'ys'        // vyras, kelias, brolis, gaidys
+      'o'    'io'                 // vyro, kelio
+      'ui'   'iui'                // vyrui, keliui
+      '{a'}' 'i{a'}' '{i'}'       // vyrą, kelią, brolį
+      'u'    'iu'                 // vyru, keliu
+      'e'    'yje'                // vyre, kelyje
+      'y'    'au'    'i'          // kely, brolau, broli,
+      'an'                        // nusižengiman
+
+      'ai'   'iai'                // vyrai, keliai
+      '{u'}' 'i{u'}'              // vyrų, kelių
+      'ams'  'am'                 // vyrams, vyram
+      'iams' 'iam'                // broliams, broliam
+      'us'   'ius'                // vyrus, brolius
+      'ais'  'iais'               // vyrais, keliais
+      'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos
+      'uosna' 'iuosna'            // vyruosna, keliuosna
+      'ysna'                      // žutysna
+
+      'asis' 'aisi'               // sukimasis, sukimaisi
+      'osi'  '{u'}si'             // sukimosi, sukimųsi
+      'uisi'                      // sukimuisi
+      '{a'}si'                    // sukimąsi
+      'usi'                       // sukimusi
+      'esi'                       // sukimesi
+
+      'uo'                        // mėnuo
+
+
+      // II linksniuote (declension II)
+      'a'  'ia'                   // galva, vysnios
+      'os' 'ios'                  // galvos, vysnios
+      'oj' 'oje' 'ioje'           // galvoje, vysnioje
+      'osna' 'iosna'              // galvosna, vyšniosna
+      'om' 'oms' 'ioms'           // galvoms, vysnioms
+      'omis' 'iomis'              // galvomis, vysniomis
+      'ose' 'iose'                // galvose, vysniose
+      'on' 'ion'                  // galvon, vyšnion
+
+
+      // III linksniuote (declension III)
+      '{e.}'                      // gervė
+      '{e.}s'                     // gervės
+      'ei'                        // gervei
+      '{e'}'                      // gervę
+      '{e.}j' '{e.}je'            // gervėj, gervėje
+      '{e.}ms'                    // gervėms
+      'es'                        // gerves
+      '{e.}mis'                   // gervėmis
+      '{e.}se'                    // gervėse
+      '{e.}sna'                   // gervėsna
+      '{e.}n'                     // žydaitėn
+
+
+      // IV linksniuote (declension IV)
+      'aus' 'iaus'                // sūnaus, skaičiaus
+      'umi' 'iumi'                // sūnumi, skaičiumi
+      'uje' 'iuje'                // sūnuje, skaičiuje
+      'iau'                       // skaičiau
+
+      '{u-}s'                     // sūnūs
+      'ums'                       // sūnums
+      'umis'                      // sūnumis
+      'un' 'iun'                  // sūnun, administratoriun
+
+
+      // V linksniuote (declension V)
+      'ies' 'ens' 'enio' 'ers'    // avies, vandens, sesers
+      'eniui' 'eriai'             // vandeniui, eriai
+      'en{i'}' 'er{i'}'           // vandenį, seserį
+      'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria
+      'enyje' 'eryje'             // vandenyje, seseryje
+      'ie' 'enie' 'erie'          // avie, vandenie, seserie
+
+      'enys' 'erys'               // vandenys, seserys
+      // 'en{u'}' konfliktas su 'žandenų' 'antenų'
+      'er{u'}'                    // seserų
+      'ims' 'enims' 'erims'       // avims, vandenims, seserims
+      'enis'                      // vandenis
+      'imis'                      // žebenkštimis
+      'enimis'                    // vandenimis
+      'yse' 'enyse' 'eryse'       // avyse, vandenyse, seseryse
+
+
+      // Būdvardžiai (Adjectives)
+      // (i)a linksniuotė
+      'iem' 'iems'                // geriem, geriems
+      'ame' 'iame'                // naujame, mediniame
+
+
+      // Veiksmažodžiai (Verbs)
+      // Tiesioginė nuosaka (indicative mood)
+      // esamasis laikas (present tense)
+      // (i)a asmenuotė (declension (i)a)
+      'uosi' 'iuosi'              // dirbuosi, traukiuosi
+      'iesi'                      // dirbiesi
+      'asi' 'iasi'                // dirbasi, traukiasi
+      'am{e.}s' 'iam{e.}s'        // dirbamės, traukiamės
+      'at' 'ate' 'iat' 'iate'     // dirbat, dirbate, ariat, traukiate
+      'at{e.}s' 'iat{e.}s'        // dirbatės, traukiatės
+
+      // i asmenuotė (declension i)
+      'isi'                       // tikisi
+      'im'                        // mylim
+      // 'ime' konfliktas su daiktavardžiu vietininku, pvz. 'gėrime'
+      'im{e.}s'                   // tikimės
+      'it' 'ite'                  // mylit, mylite, tikitės
+      // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. žydaitės
+
+      // o asmenuotė (declension o)
+      'ome'                       // mokome
+      'ot' 'ote'                  // mokot, mokote
+
+      // būtasis laikas
+      // o asmenuotė (declension o)
+      '{e.}jo' '{e.}josi'         // tikėjo, tikėjosi
+      'ot{e.}s'                   // tikėjotės/bijotės
+
+      // ė asmenuotė (declension ė)
+      'eisi'                      // mokeisi
+      '{e.}si'                    // mokėsi
+      '{e.}m' '{e.}me'            // mokėm, mokėme
+      '{e.}m{e.}s'                // mokėmės
+      '{e.}t' '{e.}te'            // mokėt, mokėte
+      '{e.}t{e.}s'                // mokėtės
+
+      // būtasis dažninis laikas (frequentative past tense)
+      'ausi'                      // mokydavausi
+      'om{e.}s'                   // mokydavomės/bijomės
+
+
+      // būsimasis laikas (future tense)
+      'siu' 'siuosi'              // dirbsiu, mokysiuosi
+      'si' 'siesi'                // dirbsi, dirbsiesi
+      's' 'ysis'                  // dirbs, mokysis
+      'sim' 'sime'                // dirbsim, dirbsime
+      'sit' 'site'                // gersit, gersite
+
+      // tariamoji nuosaka (subjunctive mood)
+      '{c*}iau' '{c*}iausi'       // dirbčiau
+      'tum' 'tumei'               // dirbtum, dirbtumei
+      'tumeis' 'tumeisi'          // mokytumeis, mokytumeisi
+      // 't{u'}' nes blogai batutų -> batų
+      't{u'}si'                   // mokytųsi
+      // 'tume' konfliktas su 'šventume'
+      'tum{e.}m'                  // dirbtumėm
+      'tum{e.}me'                 // dirbtumėme
+      'tum{e.}m{e.}s'             // mokytumėmės
+      'tute' 'tum{e.}t'           // dirbtute, dirbtumėt
+      'tum{e.}te'                 // dirbtumėte
+      'tum{e.}t{e.}s'             // mokytumėtės
+
+      // liepiamoji nuosaka (imperative mood)
+      'k' 'ki'                    // dirbk, dirbki, mokykis
+      // 'kis' konfliktas viln-išk-is
+      // 'kime' konfliktas, nes pirkime
+      'kim{e.}s'                  // mokykimės
+
+      // bendratis (infinitive)
+      'uoti' 'iuoti'              // meluoti, dygsniuoti
+      'auti' 'iauti'              // draugauti, girtuokliauti
+      'oti' 'ioti'                // dovanoti, meškerioti
+      '{e.}ti'                    // auklėti
+      'yti'                       // akyti
+      'inti'                      // auginti
+      'in{e.}ti'                  // blusinėti
+      'enti'                      // gyventi
+      'tel{e.}ti'                 // bumbtelėti
+      'ter{e.}ti'                 // bumbterėti
+
+      'ti'                        // skalbti
+      // 'tis' konfliktas, nes rytme-tis -> rytme
+
+      // dalyviai (participles)
+      '{a'}s' 'i{a'}s' '{i'}s'    // dirbąs, žaidžiąs, gulįs
+      't{u'}s'                    // suktųs -> suk
+      'sim{e.}s'                  // suksimės
+      'sit{e.}s'                  // suksitės
+      'kite'                      // supkite
+    )
+
+    delete
+  )
+
+  define step2 as repeat (
+    setlimit tomark p1 for ([substring]) among (
+      // daiktavardziu priesagos (Noun suffixes)
+
+      // budvardziu priesagos (Adjective suffixes)
+      // 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is
+      'ing'                       // tvark-ing-as
+      'i{s*}k'                    // lenk-išk-as
+      '{e.}t'                     // dem-ėt-as
+      'ot'                        // garban-ot-as
+      'uot' 'iuot'                // lang-uot-as, akin-iuot-as
+      // 'tin', nes augintinis    // dirb-tin-is
+      // 'ut', nes batutas, degutas etc. // maž-ut-is
+      'yt'                        // maž-yt-is
+      'iuk'                       // maž-iuk-as
+      'iul'                       // maž-ul-is
+      '{e.}l'                     // maž-ėl-is
+      'yl'                        // maž-yl-is
+      'u{c*}iuk'                  // maž-učiuk-as
+      'uliuk'                     // maž-uliuk-as
+      'ut{e.}ait'                 // maž-utėlait-is
+      'ok'                        // did-ok-as
+      'iok'                       // višč-iok-as
+      'sv' '{s*}v' 'zgan'         // sal-sv-as, pilk-šv-as, bal-zgan-as
+      'op' 'iop'                  // dvej-op-as, viener-iop-as
+      'ain'                       // apval-ain-as
+      'yk{s*}t' 'yk{s*}{c*}'      // ten-ykšt-is, vakar-ykšč-ias
+
+      // laisniai
+      'esn'                       // did-esn-is
+      'aus' 'iaus'                // nauj-aus-ias, ger-iaus-ias
+
+      // ivardziuotiniai budvardziai (Pronominal adjectives)
+      // vyriska gimine (Male gender)
+      'ias'                       // žaliasis
+      'oj' 'ioj'                  // gerojo, žaliojo
+      'aj' 'iaj'                  // gerajam, žaliajam
+      '{a'}j' 'i{a'}j'            // garąjį, žaliąjį
+      'uoj' 'iuoj'                // geruoju, žaliuoju
+      'iej'                       // gerieji
+      '{u'}j' 'i{u'}j'            // gerųjų, žaliųjų
+      'ies'                       // geriesiems
+      'uos' 'iuos'                // geruosius, žaliuosius
+      'ais' 'iais'                // geraisiais, žaliaisiais
+
+      // moteriska gimine (Female gender)
+      'os' 'ios'                  // gerosios, žaliosios
+      '{a'}s' 'i{a'}s'            // gerąsios, žaliąsias
+
+      // būtasis dažninis laikas (frequentative past tense)
+      'dav'                       // ei-dav-o
+
+      // dalyvių priesagos (particple suffix)
+      'ant' 'iant'
+      'int'                       // tur-int-is
+      '{e.}j'                     // tur-ėj-o
+      '{e'}'                      //
+      '{e.}j{e'}'
+      '{e'}s'                     // dirb-ęs-is
+
+      'siant'                     // dirb-siant
+
+      // pusdalyviai (participle)
+      'dam'                       // bėg-dam-as
+
+      'auj'                       // ūkinink-auj-a
+      'jam'
+      'iau'
+      'am'                        // baiminim-ams-i
+    )
+
+    delete
+  )
+
+  define fix_conflicts as (
+    [substring] among (
+      // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite'
+      'aite' (<-'ait{e.}')
+      // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės'
+      'ait{e.}s' (<-'ait{e.}')
+
+      // ''ūs-uotės' -> 'ūs-uotė', konfliktas 'mokotės'
+      'uot{e.}s' (<-'uot{e.}')
+      // ''ūs-uote' -> 'ūs-uotė', konfliktas 'mokote'
+      'uote' (<-'uot{e.}')
+
+      // 'žerėjime' -> 'žėrėjimas', konfliktas su 'žais-ime'
+      '{e.}jime' (<-'{e.}jimas')
+
+      // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu'
+      'esiu' (<-'esys')
+      // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu'
+      'asius' (<-'asys')
+
+      // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime'
+      'avime' (<-'avimas')
+      'ojime' (<-'ojimas')
+
+      // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės'
+      'okat{e.}s' (<-'okat{e.}')
+      // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate'
+      'okate' (<-'okat{e.}')
+    )
+  )
+
+  define fix_chdz as (
+    [substring] among (
+      '{c*}' (<-'t')
+      'd{z*}' (<-'d')
+    )
+  )
+
+  define fix_gd as (
+    [substring] among (
+      'gd' (<-'g')
+      // '{e.}k' (<-'{e.}g')
+    )
+  )
+
+)
+
+define stem as (
+
+  $p1 = limit
+
+  do (
+    // priešdėlis 'a' ilgeniuose nei 6 raidės žodžiuose, pvz. 'a-liejus'.
+    try (test 'a' $(len > 6) hop 1)
+
+    gopast v  gopast non-v  setmark p1
+  )
+
+  backwards (
+    do fix_conflicts
+    do step1
+    do fix_chdz
+    do step2
+    do fix_chdz
+    do fix_gd
+  )
+
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/lithuanian/stemmer.tt b/algorithms/lithuanian/stemmer.tt new file mode 100644 index 0000000..8db874a --- /dev/null +++ b/algorithms/lithuanian/stemmer.tt @@ -0,0 +1,22 @@ +[% header('Lithuanian stemming algorithm') %] + +

Links to resources

+ + + +

+This algorithm was contributed by Dainius Jocas. +

+ +

+Its intended domain of use is information retrieval, and so handling of nouns +is considered more important than that of verbs, adjectives, etc. +

+ +

The full algorithm in Snowball

+ +[% highlight_file('lithuanian') %] + +[% footer %] diff --git a/algorithms/lovins/festschrift.html b/algorithms/lovins/festschrift.html new file mode 100644 index 0000000..33266fb --- /dev/null +++ b/algorithms/lovins/festschrift.html @@ -0,0 +1,2405 @@ + + + + + + + + + + Lovins revisited - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Lovins revisited

+ + +

+This is a revised version of Martin Porter’s paper which was published as part +of the Karen Sparck Jones Festschrift of 2005. +

+ +

+Charting a New Course: Progress in Natural Language Processing and +Information Retrieval: A Festschrift for Professor Karen Sparck Jones, edited +by John Tait, Amsterdam: Kluwer, 2005. +

+ +

Lovins Revisited

+ +

+Martin Porter, December 2001 (revised November 2008). +

+ +

Abstract

+
+ The Lovins stemming algorithm for English is analysed, and compared + with the Porter stemming algorithm, using Snowball, a language designed + specifically for the development of stemming algorithms. It is shown + how the algorithms manage to function in a similar way, while appearing + to be quite different. The Porter algorithm is recoded in the style of + the Lovins algorithm, which leads to the discovery of a few possible + improvements. +
+ +

Preamble

+ +

+This is a festschrift paper, so I am allowed to begin on a personal note. +In 1979 I was working with Keith van Rijsbergen and Stephen Robertson on a +British Library funded IR project to investigate the selection of good +index terms, and one of the things we found ourselves having to do was to +establish a document test collection from some raw data that had been sent +to us on a magnetic tape by Peter Vaswani of the National Physical +Laboratory. I was the tame programmer in the project, so it was my job to +set up the test collection. +

+ +

+On the whole it did not prove too difficult. The data we received was a +collection of about 11,000 documents (titles and short abstracts), 93 +queries — in a free text form, and relevance judgements. All the text was +in upper case without punctuation, and there were one or two marker +characters to act as field terminators. By modern standards the data was +really very small indeed, but at the time it was considerably larger than +any of the other test collections we had. What you had to do was to cast it +into a standard form +for experimental work. You represented terms and documents by numbers, and +created flat files in text form corresponding to the queries, relevance +assessments, and term to document index. One process however was less +straightforward. On their way to becoming numeric terms, the words of the +source text were put through a process of linguistic normalisation called +suffix stripping, in which certain derivational and inflectional suffixes +attached to the words were removed. There was a standard piece of software +used in Cambridge at that time to do this, written in 1971 by Keith +Andrews (Andrews, 1971) as part of a Diploma Project. +One of the courses in +Cambridge is the one year post-graduate Diploma in Computer Science. Each +student on the course is required to do a special project, which includes +writing a significant piece of software — significant in the sense of being +both useful and substantial. +Keith's piece of software was more useful than most, and it continued to be +used as a suffix stripping program, or stemmer, for many years after it was +written. +

+ +

+Now by an odd chance I was privy to much of Keith Andrews’ original +thinking at the time that he was doing the work. The reason for this was +that in 1971 I was looking for a house in Cambridge, and the base I was +operating from was a sleeping bag on the living room floor of an old friend +called John Dawson, who was Keith’s diploma supervisor. Keith used to come round +and discuss stemming algorithms with him, while I formed a mute audience. I +learnt about the Lovins stemming algorithm of 1968 (Lovins, 1968), +and must I think have +at least looked at her paper then, since I know it was not new to me when I +saw it again in 1979. Their view of Lovins’ work was that it did not go far +enough. There needed to be many more suffixes, and more complex rules to +determine the criteria for their removal. Much of their discussion was +about new suffixes to add to the list, and removal rules. It was interesting +therefore to find myself needing to use Andrews’ work eight years later, +and questioning some of its assumptions. Did you need that many suffixes? +Did the rules need to be so complicated? Perhaps one would do better to +break composite suffixes into smaller units and remove them piecemeal. +And perhaps syllables would be a better count of stem length than letters. +So I wrote my own stemmer, which became known as the Porter stemmer, and +which was published in 1980 (Porter, 1980). +

+ +

+I must explain where Karen Sparck Jones fits into all of this. Keith +Andrews’ piece of work was originally suggested by Karen as a Diploma +student project, and she was able to use the Andrews stemmer in her IR +experiments throughout the seventies. In 1979 however Karen had moved much +more into the field of Natural Language Processing and Artificial +Intelligence, and by then had two or three research students in that field +just writing up their PhDs (only one of whom I really got to know — John +Tait, the editor of this volume). So we were in contact, but not working +together. That again was an odd chance: that Karen had been my research +supervisor in a topic other than IR, and that when later I was doing IR +research at Cambridge I was not working with Karen. While I was engaged on +writing the stemmer, Karen showed some justifiable irritation that I had +become interested in a topic so very remote from the one for which we had +received the British Library funding. Nevertheless, she came into my room +one day, said, ‘Look, if you're getting interested in stemming, you’d +better read this,’ and handed me the 1968 issue of Mechanical +Translation that contains the Lovins paper. I still have this issue with +Karen’s name across the top. (And I hope she didn't expect it back!) +

+ +

+Another 20 years have gone by, and I have been studying the Lovins stemmer
again, really because I was looking for examples to code up in Snowball, a
small string processing language I devised in the latter half of 2001
particularly adapted for writing stemming algorithms. Lovins’ stemmer
strikes me now as a fine piece of work, for which she never quite received
the credit she deserved. It was the first stemmer for English set out as
an algorithm that described the stemming process exactly. She explained
how it was intended to be used to improve IR performance, in just the way
in which stemmers are used today. It is not seriously short of suffixes:
the outstanding omissions are the plural forms ements and ents
corresponding to her ement and ent, and it is easy enough to add
them into the definition. It performs well in practice. In fact it is
still in use, and can be downloaded in various languages from the net (1).
The tendency since 1980 has been to attach the name ‘Porter’ to any
language stemming process that does not use a dictionary, even when it is
quite dissimilar to the original Porter stemmer (witness the Dutch Porter
stemmer of Kraaij and Pohlmann (2) (Kraaij, 1994 and Kraaij, 1995)), but
the priority really belongs to Lovins. It also has one clear advantage
over the Porter algorithm, in that it involves fewer steps. Coded up well,
it should run a lot faster.

+ +

+A number of things intrigued me. Why are the Lovins and Porter stemmers so +different, when what they do looks so similar? Could the stemmer, in some +sense, be brought up-to-date? Could the Porter stemmer be cast into the +Lovins form, and so run faster? +

+ +

+This paper is about the answers to these questions. In discovering them, I
have learned a lot more about my own stemmer.

+ +

Why stem?

+ +

+It may be worth saying a little on what stemming is all about. We can imagine +a document with the title, +

+ +
+ Pre-raphaelitism: A Study of Four Critical Approaches +
+ +

+and a query, containing the words +

+ +
+ PRE-RAPHAELITE CRITICISM +
+ +

+We want to match query against title so that ‘Pre-raphaelitism’ matches +‘PRE-RAPHAELITE’ and ‘Critical’ matches ‘CRITICISM’. This leads to the +idea of removing endings from words as part of the process of extracting index +terms from documents, a similar process of ending removal being applied to +queries prior to the match. For example, we would like to remove the endings +from +

+ +
+critical
+critically
+criticism
+criticisms
+critics
+
+ +

+so that each word is reduced to ‘critic’. This is the stem, from which the +other words are formed, so the process as a whole is called stemming. It is +a feature of English morphology that the part of the word we want to remove is +at the end — the suffix. But the same is broadly true of French, German and other +languages of the Indo-European group. It is also true of numerous languages +outside Indo-European, Finnish for example, although there is a +boundary beyond which it is not true. So Chinese, where words are simple +units without affixes, and Arabic, where the stem is modified by +prefixes and infixes as well as suffixes, lie outside the +boundary. As an IR technique it therefore has wide applicability. In developing +stemmers two points were recognised quite early on. One is that the +morphological regularities that you find in English (or other languages) mean +that you can attempt to do stemming by a purely algorithmic process. Endings +al, ally, ism etc. occur throughout English vocabulary, and are +easy to detect and remove: you don’t need access to an on-line dictionary. The +other is that the morphological irregularities of English set a limit to the +success of an algorithmic approach. Syntactically, what look like endings may +not be endings (offspring is not offspr + ing), and the list of +endings seems to extend indefinitely (trapez-oid, likeli-hood, +guardian-ship, Tibet-an, juven-ilia, Roman-esque, ox-en +...) It is difficult to gauge where to set the cut-off for these rarer forms. +Semantically, the addition of a suffix may alter the meaning of a word a +little, a lot, or completely, and morphology alone cannot measure the degree of +change (prove and provable have closely related meanings; probe and +probable do not.) This meant that stemming, if employed at all, became the +most challenging, and the most difficult part of the indexing process. +

+ +

+In the seventies, stemming might be applied as part of the process of +establishing a test collection, and when it was there would not usually be any +attempt to make the stemming process well-defined, or easily repeatable by +another researcher. This was really because the basis for experiment replication +was the normalised data that came out of the stemming process, rather than the +source data plus a description of stemming procedures. Stemming tended to be +applied, and then forgotten about. But by the 1980s, stemming itself was being +investigated. Lennon and others (Lennon, 1981) found no substantial differences +between the use of different stemmers for English. Harman (Harman, 1991) +challenged the effectiveness of stemming altogether, when she reported no +substantial differences between using and not using stemming in a series of +experiments. But later work has been more positive. Krovetz (Krovetz, 1995), for example, +reported small but significant improvements with stemming over a range of test +collections. +

+ +

+Of course, all these experiments assume some IR model which will use stemming in
a particular way, and will measure just those features that test collections
are, notoriously, able to measure. We might imagine an IR system where the users
have been educated in the advantages and disadvantages to be expected from
stemming, and are able to flag individual search terms to say whether or not
they are to be used stemmed or unstemmed. Stemming sometimes improves,
occasionally degrades, search performance, and this would be the best way of
using it as an IR facility. Again stemming helps regularise the IR vocabulary,
which is very useful when preparing a list of terms to present to a user as
candidates for query expansion. But this advantage too is difficult to quantify.

+ +

+An evaluative comparison between the Lovins and later stemmers lies in any case +outside the scope of this paper, but it is important to +bear in mind that it is not a straightforward undertaking. +

+ +

The Lovins Stemmer

+ +

+Structurally, the Lovins stemmer is in four parts, collected together in +four Appendices A, B, C and D in her paper. Part A is a list of 294 +endings, each with a letter which identifies a condition for whether or +not the ending should be removed. (I will follow Lovins in using ‘ending’ +rather than ‘suffix’ as a name for the items on the list.) +Part A therefore looks like this: +

+ +
+ .11.
+ alistically   B
+ arizability   A
+ izationally   B
+ .10.
+ antialness   A
+ arisations   A
+ arizations   A
+ entialness   A
+ .09.
+ allically   C
+ antaneous   A
+ antiality   A
+ . . .
+
+ .01.
+ a   A
+ e   A
+ i   A
+ o   A
+ s   W
+ y   B +
+ +

+Endings are banked by length, from 11 letters down to 1. Each bank is tried +in turn until an ending is found which matches the end of the word to be +stemmed and leaves a stem which satisfies the given condition, when the +ending is removed. For example condition C says that the stem must have at +least 4 letters, so bimetallically would lose allically leaving a +stem bimet of length 5, but metallically would not reduce to +met, since its length is only 3. +

+ +

+There are 29 such conditions, called A to Z, AA, BB and CC, and they +constitute part B of the stemmer. Here they are (* stands for any letter): +

+ +
+ +
A No restrictions on stem +
B Minimum stem length = 3 +
C Minimum stem length = 4 +
D Minimum stem length = 5 +
E Do not remove ending after e +
F Minimum stem length = 3 and do not remove ending after e +
G Minimum stem length = 3 and remove ending only after f +
H Remove ending only after t or ll +
I Do not remove ending after o or e +
J Do not remove ending after a or e +
K Minimum stem length = 3 and remove ending only after l, i or +u*e +
L Do not remove ending after u, x or s, unless s follows +o +
M Do not remove ending after a, c, e or m +
N Minimum stem length = 4 after s**, elsewhere = 3 +
O Remove ending only after l or i +
P Do not remove ending after c +
Q Minimum stem length = 3 and do not remove ending after l or +n +
R Remove ending only after n or r +
S Remove ending only after dr or t, unless t follows t +
T Remove ending only after s or t, unless t follows o +
U Remove ending only after l, m, n or r +
V Remove ending only after c +
W Do not remove ending after s or u +
X Remove ending only after l, i or u*e +
Y Remove ending only after in +
Z Do not remove ending after f +
AA Remove ending only after d, f, ph, th, l, er, or, es or t +
BB Minimum stem length = 3 and do not remove ending after met or +ryst +
CC Remove ending only after l +
+
+ +

+There is an implicit assumption in each condition, A included, that the minimum +stem length is 2. +

+ +

+This is much less complicated than it seems at first. Conditions A to D +depend on a simple measure of minimum stem length, and E and F are slight +variants of A and B. Out of the 294 endings, 259 use one of these +6 conditions. The remaining 35 endings use the other 23 conditions, so +conditions G, H ... CC have less than 2 suffixes each, on average. What is +happening here is that Lovins is trying to capture a rule which gives a +good removal criterion for one ending, or a small number of similar +endings. She does not explain the thinking behind the conditions, but it is +often not too difficult to reconstruct. Here for example are the last few +conditions with their endings, +

+ +
+ +Y (early,   ealy,   eal,   ear). collinearly, multilinear are +stemmed.
+ +Z (eature). misfeature does not lose eature.
+ +AA (ite). acolouthite, hemimorphite lose ite, ignite and +requite retain it.
+ +BB (allic,   als,   al). Words ending metal, crystal retain +al.
+
CC (inity). crystallinity → crystall, but affinity,
+infinity are unaltered.
+ +

+Part C of the Lovins stemmer is a set of 35 transformation rules used to +adjust the letters at the end of the stem. These rules are invoked after the +stemming step proper, irrespective of whether an ending was actually +removed. Here are about half of them, with examples to show the type of +transformation intended (letters in square brackets indicate the full form +of the words), +

+ +
+ +
1) bb →     b rubb[ing] → rub +
ll l controll[ed] → control +
mm m trimm[ed] → trim +
rr r abhorr[ing] → abhor +
2) iev ief believ[e] → belief +
3) uct uc induct[ion] → induc[e] +
4) umpt um consumpt[ion] → consum[e] +
5) rpt rb absorpt[ion] → absorb +
6) urs ur recurs[ive] → recur +
7a) metr meter     parametr[ic] → paramet[er] +
8) olv olut dissolv[ed] → dissolut[ion] +
11) dex dic index → indic[es] +
16) ix ic matrix → matric[es] +
18) uad uas persuad[e] → persuas[ion] +
19) vad vas evad[e] → evas[ion] +
20) cid cis decid[e] → decis[ion] +
21) lid lis elid[e] → elis[ion] +
31) ert ers convert[ed] → convers[ion] +
33) yt ys analytic → analysis +
34) yz ys analyzed → analysed +
+
+ +

+Finally, part D suggests certain relaxed matching rules between query terms +and index terms when the stemmer has been used to set up an IR system, but +we can regard that as not being part of the stemmer proper. +

+ +

The Lovins stemmer in Snowball

+ +

+Snowball is a string processing language designed with the idea of making +the definition of stemming algorithms much more rigorous. The Snowball +compiler translates a Snowball script into a thread-safe ANSI C module, +where speed of execution is a major design consideration. The resulting +stemmers are pleasantly fast, and will process one million or so words a +second on a high-performance modern PC. The Snowball website (3) gives a +full description of the language, and also presents stemmers for a range of +natural languages. Each stemmer is written out as a formal algorithm, with +the corresponding Snowball script following. The algorithm definition acts +as program comment for the Snowball script, and the Snowball script gives a +precise definition to the algorithm. The ANSI C code with the +same functionality can also be inspected, and sample vocabularies in source +and stemmed form can be used for test purposes. +An essential function of +the Snowball script is therefore comprehensibility — it should be fully understood +by the reader of the script, and Snowball has been designed with this in mind. +It contrasts interestingly in this respect with a system like Perl. +Perl has a very big definition. Writing your own scripts in Perl is easy, +after the initial learning hurdle, but understanding other scripts can be +quite hard. The size of the language means that there are many different +ways of doing the same thing, which gives programmers the opportunity of +developing highly idiosyncratic styles. Snowball has a small, tight +definition. Writing Snowball is much less easy than writing Perl, but on +the other hand once it is written it is fairly easy to understand +(or at least one hopes that it is). This is +illustrated by the Lovins stemmer in Snowball, which is given in Appendix +1. There is a very easy and natural correspondence +between the different parts of the stemmer definition in Lovins' original +paper and their Snowball equivalents. 
+For example, the Lovins conditions A, B ... CC code up very neatly +into routines with the same name. Taking condition L, +

+ +
+ L   Do not remove ending after u, x or s, unless s follows + o +
+ +

+corresponds to +

+ +
    define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )
+
+ + +

+When  L  is called, we are the right end of the stem, moving left towards the +front of the word. Each Lovins condition has an implicit test for a stem of +length 2, and this is done by test hop 2 + + +, which sees if it is possible to +hop two places left. If it is not, the routine immediately returns with a +false signal, otherwise it carries on. It tests that the character at the +right hand end is not u, and also not x, and also not s following a letter +which is not o. This is equivalent to the Lovins condition. Here is not of +course the place to give the exact semantics, but the you can quickly get +the feel of the language by comparing the 29 Lovins conditions with their +Snowball definitions. +

+ +

+Something must be said about the among + + + feature of Snowball however, +since this is central to the efficient implementation of stemmers. It is +also the one part of Snowball that requires just a little effort to +understand. +

+ +

+At its simplest, among + + + can be used to test for alternative strings. The +among + + +s used in the definition of condition AA and the  undouble +routine have this form. In Snowball you can write +

+ +
    'sh' or 's' or 't'  'o' or 'i'  'p'
+
+ + +

+which will match the various forms shop, ship, sop, sip, top, tip. The +order is important, because if 'sh' + + + and 's' + + + are swapped over, the +'s' + + + would match the first letter of ship, while 'o' + + + or 'i' + + +would fail to match with the following 'h' + + + — in other words the pattern +matching has no backtracking. But it can also be written as +

+ +
    among('sh' 's' 't') among('i' 'o') 'p'
+
+ + +

+The order of the strings in each among + + + is not important, because the +match will be with the longest of all the strings that can match. In +Snowball the implementation of among + + + is based on the binary-chop idea, +but has been carefully optimised. For example, in the Lovins stemmer, the +main among + + + in the  endings  routine has 294 different strings of average +length 5.2 characters. A search for an ending involves accessing a number +of characters within these 294 strings. The order is going to be +Klog2294, or 8.2K, where K is a number that one hopes will +be small, although one must certainly expect it to be greater than 1. It +turns out that, for the successive words of a standard test vocabulary, +K averages to 1.6, so for each word there are about 13 character +comparisons needed to determine whether it has one of the Lovins endings. +

+ +

+Each string in an among + + + construction can be followed by a routine name. The +routine returns a true/false signal, and then the among + + + searches for the +longest substring whose associated routine gives a true signal. A string not +followed by a routine name can be thought of as a string which is associated +with a routine that does nothing except give a true signal. This is the way +that the among + + + in the  endings  routine works, where indeed every string is +followed by a routine name. +

+ +

+More generally, lists of strings in the among + + + construction can be followed +by bracketed commands, which are obeyed if one of the strings in the list is +picked out for the longest match. The syntax is then +

+ +
+    among( S11 S12 ... (C1)
+           S21 S22 ... (C2)
+           ...
+
+           Sn1 Sn2 ... (Cn)
+         )
+
+ +

+where the  Sij  are strings, optionally followed by their routine names, +and the  Ci  are Snowball command sequences. The semantics is a bit +like a switch in C, where the switch is on a string rather than a numerical +value: +

+ +
+    switch(...) {
+        case S11: case S12: ... C1; break;
+        case S21: case S22: ... C2; break;
+        ...
+
+        case Sn1: case Sn2: ... Cn; break;
+    }
+
+ +

+The among + + + in the  respell  routine has this form. +

+ +

+The full form however is to use among + + + with a preceding substring + + +, with +substring + + + and among + + + possibly separated by further commands. +substring + + +triggers the test for the longest matching substring, and the among + + + then +causes the corresponding bracketed command to be obeyed. At a simple +level this can be used to cut down the size of the code, in that +

+ +
+    substring C among( S11 S12 ... (C1)
+                       S21 S22 ... (C2)
+                       ...
+
+                       Sn1 Sn2 ... (Cn)
+                     )
+
+ +

+is a shorter form of +

+ +
+    among( S11 S12 ... (C C1)
+           S21 S22 ... (C C2)
+           ...
+
+           Sn1 Sn2 ... (C Cn)
+         )
+
+ +

+More importantly, substring + + + and among + + + can work in different contexts. For +example, substring + + + could be used to test for the longest string, matching from +right to left, while the commands in the among + + + could operate in a left to +right direction. In the Lovins stemmer, substring + + + is used in this style: +

+ +
    [substring] among ( ... )
+
+ + +

+The two square brackets are in fact individual commands, so before the among + + +come three commands. [ + + + sets a lower marker, substring + + + is obeyed, searching +for the strings in the following among, and then ] + + + sets an upper marker. +The region between the lower and upper markers is called the slice, and this +may subsequently be copied, replaced or deleted. +

+ +

+It was possible to get the Lovins stemmer working in Snowball very quickly. +The Sourceforge versions (1) could be used to get the long list of endings and +to help with the debugging. There was however one problem, that rules 24 and +30 of part C conflicted. They are given as +

+ +
+ 24) end → ens except following s
+ ...
+ 30) end → ens except following m +
+ +

+This had not been noticed in the Sourceforge implementations, but
immediately gave rise to a compilation error in Snowball. Experience
suggested that I was very unlikely to get this problem resolved. Only a few
months before, I had hit a point in a stemming algorithm where
something did not quite make sense. The algorithm had been published just a
few years earlier, and contacting one at least of the authors was quite easy.
But I never sorted it out. The author I traced was not au fait
with the linguistic background, and the language expert had been swallowed
up in the wilds of America. So what chance would I have here? Even if I was
able to contact Lovins, it seemed to me inconceivable that she would have
any memory of, or even interest in, a tiny problem in a paper which she
published 33 years ago. But the spirit of academic enquiry forced me to
venture the attempt. After pursuing a number of red-herrings, email contact
was finally made.

+ +

+Her reply was a most pleasant surprise. + +

+

+ ... The explanation is both mundane and exciting. You have just found + a typo in the MT article, which I was unaware of all these years, and I + suspect has puzzled a lot of other people too. The original paper, an + MIT-published memorandum from June 1968, has rule 30 as +

+ +

+       ent → ens except following m +

+ +

+ and that is undoubtedly what it should be ... +

+
+ +

An analysis of the Lovins stemmer

+ +

+It is very important in understanding the Lovins stemmer to know something +of the IR background of the late sixties. In the first place there was an +assumption that IR was all, or mainly, about the retrieval of +technical scientific papers, and research projects were set up accordingly. +I remember being shown, in about 1968, a graph illustrating the +‘information explosion’, as it was understood at the time, which showed +just the rate of growth of publications of scientific papers in various +different domains over the previous 10 or 20 years. Computing resources +were very precious, and they could not be wasted by setting up IR systems +for information that was, by comparison, merely frivolous (articles in +popular magazines, say). And even in 1980, when I was working in IR, the +data I was using came from the familiar, and narrow, scientific domain. +Lovins was working with Project Intrex (Overhage, 1966), where the data came from +papers in materials science and engineering. +

+ +

+Secondly, the idea of indexing on every word in a document, or even looking +at every word before deciding whether or not to put it into an index, would +have seemed quite impractical, even though it might have been recognised as +theoretically best. In the first place, the computing resources necessary to +store and analyse complete documents in machine readable form were absent, and in the +second, the rigidities of the printing industry almost guaranteed that one +would never get access to them. +A stemmer, therefore, would be seen as something not +applied to general text but to certain special words, and in the case of the +Lovins stemmer, the plan was to apply it to the subject terms that were used +to categorize each document. Subsequently it would be used with each word +in a query, where it +was hoped that the vocabulary of the queries would match the vocabulary of +the catalogue of subject terms. +

+ +

+This accounts for: — +

+ +
    +
  1. The emphasis on the scientific vocabulary. This can be seen in the
endings, which include oidal, on, oid, ide, for words like colloidal,
proton, spheroid, nucleotide. It can be seen in the transformation rules,
with their concern for Greek sis and Latin ix suffixes. And also it can be
seen in the word samples of the paper (magnesia, magnesite, magnesian,
magnesium, magnet, magnetic, magneto etc. of Fig. 2).
  2. + +
  3. The slight shortage of plural forms. The subject terms would naturally +have been mainly in the singular, and one might also expect the same of +query terms. +
  4. + +
  5. The surprising shortness of the allowed minimum stems — usually 2 +letters. A controlled technical vocabulary will contain longish words, and +the problem of minimum stem lengths only shows up with shorter words. +
  6. +
+ +

+If we take a fairly ordinary vocabulary of modern English, derived from +non-scientific writing, it is interesting to see how much of the Lovins +stemmer does not actually get used. We use vocabulary V, derived from a +sample of modern texts from Project Gutenberg (4). V can be inspected +at (5). It contains 29,401 words, and begins +

+ a   aback   abandon   abandoned   abandoning   abandonment   + abandons   abasement   abashed   abate   abated ... +
+We find that 22,311, or about 76%, of the words in V have one of the
294 endings removed if passed through the Lovins stemmer. Of this 76%, over a
half (55%) of the removals are done by just six of the endings, the breakdown
being,
+ s (13%)   ed (12%)   e (10%)   ing (10%)   es (6%)   y (4%) +
+If, on the other hand, you look at the least frequent endings, 51% of them +do only 1.4% of the removals. So of the ones removed, half the endings in +V +correspond to 2% of the endings in the stemmer, and 1.4% of the endings in +V +correspond to half the endings in the stemmer. In fact 62 of the endings +(about a fifth) do not lead to any ending removals in V at all. These are +made up of the rarer ‘scientific’ endings, such as aroid and oidal, and +long endings, such as alistically and entiality. +

+ +

+This helps explain why the Porter and Lovins stemmers behave in a fairly +similar way despite the fact that they look completely different — it is +because most of the work is being done in just a small part of the stemmer, +and in that part there is a lot of overlap. Porter and Lovins stem 64% of +the words in V identically, which is quite high. (By contrast, an +erroneous but plausibly written Perl script +advertised on the Web as an implementation of the Porter stemmer +still proves to stem only 86% of the words in V +to the same forms that are produced by the Porter stemmer.) +

+ +

+A feature of the Lovins stemmer that is worth looking at in some detail is +the transformation rules. People who come to the problem of stemming for +the first time usually devote a lot of mental energy to the issue of +morphological irregularity which they are trying to address. +

+ +

+A good starting point is the verbs of English. Although grammatically
complex, the morphological forms of the English verb are few, and are
illustrated by the pattern harm, harms, harming, harmed, where the basic
verb form adds s, ing and ed to make the other three forms. There are
certain special rules: to add s to a verb ending ss an e is inserted,
so pass becomes passes, and adding ed and ing replaces a final e of
the verb (love to loved), and can cause consonant doubling (hop to
hopped), but
apart from this all verbs in the language follow the basic pattern with the
exception of a finite class of irregular verbs.
In a regular verb, the addition of ed to the basic verb creates both the
past form (‘I harmed’) and the p.p. (past participle) form (‘I have
harmed’). An irregular verb, such as ring, forms its past in some other
way (‘I rang’), and may have a distinct p.p. (‘I have rung’).
The irregular verbs have a
different past form, and sometimes a separate p.p. form.
It is easy to think up more examples,


+
stem past p.p. +
+
ring rang rung +
rise rose risen +
sleep slept slept +
fight       fought       fought +
come came come +
go went gone +
hit hit hit +
+ +How many of these verbs are there altogether? On 20 Jan 2000, in order to +test the hypothesis that the number is consistently over-estimated, I asked +this question in a carefully worded email to a mixed group of +about 50 +well-educated +work colleagues (business rather than academic people). Ten of them replied, +and here are the +guesses they made: + +
+ 20,   25,   25,   50,   180,   200,   426,   25000,   10%,   20% +
+ +The last two numbers mean 10% and 20% of all English verbs. +My hypothesis was of course wrong. The truth is that most people have no +idea at all how many irregular verbs there are in English. +In +fact there are around 135 (see section 3.3 of Palmer, 1965). +If a stemming algorithm handles suffix removal +of all regular verbs correctly, the question arises as to whether it is +worth making it do the same for the irregular forms. Conflating fought and +fight, for example, could be useful in IR queries about boxing. It seems +easy: you make a list of the irregular verbs and create a mapping of the +past and p.p. forms to the main form. We can call the process +English verb respelling. But when you try it, numerous problems arise. Are +forsake, beseech, cleave really verbs of contemporary English? If so, what +is the p.p. of cleave? +Or take the verb stride, which is common enough. What is its p.p.? My +Concise Oxford English Dictionary says it is stridden (6), but have we ever +heard this word used? (‘I have stridden across the paving.’) +

+ +

+To compose a realistic list for English verb respelling we therefore need to +judge word rarity. But among the commoner verb forms even greater problems +arise because of their use as homonyms. A rose is a type of flower, so +is it wise +to conflate rose and rise? Is it wise to conflate +saw and see when saw can mean a cutting instrument? +

+ +

+We suddenly get to +the edge of what it is useful to include in a stemming algorithm. So long as +a stemming algorithm is built around general rules, the full impact of the +stemmer on a vocabulary need not be studied too closely. It is sufficient to +know that the stemmer, judiciously used, improves retrieval performance. But +when we look at its effect on individual words these issues can no longer be +ignored. To build even a short list of words into a stemmer for special +treatment takes us into the area of the dictionary-based stemmer, and the +problem of determining, for a pair of related words in the dictionary, a +measure of semantic similarity which tells us whether or not the words +should be conflated together. +

+ +

+About half the transformation rules in the Lovins stemmer deal with a +problem which is similar to that posed by the irregular verbs of English, +and which ultimately goes back to the irregular forms of second conjugation +verbs in Latin. We can call it Latin verb respelling. Verbs like +induce, consume, commit are perfectly regular in modern English, but +the adjectival and noun forms induction, consumptive, commission that +derive from them correspond to p.p. forms in Latin. +You can see the descendants of these Latin irregularities +in modern Italian, which has commettere with p.p. +commesso, like our commit and commission, and scendere with +p.p. sceso like our ascend and ascension (although scendere +means ‘to go down’ rather than ‘to go up’). +

+ +

+Latin verb respelling often seems to be more the territory of a stemmer than +English verb respelling, presumably because Latin verb irregularities +correspond to consonantal changes at the end of the stem, where the +stemmer naturally operates, while English verb irregularities more often +correspond to vowel changes in the middle. Lovins was no doubt +particularly interested in Latin verb respelling because so many of the +words affected have scientific usages. +

+ +

+We can judge that Latin verb respellings constitute a small set because the
second conjugation verbs of Latin form a small, fixed set. Again,
looking at Italian, a modern list of irregular verbs contains 150 basic forms
(nearly all of them second conjugation), not unlike the number of forms in
English. Extra verbs are formed with prefixes. Corresponding English words
that exhibit the Latin verb respelling problem
will be a subset of this system. In fact we
can offer a Snowball script that does the Latin verb respelling with more
care. It should be invoked, in the Porter stemmer, after removal of ive or
ion endings only,


define prefix as (
+
+    among (
+
+        'a' 'ab' 'ad' 'al' 'ap' 'col' 'com' 'con' 'cor' 'de'
+        'di' 'dis' 'e' 'ex' 'in' 'inter' 'o' 'ob' 'oc' 'of'
+        'per' 'pre' 'pro' 're' 'se' 'sub' 'suc' 'trans'
+    ) atlimit
+)
+
+define second_conjugation_form as (
+
+    [substring] prefix among (
+
+        'cept'    (<-'ceiv')    //-e    con de re
+        'cess'    (<-'ced')     //-e    con ex inter pre re se suc
+        'cis'     (<-'cid')     //-e    de (20)
+        'clus'    (<-'clud')    //-e    con ex in oc (26)
+        'curs'    (<-'cur')     //      re (6)
+        'dempt'   (<-'deem')    //      re
+        'duct'    (<-'duc')     //-e    de in re pro (3)
+        'fens'    (<-'fend')    //      de of
+        'hes'     (<-'her')     //-e    ad (28)
+        'lis'     (<-'lid')     //-e    e col (21)
+        'lus'     (<-'lud')     //-e    al de e
+        'miss'    (<-'mit')     //      ad com o per re sub trans (29)
+        'pans'    (<-'pand')    //      ex (23)
+        'plos'    (<-'plod')    //-e    ex
+        'prehens' (<-'prehend') //      ap com
+        'ris'     (<-'rid')     //-e    de (22)
+        'ros'     (<-'rod')     //-e    cor e
+        'scens'   (<-'scend')   //      a
+        'script'  (<-'scrib')   //-e    de in pro
+        'solut'   (<-'solv')    //-e    dis re (8)
+        'sorpt'   (<-'sorb')    //      ab (5)
+        'spons'   (<-'spond')   //      re (25)
+        'sumpt'   (<-'sum')     //      con pre re (4)
+        'suas'    (<-'suad')    //-e    dis per (18)
+        'tens'    (<-'tend')    //      ex in pre (24)
+        'trus'    (<-'trud')    //-e    ob (27)
+        'vas'     (<-'vad')     //-e    e (19)
+        'vers'    (<-'vert')    //      con in re (31)
+        'vis'     (<-'vid')     //-e    di pro
+    )
+)
+
+ + +This means that if suas, for example, is preceded by one of the strings +in prefix + + +, and there is nothing more before the prefix string (which is +what the +atlimit + + +command tests), it is replaced by suad. So dissuas(ion) goes to +dissuad(e) +and persuas(ive) to persuad(e). Of course, asuas(ion), absuas(ion), +adsuas(ion) and so on would get the same treatment, but not being words of +English that does not really matter. The corresponding Lovins rules are +shown in brackets. +This is not quite the end +of the story, however, because the Latin forms ex + cedere (‘go +beyond’) pro + cedere (‘go forth’), and sub + cedere +(‘go after’) give rise to verbs which, +by an oddity of English orthography, have an extra letter e: exceed, proceed, +succeed. They can be sorted out in a final respelling step: +

+ +
define final_respell as (
+
+    [substring] atlimit among(
+
+        'exced'     (<-'exceed')
+        'proced'    (<-'proceed')
+        'succed'    (<-'succeed')
+        /* extra forms here perhaps */
+    )
+)
+
+ + +

+As you might expect, close inspection of this process creates doubts in +the same way as for English verb respelling. (Should we really conflate +commission and commit? etc.) +

+ +

+The other transformation rules are concerned with unusual plurals, mainly +of Latin or Greek origin, er and re differences, as in parameter and +parametric, and the sis/tic connection of certain words of Greek origin: +analysis/analytic, paralysis/paralytic ... (rule 33), and +hypothesis/hypothetic, kinesis/kinetic ... (rule 32). Again, these +irregularities might be tackled by forming explicit word lists. Certainly +rule 30, given as, +

+ +
+ ent → ens except following m, +
+ +

+goes somewhat wild when given a general English vocabulary (dent becomes +dens for example), although it is the only rule that might be said to +have a damaging effect. +

+ +

A Lovins shape for the Porter stemmer

+ +

+The 1980 paper (Porter, 1980) may be said to define the ‘pure’ Porter stemmer. +The stemmer distributed at (7) can be called the ‘real’ Porter +stemmer, and differs from the pure stemmer in three small respects, which +are carefully explained. This disparity does not require much excuse, +since the oldest traceable encodings of the stemmer have always contained +these differences. There is also a revised stemmer for English, called +‘Porter2’ and still subject to slight changes. Unless otherwise stated, +it is the real Porter stemmer which is being studied below. +

+ +

+The Porter stemmer differs from the Lovins stemmer in a number of
respects. In the first place, it only takes account of fairly common
features of English. So rare suffixes are not included, and there is no
equivalent of Lovins’ transformation rules, other than her rule (1), the
undoubling of terminal double letters. Secondly, it removes suffixes only
when the residual stem is fairly substantial. Some suffixes are removed
only when at least one syllable is left, and most are removed only when at least two
syllables are left. (One might say that this is based on a guess about the
way in which the meaning of a stem is related to its length in syllables (8).)
The Porter stemmer is therefore ‘conservative’ in its removal
of suffixes, or at least that is how it has often been described. Thirdly,
it removes suffixes in a series of steps, often reducing a compound suffix
to its first part, so a step might reduce ibility to ible, where
ibility is thought of as being ible + ity. Although the
description of the whole stemmer is a bit complicated, the total number of
suffixes is quite small — about 60.

+ +

+The Porter stemmer has five basic steps. Step 1 removes an +inflectional suffix. There are only three of these: ed and ing, which are +verbal, and s, which is verbal (he sings), plural (the songs) or possessive +(the horses’ hooves), although the rule for s removal is the same in all +three cases. Step 1 may also restore an e (hoping → hope), undouble a +double letter pair (hopping → hop), or change y to i (poppy → +poppi, to match with poppies → poppi.) Steps 2 to 4 remove derivational +suffixes. So +ibility may reduce to ible in step 2, and ible itself may be removed in step +4. Step 5 is for removing final e, and undoubling ll. +

+ +

+A clear advantage of the Lovins stemmer over the Porter stemmer is speed. +The Porter stemmer has five steps of suffix removal to the Lovins stemmer’s +one. It is instructive therefore to try and cast the Porter stemmer into +the shape of the Lovins stemmer, if only for the promise of certain speed +advantages. As we will see, we learn a few other things from the exercise +as well. +

+ +

+First we need a list of endings. The Lovins endings were built up by hand, +but we can construct a set of endings for the Porter stemmer by writing an +ending generator that follows the algorithm definition. From an analysis of +the suffixes in steps 2 to 4 of the Porter stemmer we can construct +the following diagram: +

+ +Diagram showing ending combinations for the Porter stemmer + +

+This is not meant to be a linguistic analysis of the suffix structure of +English, but is merely intended to show how the system of endings works in +the stemming algorithm. Suffixes combine if their boxes are connected by +an arrow. So ful combines with ness to make fulness. + +

+ ful + nessfulness +
+ +The combination is not always a concatenation of the strings +however, for we have, + +
+ able + ityability
+ able + lyably
+ ate + ionation
+ ible + ityibility
+ ible + lyibly
+ ize + ate + ionization +
+ +The path from ize to ion goes via ate, so we can form ization, but there is +no suffix izate. Three of the suffixes, ator, ance and ence, do not connect +into the rest of the diagram, and ance, ence also appear in the forms +ancy, ency. The letter to the left of the box is going to be the +condition for the +removal of the suffix in the box, so + +
+      B +-------+ n
+        |  ism  |
+        +-------+
+
+ +means that ism will be removed if it follows a stem that satisfies +condition B. On the right of the box is either n, v or hyphen. n means the +suffix is of noun type. So if a word ends ism it is a noun. v means verb +type. hyphen means neither: ly (adverbial) and ful, ous (adjectival) are of +this type. If a suffix is a noun type it can have a plural form (criticism, +criticisms), so we have to generate isms as well as ism. Again, the +combining is not just concatenation, + +
+ ity + sities
+ ness + snesses +
+ +If a suffix has v type, it has s, ed and ing forms, + +
+ ize + sizes
+ ize + edized
+ ize + ingizing +
+ +Type v therefore includes type n, and we should read this type as ‘verb or +noun’, rather than just ‘verb’. For example, condition, with suffix ion, is +both verb (‘They have been conditioned to behave like that’) and noun +(‘It is subject to certain conditions’). +

+ +

+The diagram is therefore a scheme for generating combined derivational +suffixes, each combination possibly terminated with an inflectional suffix. +A problem is that it contains a loop in + +

+ izeateionalize → ... +
+ +suggesting suffixes of the form izationalizational... We break the loop by +limiting the number of joined derivational suffixes of diagram 1 to four. +(Behaviour of the Porter stemmer shows that removal of five combined +derivation suffixes is never desirable, even supposing five ever combine.) +We can then generate 181 endings, with their removal codes. But 75 of these +suffixes do not occur as endings in V, and they can be eliminated as rare +forms, leaving 106. Alphabetically, the endings begin, + +
+ abilities   ability   able   ables   ably   al   alism   + (alisms)   alities   ality   alization   (alizationed)   + (alizationing)   (alizations)   alize   alized   (alizer)   + (alizered)   (alizering)   (alizers)   (alizes)   (alizing)   + ally   alness   (alnesses)   als   ance   ances   ancies   + ancy ... +
+ +The eliminated rare forms are shown bracketed. +

+ +

+The 106 endings are arranged in a file as a list of strings followed by +condition letter, + +

+    'abilities'     B
+    'ability'       B
+    'able'          B
+    'ables'         B
+    'ably'          B
+    'al'            B
+    ....
+
+ +This ending list is generated by running the ANSI C program shown in +Appendix 4, and line-sorting the result into a file, +and this file is called in by the get + + + directive in the Snowball script of +Appendix 2, which is the Porter stemming algorithm laid out in the style of +the Lovins algorithm. In fact, precise equivalence cannot be achieved, but +in V only 137 words stem differently, which is 0.4% of V. There are 10 +removal conditions, compared with Lovins’ 29, and 11 transformation or +respelling rules, compared with Lovins’ 35. We can describe the process in +Lovins style, once we have got over a few preliminaries. +

+ +

+We have to distinguish y as a vowel from y as a consonant. We treat initial +y, and y before vowel, as a consonant, and make it upper case. Thereafter +a, e, i, o, u and y are vowels, and the other lower case letters and Y are +consonants. If [C] stands for zero or more consonants, C for one or more +consonants, and V for one or more vowels, then a stem of shape [C]VC has +length 1s (1 syllable), of shape [C]VCVC length 2s, and so on. +

+ +

+A stem ends with a short vowel if the ending has the form cvx, where c is a +consonant, v a vowel, and x a consonant other than w, x or Y. +(Short vowel endings with ed and ing imply loss of an e from +the stem, as in removing = remove + ing.) +

+ +

+Here are the removal conditions, +

+ +
+ +
A   Minimum stem length = 1s +
B Minimum stem length = 2s +
C Minimum stem length = 2s and remove ending only after s or t +
D Minimum stem length = 2s and do not remove ending after m +
E Remove ending only after e or ous after minimum stem length 1s +
F Remove ending only after ss or i +
G Do not remove ending after s +
H Remove ending only if stem contains a vowel +
I Remove ending only if stem contains a vowel and does not end in e +
J Remove ending only after ee after minimum stem length 1s +
+
+ +

+In condition J the stem must end ee, and the part of the stem before the +ee must have minimum length 1s. Condition E is similar. +

+ +

+Here are the respelling rules, defined with the help of the removal +conditions. In each case, the stem being tested does not include the string +at the end which has been identified for respelling. + +

+
1)   Remove e if A, or if B and the stem does not end with a short vowel +
2) Remove l if B and the stem ends with l +
3) enci/encyenc if A, otherwise → enci +
4) anci/ancyanc if A, otherwise → anci +
5) allyal if A, otherwise → alli +
6) entlyent if A, otherwise → entli +
7) atorat if A +
8) logi/logylog if A, otherwise → log +
9) bli/blybl if A, otherwise → bli +
10) bilbl if stem ends vowel after A +
11) y/Yi if stem contains a vowel +
+ +The 106 endings are distributed among conditions A to E as A(5), B(87), +C(8), D(3) and E(1). F to J deal with the purely inflectional endings: F +with es, G with s, H with ing and ings, I with ed and J with d. +There is however one point at which the Lovins structure breaks down, in that +removal of ed and ing(s) after conditions I and H requires a special +adjustment that cannot be left to a separate transformation rule. It is to +undouble the last letter, and to restore a final e if the stem has length 1s +and ends with a short vowel (so shopping loses a p and becomes shop, +sloping gains an e and becomes slope.) +

+ +

+The Porter stemmer cast into this form runs significantly faster than the +multi-stage stemmer — about twice as fast in tests with Snowball. +

+ +

+We will call the Porter stemmer P, the Lovins stemmer L, and this Lovins +version of the Porter stemmer LP. As we have said, P and LP are not identical, +but stem 137 of the 29,401 words of V differently. +

+ +

+A major cause of difference is unexpected suffix combinations. These can be +subdivided into combinations of what seem to be suffixes but are not, and +rare combinations of valid suffixes. +

+ +

+The first case is illustrated by the word disenchanted. P stems this to +disench, first taking off suffix ed, and then removing ant, which is +a suffix in English, although not a suffix in this word. P also stems +disenchant to disench, so the two words disenchant and +disenchanted are conflated by P, even though they make an error in the +stemming process. But ant is a noun type suffix, and so does not combine +with ed. anted is therefore omitted from the suffix list of LP, so LP +stems disenchanted to disenchant, but disenchant to disench. +

+ +

+This illustrates a frequently encountered problem in stemming. S1 +and S2 are suffixes of a language, but the combination +S1S2 is +not. A word has the form xS1, where x is some string, but in +xS1, S1 is not actually a suffix, but part of the stem. +S2 is a valid suffix for this word, so xS1S2 is +another word in the language. An algorithmic stemmer stems xS1 to +x in error. If presented with xS1S2 it can either +(a) stem it to xS1, knowing S1 cannot be a suffix in +this context, or (b) stem it to x, ignoring the knowledge to be +derived from the presence of S2. (a) gives the correct stemming +of at least xS1S2, although the stemming of xS1 +will be wrong, while (b) overstems both words, but at least achieves +their conflation. In other words (a) fails to conflate the two forms, but +may achieve correct conflations of xS1S2 with similar forms +xS1S3, xS1S4 etc., while (b) conflates +the two forms, but at the risk of additional false conflations. Often a study +of the results of a stemming strategy on a sample vocabulary leads one to +prefer approach (b) to (a) for certain classes of ending. This is +true in particular of the inflectional endings of English, which is why the +removals in step 1 of P are not remembered in some state variable, which +records whether the ending just removed is verb-type, noun-or-verb-type etc. +On balance you get better results by throwing that information away, and then +the many word pairs on the pattern of disenchant / disenchanted will +conflate together. +

+ +

+Other examples from V can be given: in misrepresenting, ent is
not a suffix, and enting not a valid suffix combination; in
witnessed, ness is not a suffix, and nessed not a valid
suffix combination.

+ +

+This highlights a disadvantage of stemmers that work with a fixed list of +endings. To get the flexibility of context-free ending removal, we need to +build in extra endings which are not grammatically correct (like anted = +ant + ed), and this adds considerably to the burden of constructing +the list. In fact L does not include anted, but it does include for +example antic (ant + ic), which may be serving a similar +purpose. +

+ +

+For the second case, the rare combinations of valid suffixes, one may instance +ableness. Here again the multi-step stemmer makes life easier. P removes +ness in step 3 and able in step 4, but without making any necessary +connection. L has ableness as an ending, dictionaries contain many +ableness words, and it is an easy matter to make the connection across from +able to ness in diagram 1 and generate extra endings. Nevertheless the +ending is very rare in actual use. For example, Dickens’ Nicholas Nickleby +contains no examples, Bleak House contains two, in the same sentence: +

+ +
+ I was sure you would feel it yourself and would excuse the + reasonableness of MY feelings when coupled with the known + excitableness of my little woman. +
+ +

+reasonableness is perhaps the commonest word in English of this form, and +excitableness (instead of excitability) is there for contrast. Thackeray’s +Vanity Fair, a major source in testing out P and Porter2, contains one +word of this form, charitableness. One may say of this word that it is +inevitably rare, because it has no really distinct +meaning from the simpler charity, but that it has to be formed by adding +ableness rather than ability, because the repeated ity in charity + +ability is morphologically unacceptable. Other rare combinations are +ateness, entness +and eds (as in intendeds and beloveds). +fuls is another interesting case. The ful suffix, usually adjectival, +can sometimes create nouns, giving plurals such as mouthfuls and +spoonfuls. But in longer words sful is a more ‘elegant’ plural +(handbagsful, dessertspoonsful). +

+ +

+These account for most of the differences, but there are a few others. +

+ +

+One is in forms like bricklayersbricklai (P), bricklay (LP). +Terminal y is usefully turned to i to help conflate words where y is changed +to i and es added to form the plural, but this does not happen when +y +follows a vowel. LP improves on P here, but the Porter2 algorithm makes the +same improvement, so we have nothing to learn. +There is also a difference in words endings lle or lles, +quadrillequadril (P), quadrill (LP). This is because e and +l +removal are successive in step 5 of P, and done as alternatives in the +respelling rules +of LP. In LP this is not quite correct, since +Lovins makes it clear that her transformation rules should be +applied in succession. Even so, LP seems better than P, suggesting +that step 5b of P (undouble l) should not have been attempted after e removal +in step 5a. So here is a possible small improvement to Porter2. Another +small, but quite interesting difference, is the condition attached to the +ative ending. The ending generator makes B the removal condition by a +natural process, but in P its removal condition is A. This goes back to step +3 as originally presented in the paper of 1980: + +

+ (m>0) ICATE → IC
+ (m>0) ATIVE →
+ (m>0) ALIZE → AL
+ (m>0) ICITI → IC
+ (m>0) ICAL → IC
+ (m>0) FUL →
+ (m>0) NESS → +
+(m>0) corresponds to A. With removal condition B, the second line would be + +
+ (m>1) ATIVE → +
+which looks slightly incongruous. Nevertheless it is probably correct, because we
remove a half suffix from icate, alize, icity and ical when the stem
length is at least 1s, and so we should remove the full ate + ive suffix when the stem
length is at least 2s. We should not be influenced by ful and ness.
They are ‘native English’ stems, unlike the other five, which
have a ‘Romance’ origin, and for these two condition A has been found to
be more appropriate. In fact putting in this adjustment to Porter2 results in an
improvement in the small class of words thereby affected.

+ +

Conclusion

+ +

+You never learn all there is to know about a computer program, unless the +program is really very simple. So even after 20 years of regular use, +we can learn something new about P by creating LP and comparing the +two. And in the process we learn a lot about L, the Lovins stemmer itself. +

+ +

+The truth is that the main motivation for studying L was to see how well the
Snowball system could be used for implementing and analyzing Lovins’
original work, and the interest in what she had actually achieved in 1968
only came later. I hope that this short account helps clarify her work, and
place it in the context of the development of stemmers since then.

+ +

Notes

+ +

+The http addresses below have a ‘last visited’ date of December 2001. +

+ +
    +
  1. The Lovins stemmer is available at +

    + +
      +
    • http://www.cs.waikato.ac.nz/~eibe/stemmers
    • +
    • http://sourceforge.net/projects/stemmers
    • +
    +
  2. + +
  3. See  http://www-uilots.let.uu.nl/~uplift/
  4. + +
  5. See  http://snowball.sourceforge.net
  6. + +
  7. See  http://promo.net/pg/
  8. + +
  9. See  http://snowball.sourceforge.net/english/voc.txt
  10. + +
  11. In looking at verbs with the pattern ride, rode, ridden, Palmer, +1965, notes that ‘we should perhaps add STRIDE, with past tense strode, +but without a past participle (there is no *stridden).’
  12. + +
  13. See  https://tartarus.org/~martin/PorterStemmer/
  14. + +
  15. Lovins (1968), p. 25, mentions that a stemming algorithm developed by + James L. Dolby in California used a two-syllable minimum stem length as a + condition for most of the stemming.
  16. +
+ +

Bibliography

+ +

+Andrews K (1971) The development of a fast conflation algorithm for English. +Dissertation for the Diploma in Computer Science, Computer Laboratory, +University of Cambridge. +

+ +

+Harman D (1991) How effective is suffixing? Journal of the American +Society for Information Science, 42: 7-15. +

+ +

+Kraaij W and Pohlmann R (1994) Porter’s stemming algorithm for Dutch. In +Noordman LGM and de Vroomen WAM, eds. Informatiewetenschap 1994: +Wetenschappelijke bijdragen aan de derde STINFON Conferentie, Tilburg, +1994. pp. 167-180. +

+ +

+Kraaij W and Pohlmann R (1995) Evaluation of a Dutch stemming algorithm. +Rowley J, ed. The New Review of Document and Text Management, volume 1, +Taylor Graham, London, 1995. pp. 25-43, +

+ +

+Krovetz B (1995) Word sense disambiguation for large text databases. PhD +Thesis. Department of Computer Science, University of Massachusetts +Amherst. +

+ +

+Lennon M, Pierce DS, Tarry BD and Willett P (1981) An evaluation of some +conflation algorithms for information retrieval. Journal of Information +Science, 3: 177-183. +

+ +

+Lovins JB (1968) Development of a stemming algorithm. Mechanical +Translation and Computational Linguistics, 11: 22-31. +

+ +

+Overhage, CFJ (1966) Plans for project Intrex. Science, 152: +1032-1037. +

+ +

+Palmer FR (1965) A linguistic study of the English verb. London, +Longmans. +

+ +

+Porter MF (1980) An algorithm for suffix stripping. Program, 14: +130-137. +

+ +

Appendix 1

+ +

+The Lovins stemmer in Snowball. +

+ +
stringescapes {}
+
+routines (
+   A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
+
+   endings
+
+   undouble respell
+)
+
+externals ( stem )
+
+backwardmode (
+
+  /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
+     a test for a two letter prefix ('test hop 2') is implicitly
+     assumed. Note that 'e' next 'u' corresponds to her u*e because
+     Snowball is scanning backwards. */
+
+  define A  as ( hop 2 )
+  define B  as ( hop 3 )
+  define C  as ( hop 4 )
+  define D  as ( hop 5 )
+  define E  as ( test hop 2 not 'e' )
+  define F  as ( test hop 3 not 'e' )
+  define G  as ( test hop 3 'f' )
+  define H  as ( test hop 2 't' or 'll' )
+  define I  as ( test hop 2 not 'o' not 'e' )
+  define J  as ( test hop 2 not 'a' not 'e' )
+  define K  as ( test hop 3 'l' or 'i' or ('e' next 'u') )
+  define L  as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )
+  define M  as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )
+  define N  as ( test hop 3 ( hop 2 not 's' or hop 2 ) )
+  define O  as ( test hop 2 'l' or 'i' )
+  define P  as ( test hop 2 not 'c' )
+  define Q  as ( test hop 2 test hop 3 not 'l' not 'n' )
+  define R  as ( test hop 2 'n' or 'r' )
+  define S  as ( test hop 2 'dr' or ('t' not 't') )
+  define T  as ( test hop 2 's' or ('t' not 'o') )
+  define U  as ( test hop 2 'l' or 'm' or 'n' or 'r' )
+  define V  as ( test hop 2 'c' )
+  define W  as ( test hop 2 not 's' not 'u' )
+  define X  as ( test hop 2 'l' or 'i' or ('e' next 'u') )
+  define Y  as ( test hop 2 'in' )
+  define Z  as ( test hop 2 not 'f' )
+  define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
+                                    'es' 't' ) )
+  define BB as ( test hop 3 not 'met' not 'ryst' )
+  define CC as ( test hop 2 'l' )
+
+
+  /* The system of endings, as given in Appendix A. */
+
+  define endings as (
+    [substring] among(
+    'alistically' B 'arizability' A 'izationally' B
+
+     'antialness' A  'arisations' A  'arizations' A  'entialness' A
+
+      'allically' C   'antaneous' A   'antiality' A   'arisation' A
+      'arization' A   'ationally' B   'ativeness' A   'eableness' E
+      'entations' A   'entiality' A   'entialize' A   'entiation' A
+      'ionalness' A   'istically' A   'itousness' A   'izability' A
+      'izational' A
+
+       'ableness' A    'arizable' A    'entation' A    'entially' A
+       'eousness' A    'ibleness' A    'icalness' A    'ionalism' A
+       'ionality' A    'ionalize' A    'iousness' A    'izations' A
+       'lessness' A
+
+        'ability' A     'aically' A     'alistic' B     'alities' A
+        'ariness' E     'aristic' A     'arizing' A     'ateness' A
+        'atingly' A     'ational' B     'atively' A     'ativism' A
+        'elihood' E     'encible' A     'entally' A     'entials' A
+        'entiate' A     'entness' A     'fulness' A     'ibility' A
+        'icalism' A     'icalist' A     'icality' A     'icalize' A
+        'ication' G     'icianry' A     'ination' A     'ingness' A
+        'ionally' A     'isation' A     'ishness' A     'istical' A
+        'iteness' A     'iveness' A     'ivistic' A     'ivities' A
+        'ization' F     'izement' A     'oidally' A     'ousness' A
+
+         'aceous' A      'acious' B      'action' G      'alness' A
+         'ancial' A      'ancies' A      'ancing' B      'ariser' A
+         'arized' A      'arizer' A      'atable' A      'ations' B
+         'atives' A      'eature' Z      'efully' A      'encies' A
+         'encing' A      'ential' A      'enting' C      'entist' A
+         'eously' A      'ialist' A      'iality' A      'ialize' A
+         'ically' A      'icance' A      'icians' A      'icists' A
+         'ifully' A      'ionals' A      'ionate' D      'ioning' A
+         'ionist' A      'iously' A      'istics' A      'izable' E
+         'lessly' A      'nesses' A      'oidism' A
+
+          'acies' A       'acity' A       'aging' B       'aical' A
+          'alist' A       'alism' B       'ality' A       'alize' A
+          'allic'BB       'anced' B       'ances' B       'antic' C
+          'arial' A       'aries' A       'arily' A       'arity' B
+          'arize' A       'aroid' A       'ately' A       'ating' I
+          'ation' B       'ative' A       'ators' A       'atory' A
+          'ature' E       'early' Y       'ehood' A       'eless' A
+          'elity' A       'ement' A       'enced' A       'ences' A
+          'eness' E       'ening' E       'ental' A       'ented' C
+          'ently' A       'fully' A       'ially' A       'icant' A
+          'ician' A       'icide' A       'icism' A       'icist' A
+          'icity' A       'idine' I       'iedly' A       'ihood' A
+          'inate' A       'iness' A       'ingly' B       'inism' J
+          'inity'CC       'ional' A       'ioned' A       'ished' A
+          'istic' A       'ities' A       'itous' A       'ively' A
+          'ivity' A       'izers' F       'izing' F       'oidal' A
+          'oides' A       'otide' A       'ously' A
+
+           'able' A        'ably' A        'ages' B        'ally' B
+           'ance' B        'ancy' B        'ants' B        'aric' A
+           'arly' K        'ated' I        'ates' A        'atic' B
+           'ator' A        'ealy' Y        'edly' E        'eful' A
+           'eity' A        'ence' A        'ency' A        'ened' E
+           'enly' E        'eous' A        'hood' A        'ials' A
+           'ians' A        'ible' A        'ibly' A        'ical' A
+           'ides' L        'iers' A        'iful' A        'ines' M
+           'ings' N        'ions' B        'ious' A        'isms' B
+           'ists' A        'itic' H        'ized' F        'izer' F
+           'less' A        'lily' A        'ness' A        'ogen' A
+           'ward' A        'wise' A        'ying' B        'yish' A
+
+            'acy' A         'age' B         'aic' A         'als'BB
+            'ant' B         'ars' O         'ary' F         'ata' A
+            'ate' A         'eal' Y         'ear' Y         'ely' E
+            'ene' E         'ent' C         'ery' E         'ese' A
+            'ful' A         'ial' A         'ian' A         'ics' A
+            'ide' L         'ied' A         'ier' A         'ies' P
+            'ily' A         'ine' M         'ing' N         'ion' Q
+            'ish' C         'ism' B         'ist' A         'ite'AA
+            'ity' A         'ium' A         'ive' A         'ize' F
+            'oid' A         'one' R         'ous' A
+
+             'ae' A          'al'BB          'ar' X          'as' B
+             'ed' E          'en' F          'es' E          'ia' A
+             'ic' A          'is' A          'ly' B          'on' S
+             'or' T          'um' U          'us' V          'yl' R
+           '{'}s' A        's{'}' A
+
+              'a' A           'e' A           'i' A           'o' A
+              's' W           'y' B
+
+        (delete)
+    )
+  )
+
+  /* Undoubling is rule 1 of appendix C. */
+
+  define undouble as (
+    test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss'
+                          'tt')
+    /* If the stem now ends in one of these doubled letters, delete the
+       final letter, leaving the single form. */
+    [next] delete
+  )
+
+  /* The other appendix C rules can be done together. */
+
+  define respell as (
+    /* Each entry rewrites a stem-final string; 'not X' tests inspect the
+       letter immediately preceding the matched string (backwards mode). */
+    [substring] among (
+      'iev'  (<-'ief')
+      'uct'  (<-'uc')
+      'umpt' (<-'um')
+      'rpt'  (<-'rb')
+      'urs'  (<-'ur')
+      'istr' (<-'ister')
+      'metr' (<-'meter')
+      'olv'  (<-'olut')
+      'ul'   (not 'a' not 'i' not 'o' <-'l')
+      'bex'  (<-'bic')
+      'dex'  (<-'dic')
+      'pex'  (<-'pic')
+      'tex'  (<-'tic')
+      'ax'   (<-'ac')
+      'ex'   (<-'ec')
+      'ix'   (<-'ic')
+      'lux'  (<-'luc')
+      'uad'  (<-'uas')
+      'vad'  (<-'vas')
+      'cid'  (<-'cis')
+      'lid'  (<-'lis')
+      'erid' (<-'eris')
+      'pand' (<-'pans')
+      'end'  (not 's' <-'ens')
+      'ond'  (<-'ons')
+      'lud'  (<-'lus')
+      'rud'  (<-'rus')
+      'her'  (not 'p' not 't' <-'hes')
+      'mit'  (<-'mis')
+      'ent'  (not 'm' <-'ens')
+        /* 'ent' was 'end' in the 1968 paper - a typo. */
+      'ert'  (<-'ers')
+      'et'   (not 'n' <-'es')
+      'yt'   (<-'ys')
+      'yz'   (<-'ys')
+    )
+  )
+)
+
+define stem as (
+
+  /* Remove an ending, undouble a final consonant, then respell --
+     all three stages scan from the end of the word. */
+  backwards (
+    do endings
+    do undouble
+    do respell
+  )
+)
+
+ + +

Appendix 2

+ +

+The Porter stemmer, cast, as far as is possible, into Lovins form. +

+ +
integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+    endings respell
+    shortv
+    undouble
+    A B C D E F G H I J
+)
+
+externals ( stem )
+
+groupings ( v v_WXY )
+
+define v        'aeiouy'
+define v_WXY    v + 'wxY'
+
+backwardmode (
+
+    /* True (tested backwards) when the word ends consonant-vowel-consonant
+       with the final consonant not w, x or Y (cf. Porter's *o condition). */
+    define shortv as ( non-v_WXY v non-v )
+
+    define undouble as (
+        /* Remove the second letter of a final doubled consonant. */
+        among ('bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt')
+        and ([next] delete)
+    )
+
+    /* Removal conditions in the Lovins style.  A succeeds when the cursor
+       is at or beyond p1, B likewise for p2; the remaining conditions also
+       inspect the letters adjacent to the candidate ending. */
+    define A as $p1 <= cursor
+    define B as $p2 <= cursor
+    define C as (B 's' or 't')
+    define D as (B not 'm')
+    define E as ('e' or 'ous' A)
+
+    define F as ('ss' or 'i')
+    define G as not 's'
+    define H as gopast v
+    define I as (not 'e' gopast v)
+    define J as ('ee' A)
+
+    define endings as (
+
+        [substring] among (
+
+            'ed'   I
+            'ing'  H
+            'ings' H
+
+                   (delete
+                    undouble or (atmark p1  test shortv  <+ 'e')
+                   )
+
+            'd'    J
+            'es'   F
+            's'    G
+
+            /* 'get' textually includes the list of 181 endings reproduced
+               in Appendix 3.  NOTE(review): the path is machine-specific. */
+            get '/home/martin/Snowball/festschrift/endings'
+                   (delete)
+        )
+    )
+
+    define respell as (
+        /* Respelling rules: each entry rewrites or removes a stem-final
+           string, conditional on the A/B region tests defined above. */
+        [substring] among (
+            'e'     (B or (A not shortv) delete)
+            'l'     (B 'l' delete)
+            'enci'
+            'ency'  ((A <- 'enc') or <- 'enci')
+            'anci'
+            'ancy'  ((A <- 'anc') or <- 'anci')
+            'ally'  ((A <- 'al') or <- 'alli')
+            'ently' ((A <- 'ent') or <- 'entli')
+            'ator'  (A <- 'at')
+
+            'logi'
+            'logy'  ((A <- 'log') or <- 'logi')
+            'bli'
+            'bly'   ((A <- 'bl') or <- 'bli')
+            'bil'   (v A <- 'bl')
+            'y' 'Y'
+                    (gopast v  <-'i')
+
+        )
+    )
+)
+
+define stem as (
+    /* Words of fewer than 3 letters are left untouched. */
+    test hop 3
+    /* Flag y occurring initially or after a vowel as Y, so it is not
+       treated as a member of the vowel group v. */
+    unset Y_found
+    do ( ['y'] <-'Y' set Y_found)
+    do repeat(goto (v ['y']) <-'Y' set Y_found)
+
+    /* p1 and p2 mark the ends of the first and second
+       vowel...non-vowel sequences. */
+    $p1 = limit
+    $p2 = limit
+    do(
+        gopast v  gopast non-v  setmark p1
+        gopast v  gopast non-v  setmark p2
+    )
+
+    backwards (
+        do endings
+        do respell
+    )
+    /* Turn any remaining Y back into y. */
+    do(Y_found  repeat(goto (['Y']) <-'y'))
+
+)
+
+ + +

Appendix 3

+ +

+The list of 181 endings included by the get + + + directive in the program +of Appendix 2. The numbers to the right show their frequency of occurrence +in the sample vocabulary. The 75 rare endings are shown commented out. +

+ +
    'abilities'     B /*   (3) */
+    'ability'       B /*  (14) */
+    'able'          B /* (293) */
+    'ables'         B /*   (4) */
+    'ably'          B /*  (68) */
+    'al'            B /* (285) */
+    'alism'         B /*   (5) */
+//  'alisms'        B /*   (-) */
+    'alities'       B /*   (7) */
+    'ality'         B /*  (24) */
+    'alization'     B /*   (1) */
+//  'alizationed'   B /*   (-) */
+//  'alizationing'  B /*   (-) */
+//  'alizations'    B /*   (-) */
+    'alize'         B /*   (2) */
+    'alized'        B /*   (4) */
+//  'alizer'        B /*   (-) */
+//  'alizered'      B /*   (-) */
+//  'alizering'     B /*   (-) */
+//  'alizers'       B /*   (-) */
+//  'alizes'        B /*   (-) */
+//  'alizing'       B /*   (-) */
+    'ally'          B /*  (78) */
+    'alness'        B /*   (2) */
+//  'alnesses'      B /*   (-) */
+    'als'           B /*  (46) */
+    'ance'          B /*  (93) */
+    'ances'         B /*  (30) */
+    'ancies'        B /*   (2) */
+    'ancy'          B /*  (18) */
+    'ant'           B /*  (92) */
+    'ants'          B /*  (29) */
+    'ate'           B /* (261) */
+    'ated'          B /* (208) */
+    'ately'         B /*  (38) */
+    'ates'          B /*  (73) */
+    'ating'         B /* (119) */
+    'ation'         B /* (356) */
+    'ational'       B /*   (4) */
+//  'ationalism'    B /*   (-) */
+//  'ationalisms'   B /*   (-) */
+//  'ationalities'  B /*   (-) */
+//  'ationality'    B /*   (-) */
+//  'ationalize'    B /*   (-) */
+//  'ationalized'   B /*   (-) */
+//  'ationalizes'   B /*   (-) */
+//  'ationalizing'  B /*   (-) */
+    'ationally'     B /*   (2) */
+//  'ationalness'   B /*   (-) */
+//  'ationalnesses' B /*   (-) */
+//  'ationals'      B /*   (-) */
+//  'ationed'       B /*   (-) */
+//  'ationing'      B /*   (-) */
+    'ations'        B /* (139) */
+    'ative'         B /*  (40) */
+    'atively'       B /*   (4) */
+//  'ativeness'     B /*   (-) */
+//  'ativenesses'   B /*   (-) */
+    'atives'        B /*   (7) */
+//  'ativities'     B /*   (-) */
+//  'ativity'       B /*   (-) */
+    'ator'          B /*  (25) */
+    'ators'         B /*  (10) */
+    'ement'         B /*  (70) */
+//  'emently'       B /*   (-) */
+    'ements'        B /*  (31) */
+    'ence'          B /* (100) */
+    'ences'         B /*  (25) */
+    'encies'        B /*   (9) */
+    'ency'          B /*  (41) */
+    'ent'           D /* (154) */
+    'ently'         D /*  (53) */
+    'ents'          D /*  (25) */
+    'er'            B /* (613) */
+    'ered'          B /*  (44) */
+    'ering'         B /*  (31) */
+    'ers'           B /* (281) */
+    'ful'           A /* (163) */
+    'fulness'       A /*  (31) */
+//  'fulnesses'     A /*   (-) */
+    'fuls'          A /*   (5) */
+    'ibilities'     B /*   (2) */
+    'ibility'       B /*  (10) */
+    'ible'          B /*  (53) */
+    'ibles'         B /*   (2) */
+    'ibly'          B /*  (14) */
+    'ic'            B /* (142) */
+    'ical'          B /*  (91) */
+//  'icalism'       B /*   (-) */
+//  'icalisms'      B /*   (-) */
+//  'icalities'     B /*   (-) */
+    'icality'       B /*   (1) */
+//  'icalize'       B /*   (-) */
+//  'icalized'      B /*   (-) */
+//  'icalizer'      B /*   (-) */
+//  'icalizered'    B /*   (-) */
+//  'icalizering'   B /*   (-) */
+//  'icalizers'     B /*   (-) */
+//  'icalizes'      B /*   (-) */
+//  'icalizing'     B /*   (-) */
+    'ically'        B /*  (59) */
+//  'icalness'      B /*   (-) */
+//  'icalnesses'    B /*   (-) */
+    'icals'         B /*   (2) */
+    'icate'         B /*   (9) */
+    'icated'        B /*   (7) */
+//  'icately'       B /*   (-) */
+    'icates'        B /*   (4) */
+    'icating'       B /*   (3) */
+    'ication'       B /*  (23) */
+//  'icational'     B /*   (-) */
+//  'icationals'    B /*   (-) */
+//  'icationed'     B /*   (-) */
+//  'icationing'    B /*   (-) */
+    'ications'      B /*   (8) */
+    'icative'       B /*   (2) */
+//  'icatively'     B /*   (-) */
+//  'icativeness'   B /*   (-) */
+//  'icativenesses' B /*   (-) */
+//  'icatives'      B /*   (-) */
+//  'icativities'   B /*   (-) */
+//  'icativity'     B /*   (-) */
+    'icities'       B /*   (1) */
+    'icity'         B /*   (5) */
+    'ics'           B /*  (21) */
+    'ion'           C /* (383) */
+    'ional'         C /*  (18) */
+//  'ionalism'      C /*   (-) */
+//  'ionalisms'     C /*   (-) */
+    'ionalities'    C /*   (1) */
+    'ionality'      C /*   (1) */
+//  'ionalize'      C /*   (-) */
+//  'ionalized'     C /*   (-) */
+//  'ionalizer'     C /*   (-) */
+//  'ionalizered'   C /*   (-) */
+//  'ionalizering'  C /*   (-) */
+//  'ionalizers'    C /*   (-) */
+//  'ionalizes'     C /*   (-) */
+//  'ionalizing'    C /*   (-) */
+    'ionally'       C /*  (12) */
+    'ionalness'     C /*   (1) */
+//  'ionalnesses'   C /*   (-) */
+    'ionals'        C /*   (1) */
+    'ioned'         C /*  (13) */
+    'ioning'        C /*   (3) */
+    'ions'          C /* (192) */
+    'ism'           B /*  (33) */
+    'isms'          B /*   (5) */
+    'ities'         B /*  (62) */
+    'ity'           B /* (236) */
+    'ive'           B /* (132) */
+    'ively'         B /*  (34) */
+    'iveness'       B /*  (14) */
+//  'ivenesses'     B /*   (-) */
+    'ives'          B /*  (12) */
+//  'ivities'       B /*   (-) */
+    'ivity'         B /*   (1) */
+    'ization'       B /*   (4) */
+//  'izational'     B /*   (-) */
+//  'izationals'    B /*   (-) */
+//  'izationed'     B /*   (-) */
+//  'izationing'    B /*   (-) */
+    'izations'      B /*   (1) */
+    'ize'           B /*  (32) */
+    'ized'          B /*  (32) */
+    'izer'          B /*   (3) */
+//  'izered'        B /*   (-) */
+//  'izering'       B /*   (-) */
+    'izers'         B /*   (1) */
+    'izes'          B /*   (6) */
+    'izing'         B /*  (30) */
+    'ly'            E /* (135) */
+    'ment'          B /* (105) */
+//  'mently'        B /*   (-) */
+    'ments'         B /*  (50) */
+    'ness'          A /* (428) */
+    'nesses'        A /*  (21) */
+    'ous'           B /* (340) */
+    'ously'         B /* (130) */
+    'ousness'       B /*  (22) */
+//  'ousnesses'     B /*   (-) */
+
+ + +

Appendix 4

+ +

+An ANSI C program which will generate on  stdout  the raw ending list +(endings with condition letters) from which the list of Appendix 3 is +constructed. +

+ +
#include <stdio.h>
+#include <stdlib.h>
+
+static char * p;       /* buffer holding the ending under construction */
+static int k = 0;      /* index of the terminating NUL in p, i.e. strlen(p);
+                          was 'static k = 0;' -- implicit int, invalid since C99 */
+static int depth;      /* current nesting level of suffix composition */
+
+/* Append string s to the buffer p at offset i; update k to the new length
+   and return it.  The original listing had no return statement, which is
+   undefined behavior since callers use the result (i = add("al", i)). */
+static int add(char * s, int i)
+{   int j = 0;
+    int ch;
+    while ((ch = s[j]) != 0) {
+        p[i] = ch;
+        j++; i++;
+    }
+    p[i] = 0; k = i;
+    return k;
+}
+
+/* Print the ending in p with its condition letter (code 1 -> A, 2 -> B, ...). */
+static void w(int code) { printf("'%s' %c\n", p, 'A' - 1 + code); }
+
+/* Print the ending and its plural: final y -> ies, final s -> ses,
+   otherwise append s.  The buffer is restored before returning. */
+static void wn(int code)
+{   w(code);
+    {   int ch = p[k - 1];
+        if (ch == 'y') p[k - 1] = 'i';
+        printf("'%s", p);
+        if (ch == 'y' || ch == 's') printf("e");
+        printf("s' %c\n", 'A' - 1 + code);
+        p[k - 1] = ch;
+    }
+}
+
+/* As wn, but also print the -ed and -ing forms (dropping a final e first). */
+static void wv(int code)
+{   wn(code);
+    {   int ch = p[k - 1];
+        if (ch == 'e') p[k - 1] = 0;
+        printf("'%sed' %c\n", p, 'A' - 1 + code);
+        printf("'%sing' %c\n", p, 'A' - 1 + code);
+        p[k - 1] = ch;
+    }
+}
+
+/* Apply generator 'gen' at offset i unless already nested 3 deep, capping
+   suffix composition.  The parameter was declared 'void (*gen)()' -- an
+   empty (obsolescent) parameter list; under C23 rules () means (void), so
+   the two-argument call would no longer compile.  All gen_* functions
+   have the (int, int) signature. */
+static void f(void (*gen)(int i, int code), int i, int code)
+{   if (depth > 2) return;
+    depth++; gen(i, code); depth--;
+}
+
+static void gen_ize(int i, int code);
+static void gen_ism(int i, int code);
+static void gen_ity(int i, int code);
+static void gen_ly(int i, int code);
+static void gen_ness(int i, int code);
+static void gen_ic(int i, int code);
+static void gen_ate(int i, int code);
+static void gen_ive(int i, int code);
+static void gen_tion(int i, int code);
+
+/* 'al' endings plus derived forms via ize/ism/ity/ly/ness generators. */
+static void gen_al(int i, int code)
+{   i = add("al", i); wn(code);
+    f(gen_ize, i, code); f(gen_ism, i, code); f(gen_ity, i, code); f(gen_ly, i, code);
+    f(gen_ness, i, code);
+}
+
+/* 'ance' and its plural, plus 'ancy' (the y overwrites the final e). */
+static void gen_ance(int i, int code)
+{   i = add("ance", i); wn(code);
+    add("y", i - 1); wn(code);
+}
+
+/* 'ence' and its plural, plus 'ency'. */
+static void gen_ence(int i, int code)
+{   i = add("ence", i); wn(code);
+    add("y", i - 1); wn(code);
+}
+
+/* 'er' with plural and verbal forms (ers, ered, ering). */
+static void gen_er(int i, int code)  {  add("er", i); wv(code); }
+
+/* 'ic'/'ics' plus icate, icity and ical derivatives. */
+static void gen_ic(int i, int code)
+{   i = add("ic", i); wn(code);
+    f(gen_ate, i, code); f(gen_ity, i, code); f(gen_al, i, code);
+}
+
+/* 'able'/'ables' plus ability and ably. */
+static void gen_able(int i, int code)
+{   add("able", i); wn(code);
+    add("abil", i); f(gen_ity, i + 4, code);
+    add("ab", i); f(gen_ly, i + 2, code);
+}
+
+/* 'ible'/'ibles' plus ibility and ibly. */
+static void gen_ible(int i, int code)
+{   add("ible", i); wn(code);
+    add("ibil", i); f(gen_ity, i + 4, code);
+    add("ib", i); f(gen_ly, i + 2, code);
+}
+
+/* 'ant'/'ants' ('antly' deliberately suppressed below). */
+static void gen_ant(int i, int code)
+{   add("ant", i); wn(code);
+/*  f(gen_ly, i, code);  */
+}
+
+/* 'ement'/'ements' plus 'emently'. */
+static void gen_ement(int i, int code)
+{   i = add("ement", i); wn(code);
+    f(gen_ly, i, code);
+}
+
+/* 'ment'/'ments' plus 'mently'. */
+static void gen_ment(int i, int code)
+{   i = add("ment", i); wn(code);
+    f(gen_ly, i, code);
+}
+
+/* 'ent'/'ents' plus 'ently'. */
+static void gen_ent(int i, int code)
+{   i = add("ent", i); wn(code);
+    f(gen_ly, i, code);
+}
+
+/* 'ism'/'isms'. */
+static void gen_ism(int i, int code)  {  add("ism", i); wn(code); }
+
+/* 'ate' with verbal forms, plus ately, ative and ation derivatives. */
+static void gen_ate(int i, int code)
+{   add("ate", i); wv(code);
+    f(gen_ly, i + 3, code); f(gen_ive, i + 2, code); f(gen_tion, i + 1, code);
+}
+
+/* 'ator'/'ators'. */
+static void gen_ator(int i, int code)  {  add("ator", i); wn(code); }
+
+/* 'ful' plus 'fulness' ('fully' deliberately suppressed below). */
+static void gen_ful(int i, int code)
+{   i = add("ful", i); w(code);
+    f(gen_ness, i, code);
+/*  f(gen_ly, i, code);  */
+}
+
+/* Simple terminal endings. */
+static void gen_ly(int i, int code)  {  add("ly", i); w(code); }
+static void gen_ness(int i, int code)  {  add("ness", i); wn(code); }
+static void gen_ity(int i, int code)  {  add("ity", i); wn(code); }
+
+/* 'ous' plus 'ously' and 'ousness'. */
+static void gen_ous(int i, int code)
+{   i = add("ous", i); w(code);
+    f(gen_ly, i, code); f(gen_ness, i, code);
+}
+
+/* 'ive'/'ives' plus iveness, ively and ivity (the 'e' is overwritten). */
+static void gen_ive(int i, int code)
+{   i = add("ive", i); wn(code);
+    f(gen_ness, i, code);
+    f(gen_ly, i, code);
+    f(gen_ity, i - 1, code);
+}
+
+/* 'ize' with verbal forms, plus izer and ization (via a + tion). */
+static void gen_ize(int i, int code)
+{   i = add("ize", i); wv(code);
+    f(gen_er, i - 1, code);
+    add("a", i - 1);
+    depth ++; f(gen_tion, i, code); depth--;
+}
+
+/* 'tion' ('ion' at the start position) with verbal forms plus ional. */
+static void gen_tion(int i, int code)
+{   i = add(i == 0 ? "ion" : "tion", i); wv(code);
+    f(gen_al, i, code);
+}
+
+
+/* Generate the raw ending list on stdout.  The integer argument is the
+   condition code: 1 -> A, 2 -> B, 3 -> C, 4 -> D, 5 -> E (see w above).
+   The original listing used implicit-int 'main()' (invalid since C99)
+   and did not check the malloc result. */
+int main(void)
+{
+    p = malloc(100);
+    if (p == NULL) return 1;   /* out of memory */
+    depth = 0;
+
+    gen_al(0, 2);
+    gen_ance(0, 2);
+    gen_ence(0, 2);
+    gen_er(0, 2);
+    gen_ic(0, 2);
+    gen_able(0, 2);
+    gen_ible(0, 2);
+    gen_ant(0, 2);
+    gen_ement(0, 2);
+    gen_ment(0, 2);
+    gen_ent(0, 4);
+    gen_ism(0, 2);
+    gen_ate(0, 2); gen_ator(0, 2);
+    gen_ful(0, 1);
+    gen_ly(0, 5);
+    gen_ness(0, 1);
+    gen_ity(0, 2);
+    gen_ous(0, 2);
+    gen_ive(0, 2);
+    gen_ize(0, 2);
+    gen_tion(0, 3);
+    free(p);
+    return 0;
+}
+
+ +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/lovins/festschrift.tt b/algorithms/lovins/festschrift.tt new file mode 100644 index 0000000..b461436 --- /dev/null +++ b/algorithms/lovins/festschrift.tt @@ -0,0 +1,1740 @@ +[% header('Lovins revisited') %] + +

+This is a revised version of Martin Porter’s paper which was published as part +of the Karen Sparck Jones Festschrift of 2005. +

+ +

+Charting a New Course: Progress in Natural Language Processing and +Information Retrieval: A Festschrift for Professor Karen Sparck Jones, edited +by John Tait, Amsterdam: Kluwer, 2005. +

+ +

Lovins Revisited

+ +

+Martin Porter, December 2001 (revised November 2008). +

+ +

Abstract

+
+ The Lovins stemming algorithm for English is analysed, and compared + with the Porter stemming algorithm, using Snowball, a language designed + specifically for the development of stemming algorithms. It is shown + how the algorithms manage to function in a similar way, while appearing + to be quite different. The Porter algorithm is recoded in the style of + the Lovins algorithm, which leads to the discovery of a few possible + improvements. +
+ +

Preamble

+ +

+This is a festschrift paper, so I am allowed to begin on a personal note. +In 1979 I was working with Keith van Rijsbergen and Stephen Robertson on a +British Library funded IR project to investigate the selection of good +index terms, and one of the things we found ourselves having to do was to +establish a document test collection from some raw data that had been sent +to us on a magnetic tape by Peter Vaswani of the National Physical +Laboratory. I was the tame programmer in the project, so it was my job to +set up the test collection. +

+ +

+On the whole it did not prove too difficult. The data we received was a +collection of about 11,000 documents (titles and short abstracts), 93 +queries — in a free text form, and relevance judgements. All the text was +in upper case without punctuation, and there were one or two marker +characters to act as field terminators. By modern standards the data was +really very small indeed, but at the time it was considerably larger than +any of the other test collections we had. What you had to do was to cast it +into a standard form +for experimental work. You represented terms and documents by numbers, and +created flat files in text form corresponding to the queries, relevance +assessments, and term to document index. One process however was less +straightforward. On their way to becoming numeric terms, the words of the +source text were put through a process of linguistic normalisation called +suffix stripping, in which certain derivational and inflectional suffixes +attached to the words were removed. There was a standard piece of software +used in Cambridge at that time to do this, written in 1971 by Keith +Andrews (Andrews, 1971) as part of a Diploma Project. +One of the courses in +Cambridge is the one year post-graduate Diploma in Computer Science. Each +student on the course is required to do a special project, which includes +writing a significant piece of software — significant in the sense of being +both useful and substantial. +Keith's piece of software was more useful than most, and it continued to be +used as a suffix stripping program, or stemmer, for many years after it was +written. +

+ +

+Now by an odd chance I was privy to much of Keith Andrews’ original +thinking at the time that he was doing the work. The reason for this was +that in 1971 I was looking for a house in Cambridge, and the base I was +operating from was a sleeping bag on the living room floor of an old friend +called John Dawson, who was Keith’s diploma supervisor. Keith used to come round +and discuss stemming algorithms with him, while I formed a mute audience. I +learnt about the Lovins stemming algorithm of 1968 (Lovins, 1968), +and must I think have +at least looked at her paper then, since I know it was not new to me when I +saw it again in 1979. Their view of Lovins’ work was that it did not go far +enough. There needed to be many more suffixes, and more complex rules to +determine the criteria for their removal. Much of their discussion was +about new suffixes to add to the list, and removal rules. It was interesting +therefore to find myself needing to use Andrews’ work eight years later, +and questioning some of its assumptions. Did you need that many suffixes? +Did the rules need to be so complicated? Perhaps one would do better to +break composite suffixes into smaller units and remove them piecemeal. +And perhaps syllables would be a better count of stem length than letters. +So I wrote my own stemmer, which became known as the Porter stemmer, and +which was published in 1980 (Porter, 1980). +

+ +

+I must explain where Karen Sparck Jones fits into all of this. Keith +Andrews’ piece of work was originally suggested by Karen as a Diploma +student project, and she was able to use the Andrews stemmer in her IR +experiments throughout the seventies. In 1979 however Karen had moved much +more into the field of Natural Language Processing and Artificial +Intelligence, and by then had two or three research students in that field +just writing up their PhDs (only one of whom I really got to know — John +Tait, the editor of this volume). So we were in contact, but not working +together. That again was an odd chance: that Karen had been my research +supervisor in a topic other than IR, and that when later I was doing IR +research at Cambridge I was not working with Karen. While I was engaged on +writing the stemmer, Karen showed some justifiable irritation that I had +become interested in a topic so very remote from the one for which we had +received the British Library funding. Nevertheless, she came into my room +one day, said, ‘Look, if you're getting interested in stemming, you’d +better read this,’ and handed me the 1968 issue of Mechanical +Translation that contains the Lovins paper. I still have this issue with +Karen’s name across the top. (And I hope she didn't expect it back!) +

+ +

+Another 20 years have gone by, and I have been studying the Lovins stemmer
+again, really because I was looking for examples to code up in Snowball, a
+small string processing language I devised in the latter half of 2001
+particularly adapted for writing stemming algorithms. Lovins’ stemmer
+strikes me now as a fine piece of work, for which she never quite received
+the credit she deserved. It was the first stemmer for English set out as
+an algorithm that described the stemming process exactly. She explained
+how it was intended to be used to improve IR performance, in just the way
+in which stemmers are used today. It is not seriously short of suffixes:
+the outstanding omissions are the plural forms ements and ents
+corresponding to her ement and ent, and it is easy enough to add
+them into the definition. It performs well in practice. In fact it is
+still in use, and can be downloaded in various languages from the net (1).
+The tendency since 1980 has been to attach the name ‘Porter’ to any
+language stemming process that does not use a dictionary, even when it is
+quite dissimilar to the original Porter stemmer (witness the Dutch Porter
+stemmer of Kraaij and Pohlmann (2) (Kraaij, 1994 and Kraaij, 1995)), but
+the priority really belongs to Lovins. It also has one clear advantage
+over the Porter algorithm, in that it involves fewer steps. Coded up well,
+it should run a lot faster.

+ +

+A number of things intrigued me. Why are the Lovins and Porter stemmers so +different, when what they do looks so similar? Could the stemmer, in some +sense, be brought up-to-date? Could the Porter stemmer be cast into the +Lovins form, and so run faster? +

+ +

+This paper is about the answers to these questions. In discovering them, I
+have learned a lot more about my own stemmer.

+ +

Why stem?

+ +

+It may be worth saying a little on what stemming is all about. We can imagine +a document with the title, +

+ +
+ Pre-raphaelitism: A Study of Four Critical Approaches +
+ +

+and a query, containing the words +

+ +
+ PRE-RAPHAELITE CRITICISM +
+ +

+We want to match query against title so that ‘Pre-raphaelitism’ matches +‘PRE-RAPHAELITE’ and ‘Critical’ matches ‘CRITICISM’. This leads to the +idea of removing endings from words as part of the process of extracting index +terms from documents, a similar process of ending removal being applied to +queries prior to the match. For example, we would like to remove the endings +from +

+ +
+critical
+critically
+criticism
+criticisms
+critics
+
+ +

+so that each word is reduced to ‘critic’. This is the stem, from which the +other words are formed, so the process as a whole is called stemming. It is +a feature of English morphology that the part of the word we want to remove is +at the end — the suffix. But the same is broadly true of French, German and other +languages of the Indo-European group. It is also true of numerous languages +outside Indo-European, Finnish for example, although there is a +boundary beyond which it is not true. So Chinese, where words are simple +units without affixes, and Arabic, where the stem is modified by +prefixes and infixes as well as suffixes, lie outside the +boundary. As an IR technique it therefore has wide applicability. In developing +stemmers two points were recognised quite early on. One is that the +morphological regularities that you find in English (or other languages) mean +that you can attempt to do stemming by a purely algorithmic process. Endings +al, ally, ism etc. occur throughout English vocabulary, and are +easy to detect and remove: you don’t need access to an on-line dictionary. The +other is that the morphological irregularities of English set a limit to the +success of an algorithmic approach. Syntactically, what look like endings may +not be endings (offspring is not offspr + ing), and the list of +endings seems to extend indefinitely (trapez-oid, likeli-hood, +guardian-ship, Tibet-an, juven-ilia, Roman-esque, ox-en +...) It is difficult to gauge where to set the cut-off for these rarer forms. +Semantically, the addition of a suffix may alter the meaning of a word a +little, a lot, or completely, and morphology alone cannot measure the degree of +change (prove and provable have closely related meanings; probe and +probable do not.) This meant that stemming, if employed at all, became the +most challenging, and the most difficult part of the indexing process. +

+ +

+In the seventies, stemming might be applied as part of the process of +establishing a test collection, and when it was there would not usually be any +attempt to make the stemming process well-defined, or easily repeatable by +another researcher. This was really because the basis for experiment replication +was the normalised data that came out of the stemming process, rather than the +source data plus a description of stemming procedures. Stemming tended to be +applied, and then forgotten about. But by the 1980s, stemming itself was being +investigated. Lennon and others (Lennon, 1981) found no substantial differences +between the use of different stemmers for English. Harman (Harman, 1991) +challenged the effectiveness of stemming altogether, when she reported no +substantial differences between using and not using stemming in a series of +experiments. But later work has been more positive. Krovetz (Krovetz, 1995), for example, +reported small but significant improvements with stemming over a range of test +collections. +

+ +

+Of course, all these experiments assume some IR model which will use stemming in
+a particular way, and will measure just those features that test collections
+are, notoriously, able to measure. We might imagine an IR system where the users
+have been educated in the advantages and disadvantages to be expected from
+stemming, and are able to flag individual search terms to say whether or not
+they are to be used stemmed or unstemmed. Stemming sometimes improves,
+occasionally degrades, search performance, and this would be the best way of
+using it as an IR facility. Again stemming helps regularise the IR vocabulary,
+which is very useful when preparing a list of terms to present to a user as
+candidates for query expansion. But this advantage too is difficult to quantify.

+ +

+An evaluative comparison between the Lovins and later stemmers lies in any case +outside the scope of this paper, but it is important to +bear in mind that it is not a straightforward undertaking. +

+ +

The Lovins Stemmer

+ +

+Structurally, the Lovins stemmer is in four parts, collected together in +four Appendices A, B, C and D in her paper. Part A is a list of 294 +endings, each with a letter which identifies a condition for whether or +not the ending should be removed. (I will follow Lovins in using ‘ending’ +rather than ‘suffix’ as a name for the items on the list.) +Part A therefore looks like this: +

+ +
+ .11.
+ alistically   B
+ arizability   A
+ izationally   B
+ .10.
+ antialness   A
+ arisations   A
+ arizations   A
+ entialness   A
+ .09.
+ allically   C
+ antaneous   A
+ antiality   A
+ . . .
+
+ .01.
+ a   A
+ e   A
+ i   A
+ o   A
+ s   W
+ y   B +
+ +

+Endings are banked by length, from 11 letters down to 1. Each bank is tried +in turn until an ending is found which matches the end of the word to be +stemmed and leaves a stem which satisfies the given condition, when the +ending is removed. For example condition C says that the stem must have at +least 4 letters, so bimetallically would lose allically leaving a +stem bimet of length 5, but metallically would not reduce to +met, since its length is only 3. +

+ +

+There are 29 such conditions, called A to Z, AA, BB and CC, and they +constitute part B of the stemmer. Here they are (* stands for any letter): +

+ +
+ +
A No restrictions on stem +
B Minimum stem length = 3 +
C Minimum stem length = 4 +
D Minimum stem length = 5 +
E Do not remove ending after e +
F Minimum stem length = 3 and do not remove ending after e +
G Minimum stem length = 3 and remove ending only after f +
H Remove ending only after t or ll +
I Do not remove ending after o or e +
J Do not remove ending after a or e +
K Minimum stem length = 3 and remove ending only after l, i or +u*e +
L Do not remove ending after u, x or s, unless s follows +o +
M Do not remove ending after a, c, e or m +
N Minimum stem length = 4 after s**, elsewhere = 3 +
O Remove ending only after l or i +
P Do not remove ending after c +
Q Minimum stem length = 3 and do not remove ending after l or +n +
R Remove ending only after n or r +
S Remove ending only after dr or t, unless t follows t +
T Remove ending only after s or t, unless t follows o +
U Remove ending only after l, m, n or r +
V Remove ending only after c +
W Do not remove ending after s or u +
X Remove ending only after l, i or u*e +
Y Remove ending only after in +
Z Do not remove ending after f +
AA Remove ending only after d, f, ph, th, l, er, or, es or t +
BB Minimum stem length = 3 and do not remove ending after met or +ryst +
CC Remove ending only after l +
+
+ +

+There is an implicit assumption in each condition, A included, that the minimum +stem length is 2. +

+ +

+This is much less complicated than it seems at first. Conditions A to D +depend on a simple measure of minimum stem length, and E and F are slight +variants of A and B. Out of the 294 endings, 259 use one of these +6 conditions. The remaining 35 endings use the other 23 conditions, so +conditions G, H ... CC have less than 2 suffixes each, on average. What is +happening here is that Lovins is trying to capture a rule which gives a +good removal criterion for one ending, or a small number of similar +endings. She does not explain the thinking behind the conditions, but it is +often not too difficult to reconstruct. Here for example are the last few +conditions with their endings, +

+ +
+ +Y (early,   ealy,   eal,   ear). collinearly, multilinear are +stemmed.
+ +Z (eature). misfeature does not lose eature.
+ +AA (ite). acolouthite, hemimorphite lose ite, ignite and +requite retain it.
+ +BB (allic,   als,   al). Words ending metal, crystal retain +al.
+ +CC (inity). crystallinitycrystall, but affinity, +infinity are unaltered. + +
+ +

+Part C of the Lovins stemmer is a set of 35 transformation rules used to +adjust the letters at the end of the stem. These rules are invoked after the +stemming step proper, irrespective of whether an ending was actually +removed. Here are about half of them, with examples to show the type of +transformation intended (letters in square brackets indicate the full form +of the words), +

+ +
+ +
1) bb →     b rubb[ing] → rub +
ll → l controll[ed] → control +
mm → m trimm[ed] → trim +
rr → r abhorr[ing] → abhor +
2) iev → ief believ[e] → belief +
3) uct → uc induct[ion] → induc[e] +
4) umpt → um consumpt[ion] → consum[e] +
5) rpt → rb absorpt[ion] → absorb +
6) urs → ur recurs[ive] → recur +
7a) metr → meter     parametr[ic] → paramet[er] +
8) olv → olut dissolv[ed] → dissolut[ion] +
11) dex → dic index → indic[es] +
16) ix → ic matrix → matric[es] +
18) uad → uas persuad[e] → persuas[ion] +
19) vad → vas evad[e] → evas[ion] +
20) cid → cis decid[e] → decis[ion] +
21) lid → lis elid[e] → elis[ion] +
31) ert → ers convert[ed] → convers[ion] +
33) yt → ys analytic → analysis +
34) yz → ys analyzed → analysed +
+
+ +

+Finally, part D suggests certain relaxed matching rules between query terms +and index terms when the stemmer has been used to set up an IR system, but +we can regard that as not being part of the stemmer proper. +

+ +

The Lovins stemmer in Snowball

+ +

+Snowball is a string processing language designed with the idea of making +the definition of stemming algorithms much more rigorous. The Snowball +compiler translates a Snowball script into a thread-safe ANSI C module, +where speed of execution is a major design consideration. The resulting +stemmers are pleasantly fast, and will process one million or so words a +second on a high-performance modern PC. The Snowball website (3) gives a +full description of the language, and also presents stemmers for a range of +natural languages. Each stemmer is written out as a formal algorithm, with +the corresponding Snowball script following. The algorithm definition acts +as program comment for the Snowball script, and the Snowball script gives a +precise definition to the algorithm. The ANSI C code with the +same functionality can also be inspected, and sample vocabularies in source +and stemmed form can be used for test purposes. +An essential function of +the Snowball script is therefore comprehensibility — it should be fully understood +by the reader of the script, and Snowball has been designed with this in mind. +It contrasts interestingly in this respect with a system like Perl. +Perl has a very big definition. Writing your own scripts in Perl is easy, +after the initial learning hurdle, but understanding other scripts can be +quite hard. The size of the language means that there are many different +ways of doing the same thing, which gives programmers the opportunity of +developing highly idiosyncratic styles. Snowball has a small, tight +definition. Writing Snowball is much less easy than writing Perl, but on +the other hand once it is written it is fairly easy to understand +(or at least one hopes that it is). This is +illustrated by the Lovins stemmer in Snowball, which is given in Appendix +1. There is a very easy and natural correspondence +between the different parts of the stemmer definition in Lovins' original +paper and their Snowball equivalents. 
+For example, the Lovins conditions A, B ... CC code up very neatly +into routines with the same name. Taking condition L, +

+ +
+ L   Do not remove ending after u, x or s, unless s follows + o +
+ +

+corresponds to +

+ +[% highlight(" + define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) +") %] + +


+When  L  is called, we are at the right end of the stem, moving left towards the
front of the word. Each Lovins condition has an implicit test for a stem of
length 2, and this is done by [% highlight_inline('test hop 2') %], which sees if it is possible to
hop two places left. If it is not, the routine immediately returns with a
false signal, otherwise it carries on. It tests that the character at the
right hand end is not u, and also not x, and also not s following a letter
which is not o. This is equivalent to the Lovins condition. Here is not of
course the place to give the exact semantics, but you can quickly get
the feel of the language by comparing the 29 Lovins conditions with their
Snowball definitions.

+ +

+Something must be said about the [% highlight_inline('among') %] feature of Snowball however, +since this is central to the efficient implementation of stemmers. It is +also the one part of Snowball that requires just a little effort to +understand. +

+ +

+At its simplest, [% highlight_inline('among') %] can be used to test for alternative strings. The +[% highlight_inline('among') %]s used in the definition of condition AA and the  undouble +routine have this form. In Snowball you can write +

+ +[% highlight(" + 'sh' or 's' or 't' 'o' or 'i' 'p' +") %] + +

+which will match the various forms shop, ship, sop, sip, top, tip. The +order is important, because if [% highlight_inline("'sh'") %] and [% highlight_inline("'s'") %] are swapped over, the +[% highlight_inline("'s'") %] would match the first letter of ship, while [% highlight_inline("'o'") %] or [% highlight_inline("'i'") %] +would fail to match with the following [% highlight_inline("'h'") %] — in other words the pattern +matching has no backtracking. But it can also be written as +

+ +[% highlight(" + among('sh' 's' 't') among('i' 'o') 'p' +") %] + +

+The order of the strings in each [% highlight_inline('among') %] is not important, because the
match will be with the longest of all the strings that can match. In
Snowball the implementation of [% highlight_inline('among') %] is based on the binary-chop idea,
but has been carefully optimised. For example, in the Lovins stemmer, the
main [% highlight_inline('among') %] in the  endings  routine has 294 different strings of average
length 5.2 characters. A search for an ending involves accessing a number
of characters within these 294 strings. The order is going to be
K·log₂294, or 8.2K, where K is a number that one hopes will
be small, although one must certainly expect it to be greater than 1. It
turns out that, for the successive words of a standard test vocabulary,
K averages to 1.6, so for each word there are about 13 character
comparisons needed to determine whether it has one of the Lovins endings.

+ +

+Each string in an [% highlight_inline('among') %] construction can be followed by a routine name. The +routine returns a true/false signal, and then the [% highlight_inline('among') %] searches for the +longest substring whose associated routine gives a true signal. A string not +followed by a routine name can be thought of as a string which is associated +with a routine that does nothing except give a true signal. This is the way +that the [% highlight_inline('among') %] in the  endings  routine works, where indeed every string is +followed by a routine name. +

+ +

+More generally, lists of strings in the [% highlight_inline('among') %] construction can be followed +by bracketed commands, which are obeyed if one of the strings in the list is +picked out for the longest match. The syntax is then +

+ +
+    among( S11 S12 ... (C1)
+           S21 S22 ... (C2)
+           ...
+
+           Sn1 Sn2 ... (Cn)
+         )
+
+ +

+where the  Sij  are strings, optionally followed by their routine names, +and the  Ci  are Snowball command sequences. The semantics is a bit +like a switch in C, where the switch is on a string rather than a numerical +value: +

+ +
+    switch(...) {
+        case S11: case S12: ... C1; break;
+        case S21: case S22: ... C2; break;
+        ...
+
+        case Sn1: case Sn2: ... Cn; break;
+    }
+
+ +

+The [% highlight_inline('among') %] in the  respell  routine has this form. +

+ +

+The full form however is to use [% highlight_inline('among') %] with a preceding [% highlight_inline('substring') %], with +[% highlight_inline('substring') %] and [% highlight_inline('among') %] possibly separated by further commands. +[% highlight_inline('substring') %] +triggers the test for the longest matching substring, and the [% highlight_inline('among') %] then +causes the corresponding bracketed command to be obeyed. At a simple +level this can be used to cut down the size of the code, in that +

+ +
+    substring C among( S11 S12 ... (C1)
+                       S21 S22 ... (C2)
+                       ...
+
+                       Sn1 Sn2 ... (Cn)
+                     )
+
+ +

+is a shorter form of +

+ +
+    among( S11 S12 ... (C C1)
+           S21 S22 ... (C C2)
+           ...
+
+           Sn1 Sn2 ... (C Cn)
+         )
+
+ +

+More importantly, [% highlight_inline('substring') %] and [% highlight_inline('among') %] can work in different contexts. For +example, [% highlight_inline('substring') %] could be used to test for the longest string, matching from +right to left, while the commands in the [% highlight_inline('among') %] could operate in a left to +right direction. In the Lovins stemmer, [% highlight_inline('substring') %] is used in this style: +

+ +[% highlight(" + [substring] among ( ... ) +") %] + +

+The two square brackets are in fact individual commands, so before the [% highlight_inline('among') %] +come three commands. [% highlight_inline('[') %] sets a lower marker, [% highlight_inline('substring') %] is obeyed, searching +for the strings in the following among, and then [% highlight_inline(']') %] sets an upper marker. +The region between the lower and upper markers is called the slice, and this +may subsequently be copied, replaced or deleted. +

+ +

+It was possible to get the Lovins stemmer working in Snowball very quickly. +The Sourceforge versions (1) could be used to get the long list of endings and +to help with the debugging. There was however one problem, that rules 24 and +30 of part C conflicted. They are given as +

+ +
+ 24) end → ens except following s
+ ...
+ 30) end → ens except following m +
+ +

+This had not been noticed in the Sourceforge implementations, but +immediately gave rise to a compilation error in Snowball. Experience +suggested that I was very unlikely to get this problem resolved. Only a few +months before, I had hit a point in a stemming algorithm where +something did not quite make sense. The algorithm had been published just a +few years ago, and contacting one at least of the authors was quite easy. +But I never sorted it out. The author I traced was not au fait +with the linguistic background, and the language expert had been swallowed +up in the wilds of America. So what chance would I have here? Even if I was +able to contact Lovins, it seemed to me inconceivable that she would have +any memory of, or even interest in, a tiny problem in a paper which she +published 33 years ago. But the spirit of academic enquiry forced me to +venture the attempt. After pursuing a number of red-herrings, email contact +was finally made. +

+ +

+Her reply was a most pleasant surprise. + +

+

+ ... The explanation is both mundane and exciting. You have just found + a typo in the MT article, which I was unaware of all these years, and I + suspect has puzzled a lot of other people too. The original paper, an + MIT-published memorandum from June 1968, has rule 30 as +

+ +

+       ent → ens except following m +

+ +

+ and that is undoubtedly what it should be ... +

+
+ +

An analysis of the Lovins stemmer

+ +

+It is very important in understanding the Lovins stemmer to know something +of the IR background of the late sixties. In the first place there was an +assumption that IR was all, or mainly, about the retrieval of +technical scientific papers, and research projects were set up accordingly. +I remember being shown, in about 1968, a graph illustrating the +‘information explosion’, as it was understood at the time, which showed +just the rate of growth of publications of scientific papers in various +different domains over the previous 10 or 20 years. Computing resources +were very precious, and they could not be wasted by setting up IR systems +for information that was, by comparison, merely frivolous (articles in +popular magazines, say). And even in 1980, when I was working in IR, the +data I was using came from the familiar, and narrow, scientific domain. +Lovins was working with Project Intrex (Overhage, 1966), where the data came from +papers in materials science and engineering. +

+ +

+Secondly, the idea of indexing on every word in a document, or even looking +at every word before deciding whether or not to put it into an index, would +have seemed quite impractical, even though it might have been recognised as +theoretically best. In the first place, the computing resources necessary to +store and analyse complete documents in machine readable form were absent, and in the +second, the rigidities of the printing industry almost guaranteed that one +would never get access to them. +A stemmer, therefore, would be seen as something not +applied to general text but to certain special words, and in the case of the +Lovins stemmer, the plan was to apply it to the subject terms that were used +to categorize each document. Subsequently it would be used with each word +in a query, where it +was hoped that the vocabulary of the queries would match the vocabulary of +the catalogue of subject terms. +

+ +

+This accounts for: — +

+ +
    +
  1. The emphasis on the scientific vocabulary. This can be seen in the
endings, which include oidal, on, oid, ide, for words like colloidal,
proton, spheroid, nucleotide. It can be seen in the transformation rules,
with their concern for Greek sis and Latin ix suffixes. And also it can be
seen in the word samples of the paper (magnesia, magnesite, magnesian,
magnesium, magnet, magnetic, magneto etc. of Fig. 2).
  2. + +
  3. The slight shortage of plural forms. The subject terms would naturally +have been mainly in the singular, and one might also expect the same of +query terms. +
  4. + +
  5. The surprising shortness of the allowed minimum stems — usually 2 +letters. A controlled technical vocabulary will contain longish words, and +the problem of minimum stem lengths only shows up with shorter words. +
  6. +
+ +

+If we take a fairly ordinary vocabulary of modern English, derived from +non-scientific writing, it is interesting to see how much of the Lovins +stemmer does not actually get used. We use vocabulary V, derived from a +sample of modern texts from Project Gutenberg (4). V can be inspected +at (5). It contains 29,401 words, and begins +

+ a   aback   abandon   abandoned   abandoning   abandonment   + abandons   abasement   abashed   abate   abated ... +
+We find that 22,311, or about 76%, of the words in V have one of the
294 endings removed if passed through the Lovins stemmer. Of this 76%, over a
half (55%) of the removals are done by just six of the endings, the breakdown
being,
+ s (13%)   ed (12%)   e (10%)   ing (10%)   es (6%)   y (4%) +
+If, on the other hand, you look at the least frequent endings, 51% of them +do only 1.4% of the removals. So of the ones removed, half the endings in +V +correspond to 2% of the endings in the stemmer, and 1.4% of the endings in +V +correspond to half the endings in the stemmer. In fact 62 of the endings +(about a fifth) do not lead to any ending removals in V at all. These are +made up of the rarer ‘scientific’ endings, such as aroid and oidal, and +long endings, such as alistically and entiality. +

+ +

+This helps explain why the Porter and Lovins stemmers behave in a fairly +similar way despite the fact that they look completely different — it is +because most of the work is being done in just a small part of the stemmer, +and in that part there is a lot of overlap. Porter and Lovins stem 64% of +the words in V identically, which is quite high. (By contrast, an +erroneous but plausibly written Perl script +advertised on the Web as an implementation of the Porter stemmer +still proves to stem only 86% of the words in V +to the same forms that are produced by the Porter stemmer.) +

+ +

+A feature of the Lovins stemmer that is worth looking at in some detail is +the transformation rules. People who come to the problem of stemming for +the first time usually devote a lot of mental energy to the issue of +morphological irregularity which they are trying to address. +

+ +


+A good starting point is the verbs of English. Although grammatically
complex, the morphological forms of the English verb are few, and are
illustrated by the pattern harm, harms, harming, harmed, where the basic
verb form adds s, ing and ed to make the other three forms. There are
certain special rules: to add s to a verb ending ss an e is inserted,
so pass becomes passes, and adding ed and ing replaces a final e of
the verb (love to loving), and can cause consonant doubling (hop to
hopped), but
apart from this all verbs in the language follow the basic pattern with the
exception of a finite class of irregular verbs.
In a regular verb, the addition of ed to the basic verb creates both the
past form (‘I harmed’) and the p.p. (past participle) form (‘I have
harmed’). An irregular verb, such as ring, forms its past in some other
way (‘I rang’), and may have a distinct p.p. (‘I have rung’).
The irregular verbs have a
different past form, and sometimes a separate p.p. form.
It is easy to think up more examples,


+
stem past p.p. +
+
ring rang rung +
rise rose risen +
sleep slept slept +
fight       fought       fought +
come came come +
go went gone +
hit hit hit +
+ +How many of these verbs are there altogether? On 20 Jan 2000, in order to +test the hypothesis that the number is consistently over-estimated, I asked +this question in a carefully worded email to a mixed group of +about 50 +well-educated +work colleagues (business rather than academic people). Ten of them replied, +and here are the +guesses they made: + +
+ 20,   25,   25,   50,   180,   200,   426,   25000,   10%,   20% +
+ +The last two numbers mean 10% and 20% of all English verbs. +My hypothesis was of course wrong. The truth is that most people have no +idea at all how many irregular verbs there are in English. +In +fact there are around 135 (see section 3.3 of Palmer, 1965). +If a stemming algorithm handles suffix removal +of all regular verbs correctly, the question arises as to whether it is +worth making it do the same for the irregular forms. Conflating fought and +fight, for example, could be useful in IR queries about boxing. It seems +easy: you make a list of the irregular verbs and create a mapping of the +past and p.p. forms to the main form. We can call the process +English verb respelling. But when you try it, numerous problems arise. Are +forsake, beseech, cleave really verbs of contemporary English? If so, what +is the p.p. of cleave? +Or take the verb stride, which is common enough. What is its p.p.? My +Concise Oxford English Dictionary says it is stridden (6), but have we ever +heard this word used? (‘I have stridden across the paving.’) +

+ +

+To compose a realistic list for English verb respelling we therefore need to +judge word rarity. But among the commoner verb forms even greater problems +arise because of their use as homonyms. A rose is a type of flower, so +is it wise +to conflate rose and rise? Is it wise to conflate +saw and see when saw can mean a cutting instrument? +

+ +

+We suddenly get to +the edge of what it is useful to include in a stemming algorithm. So long as +a stemming algorithm is built around general rules, the full impact of the +stemmer on a vocabulary need not be studied too closely. It is sufficient to +know that the stemmer, judiciously used, improves retrieval performance. But +when we look at its effect on individual words these issues can no longer be +ignored. To build even a short list of words into a stemmer for special +treatment takes us into the area of the dictionary-based stemmer, and the +problem of determining, for a pair of related words in the dictionary, a +measure of semantic similarity which tells us whether or not the words +should be conflated together. +

+ +

+About half the transformation rules in the Lovins stemmer deal with a +problem which is similar to that posed by the irregular verbs of English, +and which ultimately goes back to the irregular forms of second conjugation +verbs in Latin. We can call it Latin verb respelling. Verbs like +induce, consume, commit are perfectly regular in modern English, but +the adjectival and noun forms induction, consumptive, commission that +derive from them correspond to p.p. forms in Latin. +You can see the descendants of these Latin irregularities +in modern Italian, which has commettere with p.p. +commesso, like our commit and commission, and scendere with +p.p. sceso like our ascend and ascension (although scendere +means ‘to go down’ rather than ‘to go up’). +

+ +

+Latin verb respelling often seems to be more the territory of a stemmer than +English verb respelling, presumably because Latin verb irregularities +correspond to consonantal changes at the end of the stem, where the +stemmer naturally operates, while English verb irregularities more often +correspond to vowel changes in the middle. Lovins was no doubt +particularly interested in Latin verb respelling because so many of the +words affected have scientific usages. +

+ +

+We can judge that Latin verb respellings constitute a small set because the +number of second conjugation verbs of Latin form a small, fixed set. Again, +looking at Italian, a modern list of irregular verbs contains 150 basic forms +(nearly all of them second conjugation), not unlike the number of forms in +English. Extra verbs are formed with prefixes. Corresponding English words +that exhibit the Latin verb respelling problem +will be a subset of this system. In fact we +can offer a Snowball script that does the Latin verb respelling with more +care. It should be invoked, in the Porter stemmer, after removal of ive or +ion endings only, + +[% highlight(" +define prefix as ( + + among ( + + 'a' 'ab' 'ad' 'al' 'ap' 'col' 'com' 'con' 'cor' 'de' + 'di' 'dis' 'e' 'ex' 'in' 'inter' 'o' 'ob' 'oc' 'of' + 'per' 'pre' 'pro' 're' 'se' 'sub' 'suc' 'trans' + ) atlimit +) + +define second_conjugation_form as ( + + [substring] prefix among ( + + 'cept' (<-'ceiv') //-e con de re + 'cess' (<-'ced') //-e con ex inter pre re se suc + 'cis' (<-'cid') //-e de (20) + 'clus' (<-'clud') //-e con ex in oc (26) + 'curs' (<-'cur') // re (6) + 'dempt' (<-'deem') // re + 'duct' (<-'duc') //-e de in re pro (3) + 'fens' (<-'fend') // de of + 'hes' (<-'her') //-e ad (28) + 'lis' (<-'lid') //-e e col (21) + 'lus' (<-'lud') //-e al de e + 'miss' (<-'mit') // ad com o per re sub trans (29) + 'pans' (<-'pand') // ex (23) + 'plos' (<-'plod') //-e ex + 'prehens' (<-'prehend') // ap com + 'ris' (<-'rid') //-e de (22) + 'ros' (<-'rod') //-e cor e + 'scens' (<-'scend') // a + 'script' (<-'scrib') //-e de in pro + 'solut' (<-'solv') //-e dis re (8) + 'sorpt' (<-'sorb') // ab (5) + 'spons' (<-'spond') // re (25) + 'sumpt' (<-'sum') // con pre re (4) + 'suas' (<-'suad') //-e dis per (18) + 'tens' (<-'tend') // ex in pre (24) + 'trus' (<-'trud') //-e ob (27) + 'vas' (<-'vad') //-e e (19) + 'vers' (<-'vert') // con in re (31) + 'vis' (<-'vid') //-e di pro + ) +) +") %] + +This means that if suas, for 
example, is preceded by one of the strings +in [% highlight_inline('prefix') %], and there is nothing more before the prefix string (which is +what the +[% highlight_inline('atlimit') %] +command tests), it is replaced by suad. So dissuas(ion) goes to +dissuad(e) +and persuas(ive) to persuad(e). Of course, asuas(ion), absuas(ion), +adsuas(ion) and so on would get the same treatment, but not being words of +English that does not really matter. The corresponding Lovins rules are +shown in brackets. +This is not quite the end +of the story, however, because the Latin forms ex + cedere (‘go +beyond’) pro + cedere (‘go forth’), and sub + cedere +(‘go after’) give rise to verbs which, +by an oddity of English orthography, have an extra letter e: exceed, proceed, +succeed. They can be sorted out in a final respelling step: +

+ +[% highlight(" +define final_respell as ( + + [substring] atlimit among( + + 'exced' (<-'exceed') + 'proced' (<-'proceed') + 'succed' (<-'succeed') + /* extra forms here perhaps */ + ) +) +") %] + +

+As you might expect, close inspection of this process creates doubts in +the same way as for English verb respelling. (Should we really conflate +commission and commit? etc.) +

+ +

+The other transformation rules are concerned with unusual plurals, mainly +of Latin or Greek origin, er and re differences, as in parameter and +parametric, and the sis/tic connection of certain words of Greek origin: +analysis/analytic, paralysis/paralytic ... (rule 33), and +hypothesis/hypothetic, kinesis/kinetic ... (rule 32). Again, these +irregularities might be tackled by forming explicit word lists. Certainly +rule 30, given as, +

+ +
+ ent → ens except following m, +
+ +

+goes somewhat wild when given a general English vocabulary (dent becomes +dens for example), although it is the only rule that might be said to +have a damaging effect. +

+ +

A Lovins shape for the Porter stemmer

+ +

+The 1980 paper (Porter, 1980) may be said to define the ‘pure’ Porter stemmer. +The stemmer distributed at (7) can be called the ‘real’ Porter +stemmer, and differs from the pure stemmer in three small respects, which +are carefully explained. This disparity does not require much excuse, +since the oldest traceable encodings of the stemmer have always contained +these differences. There is also a revised stemmer for English, called +‘Porter2’ and still subject to slight changes. Unless otherwise stated, +it is the real Porter stemmer which is being studied below. +

+ +


+The Porter stemmer differs from the Lovins stemmer in a number of
respects. In the first place, it only takes account of fairly common
features of English. So rare suffixes are not included, and there is no
equivalent of Lovins’ transformation rules, other than her rule (1), the
undoubling of terminal double letters. Secondly, it removes suffixes only
when the residual stem is fairly substantial. Some suffixes are removed
only when at least one syllable is left, and most are removed only when at least two
syllables are left. (One might say that this is based on a guess about the
way in which the meaning of a stem is related to its length in syllables (8).)
The Porter stemmer is therefore ‘conservative’ in its removal
of suffixes, or at least that is how it has often been described. Thirdly,
it removes suffixes in a series of steps, often reducing a compound suffix
to its first part, so a step might reduce ibility to ible, where
ibility is thought of as being ible + ity. Although the
description of the whole stemmer is a bit complicated, the total number of
suffixes is quite small — about 60.

+ +

+The Porter stemmer has five basic steps. Step 1 removes an +inflectional suffix. There are only three of these: ed and ing, which are +verbal, and s, which is verbal (he sings), plural (the songs) or possessive +(the horses’ hooves), although the rule for s removal is the same in all +three cases. Step 1 may also restore an e (hoping → hope), undouble a +double letter pair (hopping → hop), or change y to i (poppy → +poppi, to match with poppies → poppi.) Steps 2 to 4 remove derivational +suffixes. So +ibility may reduce to ible in step 2, and ible itself may be removed in step +4. Step 5 is for removing final e, and undoubling ll. +

+ +

+A clear advantage of the Lovins stemmer over the Porter stemmer is speed. +The Porter stemmer has five steps of suffix removal to the Lovins stemmer’s +one. It is instructive therefore to try and cast the Porter stemmer into +the shape of the Lovins stemmer, if only for the promise of certain speed +advantages. As we will see, we learn a few other things from the exercise +as well. +

+ +

+First we need a list of endings. The Lovins endings were built up by hand, +but we can construct a set of endings for the Porter stemmer by writing an +ending generator that follows the algorithm definition. From an analysis of +the suffixes in steps 2 to 4 of the Porter stemmer we can construct +the following diagram: +

+ +Diagram showing ending combinations for the Porter stemmer + +

+This is not meant to be a linguistic analysis of the suffix structure of +English, but is merely intended to show how the system of endings works in +the stemming algorithm. Suffixes combine if their boxes are connected by +an arrow. So ful combines with ness to make fulness. + +

+ ful + ness → fulness +
+ +The combination is not always a concatenation of the strings +however, for we have, + +
+ able + ity → ability
+ able + ly → ably
+ ate + ion → ation
+ ible + ity → ibility
+ ible + ly → ibly
+ ize + ate + ion → ization +
+ +The path from ize to ion goes via ate, so we can form ization, but there is +no suffix izate. Three of the suffixes, ator, ance and ence, do not connect +into the rest of the diagram, and ance, ence also appear in the forms +ancy, ency. The letter to the left of the box is going to be the +condition for the +removal of the suffix in the box, so + +
+      B +-------+ n
+        |  ism  |
+        +-------+
+
+ +means that ism will be removed if it follows a stem that satisfies +condition B. On the right of the box is either n, v or hyphen. n means the +suffix is of noun type. So if a word ends ism it is a noun. v means verb +type. hyphen means neither: ly (adverbial) and ful, ous (adjectival) are of +this type. If a suffix is a noun type it can have a plural form (criticism, +criticisms), so we have to generate isms as well as ism. Again, the +combining is not just concatenation, + +
+ ity + s → ities
+ ness + s → nesses +
+ +If a suffix has v type, it has s, ed and ing forms, + +
+ ize + s → izes
+ ize + ed → ized
+ ize + ing → izing +
+ +Type v therefore includes type n, and we should read this type as ‘verb or +noun’, rather than just ‘verb’. For example, condition, with suffix ion, is +both verb (‘They have been conditioned to behave like that’) and noun +(‘It is subject to certain conditions’). +

+ +

+The diagram is therefore a scheme for generating combined derivational +suffixes, each combination possibly terminated with an inflectional suffix. +A problem is that it contains a loop in + +

+ ize → ate → ion → al → ize → ... +
+ +suggesting suffixes of the form izationalizational... We break the loop by +limiting the number of joined derivational suffixes of diagram 1 to four. +(Behaviour of the Porter stemmer shows that removal of five combined +derivation suffixes is never desirable, even supposing five ever combine.) +We can then generate 181 endings, with their removal codes. But 75 of these +suffixes do not occur as endings in V, and they can be eliminated as rare +forms, leaving 106. Alphabetically, the endings begin, + +
+ abilities   ability   able   ables   ably   al   alism   + (alisms)   alities   ality   alization   (alizationed)   + (alizationing)   (alizations)   alize   alized   (alizer)   + (alizered)   (alizering)   (alizers)   (alizes)   (alizing)   + ally   alness   (alnesses)   als   ance   ances   ancies   + ancy ... +
+ +The eliminated rare forms are shown bracketed. +

+ +

+The 106 endings are arranged in a file as a list of strings followed by +condition letter, + +

+    'abilities'     B
+    'ability'       B
+    'able'          B
+    'ables'         B
+    'ably'          B
+    'al'            B
+    ....
+
+ +This ending list is generated by running the ANSI C program shown in +Appendix 4, and line-sorting the result into a file, +and this file is called in by the [% highlight_inline('get') %] directive in the Snowball script of +Appendix 2, which is the Porter stemming algorithm laid out in the style of +the Lovins algorithm. In fact, precise equivalence cannot be achieved, but +in V only 137 words stem differently, which is 0.4% of V. There are 10 +removal conditions, compared with Lovins’ 29, and 11 transformation or +respelling rules, compared with Lovins’ 35. We can describe the process in +Lovins style, once we have got over a few preliminaries. +

+ +

+We have to distinguish y as a vowel from y as a consonant. We treat initial +y, and y before vowel, as a consonant, and make it upper case. Thereafter +a, e, i, o, u and y are vowels, and the other lower case letters and Y are +consonants. If [C] stands for zero or more consonants, C for one or more +consonants, and V for one or more vowels, then a stem of shape [C]VC has +length 1s (1 syllable), of shape [C]VCVC length 2s, and so on. +

+ +

+A stem ends with a short vowel if the ending has the form cvx, where c is a +consonant, v a vowel, and x a consonant other than w, x or Y. +(Short vowel endings with ed and ing imply loss of an e from +the stem, as in removing = remove + ing.) +

+ +

+Here are the removal conditions, +

+ +
+ +
A   Minimum stem length = 1s +
B Minimum stem length = 2s +
C Minimum stem length = 2s and remove ending only after s or t +
D Minimum stem length = 2s and do not remove ending after m +
E Remove ending only after e or ous after minimum stem length 1s +
F Remove ending only after ss or i +
G Do not remove ending after s +
H Remove ending only if stem contains a vowel +
I Remove ending only if stem contains a vowel and does not end in e +
J Remove ending only after ee after minimum stem length 1s +
+
+ +

+In condition J the stem must end ee, and the part of the stem before the +ee must have minimum length 1s. Condition E is similar. +

+ +

+Here are the respelling rules, defined with the help of the removal +conditions. In each case, the stem being tested does not include the string +at the end which has been identified for respelling. + +

+
1)   Remove e if A, or if B and the stem does not end with a short vowel +
2) Remove l if B and the stem ends with l +
3) enci/encyenc if A, otherwise → enci +
4) anci/ancyanc if A, otherwise → anci +
5) allyal if A, otherwise → alli +
6) entlyent if A, otherwise → entli +
7) atorat if A +
8) logi/logylog if A, otherwise → log +
9) bli/blybl if A, otherwise → bli +
10) bilbl if stem ends vowel after A +
11) y/Yi if stem contains a vowel +
+ +The 106 endings are distributed among conditions A to E as A(5), B(87), +C(8), D(3) and E(1). F to J deal with the purely inflectional endings: F +with es, G with s, H with ing and ings, I with ed and J with d. +There is however one point at which the Lovins structure breaks down, in that +removal of ed and ing(s) after conditions I and H requires a special +adjustment that cannot be left to a separate transformation rule. It is to +undouble the last letter, and to restore a final e if the stem has length 1s +and ends with a short vowel (so shopping loses a p and becomes shop, +sloping gains an e and becomes slope.) +

+ +

+The Porter stemmer cast into this form runs significantly faster than the +multi-stage stemmer — about twice as fast in tests with Snowball. +

+ +

+We will call the Porter stemmer P, the Lovins stemmer L, and this Lovins +version of the Porter stemmer LP. As we have said, P and LP are not identical, +but stem 137 of the 29,401 words of V differently. +

+ +

+A major cause of difference is unexpected suffix combinations. These can be +subdivided into combinations of what seem to be suffixes but are not, and +rare combinations of valid suffixes. +

+ +

+The first case is illustrated by the word disenchanted. P stems this to +disench, first taking off suffix ed, and then removing ant, which is +a suffix in English, although not a suffix in this word. P also stems +disenchant to disench, so the two words disenchant and +disenchanted are conflated by P, even though they make an error in the +stemming process. But ant is a noun type suffix, and so does not combine +with ed. anted is therefore omitted from the suffix list of LP, so LP +stems disenchanted to disenchant, but disenchant to disench. +

+ +

+This illustrates a frequently encountered problem in stemming. S1 +and S2 are suffixes of a language, but the combination +S1S2 is +not. A word has the form xS1, where x is some string, but in +xS1, S1 is not actually a suffix, but part of the stem. +S2 is a valid suffix for this word, so xS1S2 is +another word in the language. An algorithmic stemmer stems xS1 to +x in error. If presented with xS1S2 it can either +(a) stem it to xS1, knowing S1 cannot be a suffix in +this context, or (b) stem it to x, ignoring the knowledge to be +derived from the presence of S2. (a) gives the correct stemming +of at least xS1S2, although the stemming of xS1 +will be wrong, while (b) overstems both words, but at least achieves +their conflation. In other words (a) fails to conflate the two forms, but +may achieve correct conflations of xS1S2 with similar forms +xS1S3, xS1S4 etc., while (b) conflates +the two forms, but at the risk of additional false conflations. Often a study +of the results of a stemming strategy on a sample vocabulary leads one to +prefer approach (b) to (a) for certain classes of ending. This is +true in particular of the inflectional endings of English, which is why the +removals in step 1 of P are not remembered in some state variable, which +records whether the ending just removed is verb-type, noun-or-verb-type etc. +On balance you get better results by throwing that information away, and then +the many word pairs on the pattern of disenchant / disenchanted will +conflate together. +

+ +

+Other examples from V can be given: in misrepresenting, ent is
+not a suffix, and enting not a valid suffix combination; in
+witnessed, ness is not a suffix, and nessed not a valid
+suffix combination.

+ +

+This highlights a disadvantage of stemmers that work with a fixed list of +endings. To get the flexibility of context-free ending removal, we need to +build in extra endings which are not grammatically correct (like anted = +ant + ed), and this adds considerably to the burden of constructing +the list. In fact L does not include anted, but it does include for +example antic (ant + ic), which may be serving a similar +purpose. +

+ +

+For the second case, the rare combinations of valid suffixes, one may instance +ableness. Here again the multi-step stemmer makes life easier. P removes +ness in step 3 and able in step 4, but without making any necessary +connection. L has ableness as an ending, dictionaries contain many +ableness words, and it is an easy matter to make the connection across from +able to ness in diagram 1 and generate extra endings. Nevertheless the +ending is very rare in actual use. For example, Dickens’ Nicholas Nickleby +contains no examples, Bleak House contains two, in the same sentence: +

+ +
+ I was sure you would feel it yourself and would excuse the + reasonableness of MY feelings when coupled with the known + excitableness of my little woman. +
+ +

+reasonableness is perhaps the commonest word in English of this form, and +excitableness (instead of excitability) is there for contrast. Thackeray’s +Vanity Fair, a major source in testing out P and Porter2, contains one +word of this form, charitableness. One may say of this word that it is +inevitably rare, because it has no really distinct +meaning from the simpler charity, but that it has to be formed by adding +ableness rather than ability, because the repeated ity in charity + +ability is morphologically unacceptable. Other rare combinations are +ateness, entness +and eds (as in intendeds and beloveds). +fuls is another interesting case. The ful suffix, usually adjectival, +can sometimes create nouns, giving plurals such as mouthfuls and +spoonfuls. But in longer words sful is a more ‘elegant’ plural +(handbagsful, dessertspoonsful). +

+ +

+These account for most of the differences, but there are a few others. +

+ +

+One is in forms like bricklayers → bricklai (P), bricklay (LP).
+Terminal y is usefully turned to i to help conflate words where y is changed
+to i and es added to form the plural, but this does not happen when
+y
+follows a vowel. LP improves on P here, but the Porter2 algorithm makes the
+same improvement, so we have nothing to learn.
+There is also a difference in words ending lle or lles,
+quadrille → quadril (P), quadrill (LP). This is because e and
+l
+removal are successive in step 5 of P, and done as alternatives in the
+respelling rules
+of LP. In LP this is not quite correct, since
+Lovins makes it clear that her transformation rules should be
+applied in succession. Even so, LP seems better than P, suggesting
+that step 5b of P (undouble l) should not have been attempted after e removal
+in step 5a. So here is a possible small improvement to Porter2. Another
+small, but quite interesting difference, is the condition attached to the
+ative ending. The ending generator makes B the removal condition by a
+natural process, but in P its removal condition is A. This goes back to step
+3 as originally presented in the paper of 1980:


+ (m>0) ICATE → IC
+ (m>0) ATIVE →
+ (m>0) ALIZE → AL
+ (m>0) ICITI → IC
+ (m>0) ICAL → IC
+ (m>0) FUL →
+ (m>0) NESS → +
+(m>0) corresponds to A. With removal condition B, the second line would be + +
+ (m>1) ATIVE → +
+
+which looks slightly incongruous. Nevertheless it is probably correct, because we
+remove a half suffix from icate, alize, icity and ical when the stem
+length is at least 1s, and so we should remove the full ate + ive suffix when the stem
+length is at least 2s. We should not be influenced by ful and ness.
+They are ‘native English’ stems, unlike the other five, which
+have a ‘Romance’ origin, and for these two condition A has been found to
+be more appropriate. In fact putting in this adjustment to Porter2 results in an
+improvement in the small class of words thereby affected.

+ +

Conclusion

+ +

+You never learn all there is to know about a computer program, unless the +program is really very simple. So even after 20 years of regular use, +we can learn something new about P by creating LP and comparing the +two. And in the process we learn a lot about L, the Lovins stemmer itself. +

+ +

+The truth is that the main motivation for studying L was to see how well the
+Snowball system could be used for implementing and analyzing Lovins’
+original work, and the interest in what she had actually achieved in 1968
+only came later. I hope that this short account helps clarify her work, and
+place it in the context of the development of stemmers since then.

+ +

Notes

+ +

+The http addresses below have a ‘last visited’ date of December 2001. +

+ +
    +
  1. The Lovins stemmer is available at +

    + + +
  2. + +
  3. See  http://www-uilots.let.uu.nl/~uplift/
  4. + +
  5. See  http://snowball.sourceforge.net
  6. + +
  7. See  http://promo.net/pg/
  8. + +
  9. See  http://snowball.sourceforge.net/english/voc.txt
  10. + +
  11. In looking at verbs with the pattern ride, rode, ridden, Palmer, +1965, notes that ‘we should perhaps add STRIDE, with past tense strode, +but without a past participle (there is no *stridden).’
  12. + +
  13. See  https://tartarus.org/~martin/PorterStemmer/
  14. + +
  15. Lovins (1968), p. 25, mentions that a stemming algorithm developed by + James L. Dolby in California used a two-syllable minimum stem length as a + condition for most of the stemming.
  16. +
+ +

Bibliography

+ +

+Andrews K (1971) The development of a fast conflation algorithm for English. +Dissertation for the Diploma in Computer Science, Computer Laboratory, +University of Cambridge. +

+ +

+Harman D (1991) How effective is suffixing? Journal of the American +Society for Information Science, 42: 7-15. +

+ +

+Kraaij W and Pohlmann R (1994) Porter’s stemming algorithm for Dutch. In +Noordman LGM and de Vroomen WAM, eds. Informatiewetenschap 1994: +Wetenschappelijke bijdragen aan de derde STINFON Conferentie, Tilburg, +1994. pp. 167-180. +

+ +

+Kraaij W and Pohlmann R (1995) Evaluation of a Dutch stemming algorithm. +Rowley J, ed. The New Review of Document and Text Management, volume 1, +Taylor Graham, London, 1995. pp. 25-43, +

+ +

+Krovetz B (1995) Word sense disambiguation for large text databases. PhD +Thesis. Department of Computer Science, University of Massachusetts +Amherst. +

+ +

+Lennon M, Pierce DS, Tarry BD and Willett P (1981) An evaluation of some +conflation algorithms for information retrieval. Journal of Information +Science, 3: 177-183. +

+ +

+Lovins JB (1968) Development of a stemming algorithm. Mechanical +Translation and Computational Linguistics, 11: 22-31. +

+ +

+Overhage, CFJ (1966) Plans for project Intrex. Science, 152: +1032-1037. +

+ +

+Palmer FR (1965) A linguistic study of the English verb. London, +Longmans. +

+ +

+Porter MF (1980) An algorithm for suffix stripping. Program, 14: +130-137. +

+ +

Appendix 1

+ +

+The Lovins stemmer in Snowball. +

+ +[% highlight_file('lovins') %] + +

Appendix 2

+ +

+The Porter stemmer, cast, as far as is possible, into Lovins form. +

+ +[% highlight_file('porter_recast_as_lovins') %] + +

Appendix 3

+ +

+The list of 181 endings included by the [% highlight_inline('get') %] directive in the program +of Appendix 2. The numbers to the right show their frequency of occurrence +in the sample vocabulary. The 75 rare endings are shown commented out. +

+ +[% highlight(" + 'abilities' B /* (3) */ + 'ability' B /* (14) */ + 'able' B /* (293) */ + 'ables' B /* (4) */ + 'ably' B /* (68) */ + 'al' B /* (285) */ + 'alism' B /* (5) */ +// 'alisms' B /* (-) */ + 'alities' B /* (7) */ + 'ality' B /* (24) */ + 'alization' B /* (1) */ +// 'alizationed' B /* (-) */ +// 'alizationing' B /* (-) */ +// 'alizations' B /* (-) */ + 'alize' B /* (2) */ + 'alized' B /* (4) */ +// 'alizer' B /* (-) */ +// 'alizered' B /* (-) */ +// 'alizering' B /* (-) */ +// 'alizers' B /* (-) */ +// 'alizes' B /* (-) */ +// 'alizing' B /* (-) */ + 'ally' B /* (78) */ + 'alness' B /* (2) */ +// 'alnesses' B /* (-) */ + 'als' B /* (46) */ + 'ance' B /* (93) */ + 'ances' B /* (30) */ + 'ancies' B /* (2) */ + 'ancy' B /* (18) */ + 'ant' B /* (92) */ + 'ants' B /* (29) */ + 'ate' B /* (261) */ + 'ated' B /* (208) */ + 'ately' B /* (38) */ + 'ates' B /* (73) */ + 'ating' B /* (119) */ + 'ation' B /* (356) */ + 'ational' B /* (4) */ +// 'ationalism' B /* (-) */ +// 'ationalisms' B /* (-) */ +// 'ationalities' B /* (-) */ +// 'ationality' B /* (-) */ +// 'ationalize' B /* (-) */ +// 'ationalized' B /* (-) */ +// 'ationalizes' B /* (-) */ +// 'ationalizing' B /* (-) */ + 'ationally' B /* (2) */ +// 'ationalness' B /* (-) */ +// 'ationalnesses' B /* (-) */ +// 'ationals' B /* (-) */ +// 'ationed' B /* (-) */ +// 'ationing' B /* (-) */ + 'ations' B /* (139) */ + 'ative' B /* (40) */ + 'atively' B /* (4) */ +// 'ativeness' B /* (-) */ +// 'ativenesses' B /* (-) */ + 'atives' B /* (7) */ +// 'ativities' B /* (-) */ +// 'ativity' B /* (-) */ + 'ator' B /* (25) */ + 'ators' B /* (10) */ + 'ement' B /* (70) */ +// 'emently' B /* (-) */ + 'ements' B /* (31) */ + 'ence' B /* (100) */ + 'ences' B /* (25) */ + 'encies' B /* (9) */ + 'ency' B /* (41) */ + 'ent' D /* (154) */ + 'ently' D /* (53) */ + 'ents' D /* (25) */ + 'er' B /* (613) */ + 'ered' B /* (44) */ + 'ering' B /* (31) */ + 'ers' B /* (281) */ + 'ful' A /* (163) */ + 'fulness' A /* (31) */ +// 'fulnesses' A 
/* (-) */ + 'fuls' A /* (5) */ + 'ibilities' B /* (2) */ + 'ibility' B /* (10) */ + 'ible' B /* (53) */ + 'ibles' B /* (2) */ + 'ibly' B /* (14) */ + 'ic' B /* (142) */ + 'ical' B /* (91) */ +// 'icalism' B /* (-) */ +// 'icalisms' B /* (-) */ +// 'icalities' B /* (-) */ + 'icality' B /* (1) */ +// 'icalize' B /* (-) */ +// 'icalized' B /* (-) */ +// 'icalizer' B /* (-) */ +// 'icalizered' B /* (-) */ +// 'icalizering' B /* (-) */ +// 'icalizers' B /* (-) */ +// 'icalizes' B /* (-) */ +// 'icalizing' B /* (-) */ + 'ically' B /* (59) */ +// 'icalness' B /* (-) */ +// 'icalnesses' B /* (-) */ + 'icals' B /* (2) */ + 'icate' B /* (9) */ + 'icated' B /* (7) */ +// 'icately' B /* (-) */ + 'icates' B /* (4) */ + 'icating' B /* (3) */ + 'ication' B /* (23) */ +// 'icational' B /* (-) */ +// 'icationals' B /* (-) */ +// 'icationed' B /* (-) */ +// 'icationing' B /* (-) */ + 'ications' B /* (8) */ + 'icative' B /* (2) */ +// 'icatively' B /* (-) */ +// 'icativeness' B /* (-) */ +// 'icativenesses' B /* (-) */ +// 'icatives' B /* (-) */ +// 'icativities' B /* (-) */ +// 'icativity' B /* (-) */ + 'icities' B /* (1) */ + 'icity' B /* (5) */ + 'ics' B /* (21) */ + 'ion' C /* (383) */ + 'ional' C /* (18) */ +// 'ionalism' C /* (-) */ +// 'ionalisms' C /* (-) */ + 'ionalities' C /* (1) */ + 'ionality' C /* (1) */ +// 'ionalize' C /* (-) */ +// 'ionalized' C /* (-) */ +// 'ionalizer' C /* (-) */ +// 'ionalizered' C /* (-) */ +// 'ionalizering' C /* (-) */ +// 'ionalizers' C /* (-) */ +// 'ionalizes' C /* (-) */ +// 'ionalizing' C /* (-) */ + 'ionally' C /* (12) */ + 'ionalness' C /* (1) */ +// 'ionalnesses' C /* (-) */ + 'ionals' C /* (1) */ + 'ioned' C /* (13) */ + 'ioning' C /* (3) */ + 'ions' C /* (192) */ + 'ism' B /* (33) */ + 'isms' B /* (5) */ + 'ities' B /* (62) */ + 'ity' B /* (236) */ + 'ive' B /* (132) */ + 'ively' B /* (34) */ + 'iveness' B /* (14) */ +// 'ivenesses' B /* (-) */ + 'ives' B /* (12) */ +// 'ivities' B /* (-) */ + 'ivity' B /* (1) */ + 'ization' B /* (4) 
*/ +// 'izational' B /* (-) */ +// 'izationals' B /* (-) */ +// 'izationed' B /* (-) */ +// 'izationing' B /* (-) */ + 'izations' B /* (1) */ + 'ize' B /* (32) */ + 'ized' B /* (32) */ + 'izer' B /* (3) */ +// 'izered' B /* (-) */ +// 'izering' B /* (-) */ + 'izers' B /* (1) */ + 'izes' B /* (6) */ + 'izing' B /* (30) */ + 'ly' E /* (135) */ + 'ment' B /* (105) */ +// 'mently' B /* (-) */ + 'ments' B /* (50) */ + 'ness' A /* (428) */ + 'nesses' A /* (21) */ + 'ous' B /* (340) */ + 'ously' B /* (130) */ + 'ousness' B /* (22) */ +// 'ousnesses' B /* (-) */ +") %] + +

Appendix 4

+ +

+An ANSI C program which will generate on  stdout  the raw ending list +(endings with condition letters) from which the list of Appendix 3 is +constructed. +

+ +[% highlight_file('porter_recast_as_lovins_generator.c') %] + +[% footer %] diff --git a/algorithms/lovins/porter-1.png b/algorithms/lovins/porter-1.png new file mode 100644 index 0000000..d9999b2 Binary files /dev/null and b/algorithms/lovins/porter-1.png differ diff --git a/algorithms/lovins/stemmer.html b/algorithms/lovins/stemmer.html new file mode 100644 index 0000000..7750c72 --- /dev/null +++ b/algorithms/lovins/stemmer.html @@ -0,0 +1,606 @@ + + + + + + + + + + The Lovins stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

The Lovins stemming algorithm

+ + +

Links to resources

+ + + +

+The first ever published stemming algorithm was: Lovins JB (1968) Development of +a stemming algorithm. Mechanical Translation and Computational Linguistics, +11: 22-31. Julie Beth Lovins’ paper was remarkable for the early date at which +it was done, and for its seminal influence on later work in +this area. +

+ +

+The design of the algorithm was much influenced by the technical vocabulary +with which Lovins found herself working (subject term keywords attached to +documents in the materials science and engineering field). The subject term +list may also have been slightly limiting in that certain common endings +are not represented (ements and ents for example, corresponding to +the singular forms ement and ent), and also in that the algorithm's +treatment of short words, or words with short stems, can be rather +destructive. +

+ +

+The Lovins algorithm is noticeably bigger than the Porter algorithm, +because of its very extensive endings list. But in one way that is used to +advantage: it is faster. It has effectively traded space for time, and with +its large suffix set it needs just two major steps to remove a suffix, +compared with the eight of the Porter algorithm. +

+ +

+The Lovins stemmer is presented as a set of 294 endings, 29 conditions and 35
+transformation rules. Each ending is associated with one of the
+conditions. In the first step the longest ending is found which satisfies
+its associated condition, and is removed. In the second step the 35 rules
+are applied to transform the ending. The second step is done whether or not
+an ending is removed in the first step.

+ +

+For example, nationally has the ending ationally, with associated +condition, B, ‘minimum stem length = 3’. Since removing ationally +would leave a stem of length 1 this is rejected. But it also has ending +ionally with associated condition A. Condition A is ‘no restriction on +stem length’, so ionally is removed, leaving nat. +

+ +

+The transformation rules handle features like letter undoubling (sitting +→ sittsit), irregular plurals (matrix and matrices), +and English morphological oddities ultimately caused by the behaviour of +Latin verbs of the second conjugation (assume / assumption, +commit / commission etc). Although they are described as being +applied in turn, they can be broken into two stages, rule 1 being done in +stage 1, and either zero or one of rules 2 to 35 being done in stage 2. +

+ +

+Here is the list of endings as given in Appendix A of Lovins’ paper. They +are grouped by length, from 11 characters down to 1. Each ending is +followed by its condition code. +

+ +
+

Appendix A. The list of endings

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
.11.
alistically B arizability A izationally B
.10.
antialness A arisations A arizations A entialness A
.09.
allically C antaneous A antiality A arisation A
arization A ationally B ativeness A eableness E
entations A entiality A entialize A entiation A
ionalness A istically A itousness A izability A
izational A
.08.
ableness A arizable A entation A entially A
eousness A ibleness A icalness A ionalism A
ionality A ionalize A iousness A izations A
lessness A
.07.
ability A aically A alistic B alities A
ariness E aristic A arizing A ateness A
atingly A ational B atively A ativism A
elihood E encible A entally A entials A
entiate A entness A fulness A ibility A
icalism A icalist A icality A icalize A
ication G icianry A ination A ingness A
ionally A isation A ishness A istical A
iteness A iveness A ivistic A ivities A
ization F izement A oidally A ousness A
.06.
aceous A acious B action G alness A
ancial A ancies A ancing B ariser A
arized A arizer A atable A ations B
atives A eature Z efully A encies A
encing A ential A enting C entist A
eously A ialist A iality A ialize A
ically A icance A icians A icists A
ifully A ionals A ionate D ioning A
ionist A iously A istics A izable E
lessly A nesses A oidism A
.05.
acies A acity A aging B aical A
alist A alism B ality A alize A
allic BB anced B ances B antic C
arial A aries A arily A arity B
arize A aroid A ately A ating I
ation B ative A ators A atory A
ature E early Y ehood A eless A
elity A ement A enced A ences A
eness E ening E ental A ented C
ently A fully A ially A icant A
ician A icide A icism A icist A
icity A idine I iedly A ihood A
inate A iness A ingly B inism J
inity CC ional A ioned A ished A
istic A ities A itous A ively A
ivity A izers F izing F oidal A
oides A otide A ously A
.04.
able A ably A ages B ally B
ance B ancy B ants B aric A
arly K ated I ates A atic B
ator A ealy Y edly E eful A
eity A ence A ency A ened E
enly E eous A hood A ials A
ians A ible A ibly A ical A
ides L iers A iful A ines M
ings N ions B ious A isms B
ists A itic H ized F izer F
less A lily A ness A ogen A
ward A wise A ying B yish A
.03.
acy A age B aic A als BB
ant B ars O ary F ata A
ate A eal Y ear Y ely E
ene E ent C ery E ese A
ful A ial A ian A ics A
ide L ied A ier A ies P
ily A ine M ing N ion Q
ish C ism B ist A ite AA
ity A ium A ive A ize F
oid A one R ous A
.02.
ae A al BB ar X as B
ed E en F es E ia A
ic A is A ly B on S
or T um U us V yl R
s' A 's A
.01.
a A e A i A o A
s W y B
+
+ +

+Here are the 29 conditions, called A to Z, AA, BB and CC (* stands for any letter): +

+ +
+

Appendix B. Codes for context-sensitive rules associated with +certain endings

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
A No restrictions on stem
B Minimum stem length = 3
C Minimum stem length = 4
D Minimum stem length = 5
E Do not remove ending after e
F Minimum stem length = 3 and do not remove ending after e
G Minimum stem length = 3 and remove ending only after f
H Remove ending only after t or ll
I Do not remove ending after o or e
J Do not remove ending after a or e
K Minimum stem length = 3 and remove ending only after l, i or u*e
L Do not remove ending after u, x or s, unless s follows o
M Do not remove ending after a, c, e or m
N Minimum stem length = 4 after s**, elsewhere = 3
O Remove ending only after l or i
P Do not remove ending after c
Q Minimum stem length = 3 and do not remove ending after l or n
R Remove ending only after n or r
S Remove ending only after dr or t, unless t follows t
T Remove ending only after s or t, unless t follows o
U Remove ending only after l, m, n or r
V Remove ending only after c
W Do not remove ending after s or u
X Remove ending only after l, i or u*e
Y Remove ending only after in
Z Do not remove ending after f
AA Remove ending only after d, f, ph, th, l, er, or, es or t
BB Minimum stem length = 3 and do not remove ending after met or ryst
CC Remove ending only after l
+
+ +

+There is an implicit assumption in each condition, A included, that the minimum +stem length is 2. +

+ +

+Finally, here are the 35 transformation rules. +

+ +
+

Appendix C. Transformation rules used in recoding stem terminations

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1       remove one of double b, d, g, l, m, n, p, r, s, t
2 iev   →   ief
3 uct   →   uc
4 umpt   →   um
5 rpt   →   rb
6 urs   →   ur
7 istr   →   ister
7a metr   →   meter
8 olv   →   olut
9 ul   →   l except following a, o, i
10 bex   →   bic
11 dex   →   dic
12 pex   →   pic
13 tex   →   tic
14 ax   →   ac
15 ex   →   ec
16 ix   →   ic
17 lux   →   luc
18 uad   →   uas
19 vad   →   vas
20 cid   →   cis
21 lid   →   lis
22 erid   →   eris
23 pand   →   pans
24 end   →   ens except following s
25 ond   →   ons
26 lud   →   lus
27 rud   →   rus
28 her   →   hes except following p, t
29 mit   →   mis
30 ent   →   ens except following m
31 ert   →   ers
32 et   →   es except following n
33 yt   →   ys
34 yz   →   ys
+
+ +

+(Rule 30 as given here corrects a typographical error in the published +paper of 1968.) +

+ +

+The following examples show the intentions behind these rules. +

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1       rubb[ing] → rub, embedd[ed] → embed etc
2 believ[e] → belief
3 induct[ion] → induc[e]
4 consumpt[ion] → consum[e]
5 absorpt[ion] → absorb
6 recurs[ive] → recur
7 administr[ate] → administ[er]
7a parametr[ic] → paramet[er]
8 dissolv[ed] → dissolut[ion]
9 angul[ar] → angl[e]
10 vibex → vibic[es]
11 index → indic[es]
12 apex → apic[es]
13 cortex → cortic[al]
14 anthrax → anthrac[ite]
15 ?
16 matrix → matric[es]
17 ?
18 persuad[e] → persuas[ion]
19 evad[e] → evas[ion]
20 decid[e] → decis[ion]
21 elid[e] → elis[ion]
22 derid[e] → deris[ion]
23 expand → expans[ion]
24 defend → defens[ive]
25 respond → respons[ive]
26 collud[e] → collus[ion]
27 obtrud[e] → obtrus[ion]
28 adher[e] → adhes[ion]
29 remit → remis[s][ion]
30 extent → extens[ion]
31 convert[ed] → convers[ion]
32 parenthet[ic] → parenthes[is]
33 analyt[ic] → analys[is]
34 analyz[ed] → analys[ed]
+
+ +

The Lovins algorithm in Snowball

+ +

+And here is the Lovins algorithm in Snowball. The natural representation +of the Lovins endings, conditions and rules in Snowball, is, I believe, a +vindication of the appropriateness of Snowball for stemming work. Once the +tables had been established, getting the Snowball version running was the +work of a few minutes. +

+ +
stringescapes {}
+
+routines (
+   A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
+
+   endings
+
+   undouble respell
+)
+
+externals ( stem )
+
+backwardmode (
+
+  /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
+     a test for a two letter prefix ('test hop 2') is implicitly
+     assumed. Note that 'e' next 'u' corresponds to her u*e because
+     Snowball is scanning backwards. */
+
+  define A  as ( hop 2 )
+  define B  as ( hop 3 )
+  define C  as ( hop 4 )
+  define D  as ( hop 5 )
+  define E  as ( test hop 2 not 'e' )
+  define F  as ( test hop 3 not 'e' )
+  define G  as ( test hop 3 'f' )
+  define H  as ( test hop 2 't' or 'll' )
+  define I  as ( test hop 2 not 'o' not 'e' )
+  define J  as ( test hop 2 not 'a' not 'e' )
+  define K  as ( test hop 3 'l' or 'i' or ('e' next 'u') )
+  define L  as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )
+  define M  as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )
+  define N  as ( test hop 3 ( hop 2 not 's' or hop 2 ) )
+  define O  as ( test hop 2 'l' or 'i' )
+  define P  as ( test hop 2 not 'c' )
+  define Q  as ( test hop 2 test hop 3 not 'l' not 'n' )
+  define R  as ( test hop 2 'n' or 'r' )
+  define S  as ( test hop 2 'dr' or ('t' not 't') )
+  define T  as ( test hop 2 's' or ('t' not 'o') )
+  define U  as ( test hop 2 'l' or 'm' or 'n' or 'r' )
+  define V  as ( test hop 2 'c' )
+  define W  as ( test hop 2 not 's' not 'u' )
+  define X  as ( test hop 2 'l' or 'i' or ('e' next 'u') )
+  define Y  as ( test hop 2 'in' )
+  define Z  as ( test hop 2 not 'f' )
+  define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
+                                    'es' 't' ) )
+  define BB as ( test hop 3 not 'met' not 'ryst' )
+  define CC as ( test hop 2 'l' )
+
+
+  /* The system of endings, as given in Appendix A. */
+
+  define endings as (
+    [substring] among(
+    'alistically' B 'arizability' A 'izationally' B
+
+     'antialness' A  'arisations' A  'arizations' A  'entialness' A
+
+      'allically' C   'antaneous' A   'antiality' A   'arisation' A
+      'arization' A   'ationally' B   'ativeness' A   'eableness' E
+      'entations' A   'entiality' A   'entialize' A   'entiation' A
+      'ionalness' A   'istically' A   'itousness' A   'izability' A
+      'izational' A
+
+       'ableness' A    'arizable' A    'entation' A    'entially' A
+       'eousness' A    'ibleness' A    'icalness' A    'ionalism' A
+       'ionality' A    'ionalize' A    'iousness' A    'izations' A
+       'lessness' A
+
+        'ability' A     'aically' A     'alistic' B     'alities' A
+        'ariness' E     'aristic' A     'arizing' A     'ateness' A
+        'atingly' A     'ational' B     'atively' A     'ativism' A
+        'elihood' E     'encible' A     'entally' A     'entials' A
+        'entiate' A     'entness' A     'fulness' A     'ibility' A
+        'icalism' A     'icalist' A     'icality' A     'icalize' A
+        'ication' G     'icianry' A     'ination' A     'ingness' A
+        'ionally' A     'isation' A     'ishness' A     'istical' A
+        'iteness' A     'iveness' A     'ivistic' A     'ivities' A
+        'ization' F     'izement' A     'oidally' A     'ousness' A
+
+         'aceous' A      'acious' B      'action' G      'alness' A
+         'ancial' A      'ancies' A      'ancing' B      'ariser' A
+         'arized' A      'arizer' A      'atable' A      'ations' B
+         'atives' A      'eature' Z      'efully' A      'encies' A
+         'encing' A      'ential' A      'enting' C      'entist' A
+         'eously' A      'ialist' A      'iality' A      'ialize' A
+         'ically' A      'icance' A      'icians' A      'icists' A
+         'ifully' A      'ionals' A      'ionate' D      'ioning' A
+         'ionist' A      'iously' A      'istics' A      'izable' E
+         'lessly' A      'nesses' A      'oidism' A
+
+          'acies' A       'acity' A       'aging' B       'aical' A
+          'alist' A       'alism' B       'ality' A       'alize' A
+          'allic'BB       'anced' B       'ances' B       'antic' C
+          'arial' A       'aries' A       'arily' A       'arity' B
+          'arize' A       'aroid' A       'ately' A       'ating' I
+          'ation' B       'ative' A       'ators' A       'atory' A
+          'ature' E       'early' Y       'ehood' A       'eless' A
+          'elity' A       'ement' A       'enced' A       'ences' A
+          'eness' E       'ening' E       'ental' A       'ented' C
+          'ently' A       'fully' A       'ially' A       'icant' A
+          'ician' A       'icide' A       'icism' A       'icist' A
+          'icity' A       'idine' I       'iedly' A       'ihood' A
+          'inate' A       'iness' A       'ingly' B       'inism' J
+          'inity'CC       'ional' A       'ioned' A       'ished' A
+          'istic' A       'ities' A       'itous' A       'ively' A
+          'ivity' A       'izers' F       'izing' F       'oidal' A
+          'oides' A       'otide' A       'ously' A
+
+           'able' A        'ably' A        'ages' B        'ally' B
+           'ance' B        'ancy' B        'ants' B        'aric' A
+           'arly' K        'ated' I        'ates' A        'atic' B
+           'ator' A        'ealy' Y        'edly' E        'eful' A
+           'eity' A        'ence' A        'ency' A        'ened' E
+           'enly' E        'eous' A        'hood' A        'ials' A
+           'ians' A        'ible' A        'ibly' A        'ical' A
+           'ides' L        'iers' A        'iful' A        'ines' M
+           'ings' N        'ions' B        'ious' A        'isms' B
+           'ists' A        'itic' H        'ized' F        'izer' F
+           'less' A        'lily' A        'ness' A        'ogen' A
+           'ward' A        'wise' A        'ying' B        'yish' A
+
+            'acy' A         'age' B         'aic' A         'als'BB
+            'ant' B         'ars' O         'ary' F         'ata' A
+            'ate' A         'eal' Y         'ear' Y         'ely' E
+            'ene' E         'ent' C         'ery' E         'ese' A
+            'ful' A         'ial' A         'ian' A         'ics' A
+            'ide' L         'ied' A         'ier' A         'ies' P
+            'ily' A         'ine' M         'ing' N         'ion' Q
+            'ish' C         'ism' B         'ist' A         'ite'AA
+            'ity' A         'ium' A         'ive' A         'ize' F
+            'oid' A         'one' R         'ous' A
+
+             'ae' A          'al'BB          'ar' X          'as' B
+             'ed' E          'en' F          'es' E          'ia' A
+             'ic' A          'is' A          'ly' B          'on' S
+             'or' T          'um' U          'us' V          'yl' R
+           '{'}s' A        's{'}' A
+
+              'a' A           'e' A           'i' A           'o' A
+              's' W           'y' B
+
+        (delete)
+    )
+  )
+
+  /* Undoubling is rule 1 of appendix C. */
+
+  define undouble as (
+    // Rule 1 of appendix C: peek at the last two characters ('test'
+    // leaves the cursor where it was) - the rule applies only to these
+    // doubled consonants.
+    test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss'
+                          'tt')
+    // Delete one character of the pair.  We are in backwards mode, so
+    // [next] brackets just the final character of the word.
+    [next] delete
+  )
+
+  /* The other appendix C rules can be done together. */
+
+  define respell as (
+    /* Rules 2 to 35 of appendix C: a longest-match on the end of the
+       word selects at most one rewrite.  The 'not' tests implement the
+       "except following ..." restrictions from Lovins' paper. */
+    [substring] among (
+      'iev'  (<-'ief')
+      'uct'  (<-'uc')
+      'umpt' (<-'um')
+      'rpt'  (<-'rb')
+      'urs'  (<-'ur')
+      'istr' (<-'ister')
+      'metr' (<-'meter')
+      'olv'  (<-'olut')
+      'ul'   (not 'a' not 'i' not 'o' <-'l')
+      'bex'  (<-'bic')
+      'dex'  (<-'dic')
+      'pex'  (<-'pic')
+      'tex'  (<-'tic')
+      'ax'   (<-'ac')
+      'ex'   (<-'ec')
+      'ix'   (<-'ic')
+      'lux'  (<-'luc')
+      'uad'  (<-'uas')
+      'vad'  (<-'vas')
+      'cid'  (<-'cis')
+      'lid'  (<-'lis')
+      'erid' (<-'eris')
+      'pand' (<-'pans')
+      'end'  (not 's' <-'ens')
+      'ond'  (<-'ons')
+      'lud'  (<-'lus')
+      'rud'  (<-'rus')
+      'her'  (not 'p' not 't' <-'hes')
+      'mit'  (<-'mis')
+      'ent'  (not 'm' <-'ens')
+        /* 'ent' was 'end' in the 1968 paper - a typo. */
+      'ert'  (<-'ers')
+      'et'   (not 'n' <-'es')
+      'yt'   (<-'ys')
+      'yz'   (<-'ys')
+    )
+  )
+)
+
+define stem as (
+
+  // Every phase inspects the end of the word, so the whole algorithm
+  // runs in backwards mode: ending removal (appendices A/B), then the
+  // two recoding phases (appendix C).  'do' discards failure, making
+  // each phase optional.
+  backwards (
+    do endings
+    do undouble
+    do respell
+  )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/lovins/stemmer.tt b/algorithms/lovins/stemmer.tt new file mode 100644 index 0000000..9de41fb --- /dev/null +++ b/algorithms/lovins/stemmer.tt @@ -0,0 +1,332 @@ +[% header('The Lovins stemming algorithm') %] + +

Links to resources

+ + + +

+The first ever published stemming algorithm was: Lovins JB (1968) Development of +a stemming algorithm. Mechanical Translation and Computational Linguistics, +11: 22-31. Julie Beth Lovins’ paper was remarkable for the early date at which +it was done, and for its seminal influence on later work in +this area. +

+ +

+The design of the algorithm was much influenced by the technical vocabulary +with which Lovins found herself working (subject term keywords attached to +documents in the materials science and engineering field). The subject term +list may also have been slightly limiting in that certain common endings +are not represented (ements and ents for example, corresponding to +the singular forms ement and ent), and also in that the algorithm's +treatment of short words, or words with short stems, can be rather +destructive. +

+ +

+The Lovins algorithm is noticeably bigger than the Porter algorithm, +because of its very extensive endings list. But in one way that is used to +advantage: it is faster. It has effectively traded space for time, and with +its large suffix set it needs just two major steps to remove a suffix, +compared with the eight of the Porter algorithm. +

+ +

+The algorithm uses a list of 294 endings, 29 conditions and 35
+transformation rules. Each ending is associated with one of the
+conditions. In the first step the longest ending is found which satisfies
+its associated condition, and is removed. In the second step the 35 rules
+are applied to transform the ending. The second step is done whether or not
+an ending is removed in the first step.
+

+ +

+For example, nationally has the ending ationally, with associated +condition, B, ‘minimum stem length = 3’. Since removing ationally +would leave a stem of length 1 this is rejected. But it also has ending +ionally with associated condition A. Condition A is ‘no restriction on +stem length’, so ionally is removed, leaving nat. +

+ +

+The transformation rules handle features like letter undoubling (sitting
+→ sitt → sit), irregular plurals (matrix and matrices),
+and English morphological oddities ultimately caused by the behaviour of
+Latin verbs of the second conjugation (assume / assumption,
+commit / commission etc). Although they are described as being
+applied in turn, they can be broken into two stages, rule 1 being done in
+stage 1, and either zero or one of rules 2 to 35 being done in stage 2.
+

+ +

+Here is the list of endings as given in Appendix A of Lovins’ paper. They +are grouped by length, from 11 characters down to 1. Each ending is +followed by its condition code. +

+ +
+

Appendix A. The list of endings

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
.11.
alistically B arizability A izationally B
.10.
antialness A arisations A arizations A entialness A
.09.
allically C antaneous A antiality A arisation A
arization A ationally B ativeness A eableness E
entations A entiality A entialize A entiation A
ionalness A istically A itousness A izability A
izational A
.08.
ableness A arizable A entation A entially A
eousness A ibleness A icalness A ionalism A
ionality A ionalize A iousness A izations A
lessness A
.07.
ability A aically A alistic B alities A
ariness E aristic A arizing A ateness A
atingly A ational B atively A ativism A
elihood E encible A entally A entials A
entiate A entness A fulness A ibility A
icalism A icalist A icality A icalize A
ication G icianry A ination A ingness A
ionally A isation A ishness A istical A
iteness A iveness A ivistic A ivities A
ization F izement A oidally A ousness A
.06.
aceous A acious B action G alness A
ancial A ancies A ancing B ariser A
arized A arizer A atable A ations B
atives A eature Z efully A encies A
encing A ential A enting C entist A
eously A ialist A iality A ialize A
ically A icance A icians A icists A
ifully A ionals A ionate D ioning A
ionist A iously A istics A izable E
lessly A nesses A oidism A
.05.
acies A acity A aging B aical A
alist A alism B ality A alize A
allic BB anced B ances B antic C
arial A aries A arily A arity B
arize A aroid A ately A ating I
ation B ative A ators A atory A
ature E early Y ehood A eless A
elity A ement A enced A ences A
eness E ening E ental A ented C
ently A fully A ially A icant A
ician A icide A icism A icist A
icity A idine I iedly A ihood A
inate A iness A ingly B inism J
inity CC ional A ioned A ished A
istic A ities A itous A ively A
ivity A izers F izing F oidal A
oides A otide A ously A
.04.
able A ably A ages B ally B
ance B ancy B ants B aric A
arly K ated I ates A atic B
ator A ealy Y edly E eful A
eity A ence A ency A ened E
enly E eous A hood A ials A
ians A ible A ibly A ical A
ides L iers A iful A ines M
ings N ions B ious A isms B
ists A itic H ized F izer F
less A lily A ness A ogen A
ward A wise A ying B yish A
.03.
acy A age B aic A als BB
ant B ars O ary F ata A
ate A eal Y ear Y ely E
ene E ent C ery E ese A
ful A ial A ian A ics A
ide L ied A ier A ies P
ily A ine M ing N ion Q
ish C ism B ist A ite AA
ity A ium A ive A ize F
oid A one R ous A
.02.
ae A al BB ar X as B
ed E en F es E ia A
ic A is A ly B on S
or T um U us V yl R
s' A 's A
.01.
a A e A i A o A
s W y B
+
+ +

+Here are the 29 conditions, called A to Z, AA, BB and CC (* stands for any letter): +

+ +
+

Appendix B. Codes for context-sensitive rules associated with +certain endings

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
A No restrictions on stem
B Minimum stem length = 3
C Minimum stem length = 4
D Minimum stem length = 5
E Do not remove ending after e
F Minimum stem length = 3 and do not remove ending after e
G Minimum stem length = 3 and remove ending only after f
H Remove ending only after t or ll
I Do not remove ending after o or e
J Do not remove ending after a or e
K Minimum stem length = 3 and remove ending only after l, i or u*e
L Do not remove ending after u, x or s, unless s follows o
M Do not remove ending after a, c, e or m
N Minimum stem length = 4 after s**, elsewhere = 3
O Remove ending only after l or i
P Do not remove ending after c
Q Minimum stem length = 3 and do not remove ending after l or n
R Remove ending only after n or r
S Remove ending only after dr or t, unless t follows t
T Remove ending only after s or t, unless t follows o
U Remove ending only after l, m, n or r
V Remove ending only after c
W Do not remove ending after s or u
X Remove ending only after l, i or u*e
Y Remove ending only after in
Z Do not remove ending after f
AA Remove ending only after d, f, ph, th, l, er, or, es or t
BB Minimum stem length = 3 and do not remove ending after met or ryst
CC Remove ending only after l
+
+ +

+There is an implicit assumption in each condition, A included, that the minimum +stem length is 2. +

+ +

+Finally, here are the 35 transformation rules. +

+ +
+

Appendix C. Transformation rules used in recoding stem terminations

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1       remove one of double b, d, g, l, m, n, p, r, s, t
2 iev   →   ief
3 uct   →   uc
4 umpt   →   um
5 rpt   →   rb
6 urs   →   ur
7 istr   →   ister
7a metr   →   meter
8 olv   →   olut
9 ul   →   l except following a, o, i
10 bex   →   bic
11 dex   →   dic
12 pex   →   pic
13 tex   →   tic
14 ax   →   ac
15 ex   →   ec
16 ix   →   ic
17 lux   →   luc
18 uad   →   uas
19 vad   →   vas
20 cid   →   cis
21 lid   →   lis
22 erid   →   eris
23 pand   →   pans
24 end   →   ens except following s
25 ond   →   ons
26 lud   →   lus
27 rud   →   rus
28 her   →   hes except following p, t
29 mit   →   mis
30 ent   →   ens except following m
31 ert   →   ers
32 et   →   es except following n
33 yt   →   ys
34 yz   →   ys
+
+ +

+(Rule 30 as given here corrects a typographical error in the published +paper of 1968.) +

+ +

+The following examples show the intentions behind these rules. +

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1       rubb[ing] → rub, embedd[ed] → embed etc
2 believ[e] → belief
3 induct[ion] → induc[e]
4 consumpt[ion] → consum[e]
5 absorpt[ion] → absorb
6 recurs[ive] → recur
7 administr[ate] → administ[er]
7a parametr[ic] → paramet[er]
8 dissolv[ed] → dissolut[ion]
9 angul[ar] → angl[e]
10 vibex → vibic[es]
11 index → indic[es]
12 apex → apic[es]
13 cortex → cortic[al]
14 anthrax → anthrac[ite]
15 ?
16 matrix → matric[es]
17 ?
18 persuad[e] → persuas[ion]
19 evad[e] → evas[ion]
20 decid[e] → decis[ion]
21 elid[e] → elis[ion]
22 derid[e] → deris[ion]
23 expand → expans[ion]
24 defend → defens[ive]
25 respond → respons[ive]
26 collud[e] → collus[ion]
27 obtrud[e] → obtrus[ion]
28 adher[e] → adhes[ion]
29 remit → remis[s][ion]
30 extent → extens[ion]
31 convert[ed] → convers[ion]
32 parenthet[ic] → parenthes[is]
33 analyt[ic] → analys[is]
34 analyz[ed] → analys[ed]
+
+ +

The Lovins algorithm in Snowball

+ +

+And here is the Lovins algorithm in Snowball. The natural representation +of the Lovins endings, conditions and rules in Snowball, is, I believe, a +vindication of the appropriateness of Snowball for stemming work. Once the +tables had been established, getting the Snowball version running was the +work of a few minutes. +

+ +[% highlight_file('lovins') %] + +[% footer %] diff --git a/algorithms/norwegian/stemmer.html b/algorithms/norwegian/stemmer.html new file mode 100644 index 0000000..8ea60ec --- /dev/null +++ b/algorithms/norwegian/stemmer.html @@ -0,0 +1,446 @@ + + + + + + + + + + Norwegian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Norwegian stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Norwegian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+havnedistrikt
+havnedistriktene
+havnedistrikter
+havnedistriktet
+havnedistriktets
+havnedrift
+havnedriften
+havneeffektivitet
+havneeier
+havneeiere
+havneenheter
+havneforbund
+havneforbundets
+havneformål
+havneforvaltningen
+havnefunksjonene
+havnefunksjoner
+havnefylkene
+havnefylker
+havnehagen
+havneinfrastrukturen
+havneinnretningene
+havneinnretninger
+havneinteresser
+havnekapasitet
+havnekassa
+havnekasse
+havnekassemidler
+havnekassen
+havnekassene
+havnekassens
+havnelokalisering
+havneloven
+havnelovens
+havneløsning
+havneløsningene
+havneløsninger
+havnemessig
+havnemyndighetene
+havnemyndigheter
+
+havnedistrikt
+havnedistrikt
+havnedistrikt
+havnedistrikt
+havnedistrikt
+havnedrift
+havnedrift
+havneeffektivit
+havneei
+havneeier
+havneen
+havneforbund
+havneforbund
+havneformål
+havneforvaltning
+havnefunksjon
+havnefunksjon
+havnefylk
+havnefylk
+havnehag
+havneinfrastruktur
+havneinnretning
+havneinnretning
+havneinteress
+havnekapasit
+havnekass
+havnekass
+havnekassemidl
+havnekass
+havnekass
+havnekass
+havnelokalisering
+havn
+havn
+havneløsning
+havneløsning
+havneløsning
+havnemess
+havnemynd
+havnemynd
+
+opning
+opninga
+opningsbalanse
+opningsbalansen
+opp
+oppad
+opparbeide
+opparbeidede
+opparbeidelse
+opparbeider
+opparbeides
+opparbeidet
+opparbeiding
+oppattbygging
+oppbevarer
+oppbevaring
+oppblåst
+oppblåste
+oppbrente
+oppbygd
+oppbygde
+oppbygget
+oppbygging
+oppbygginga
+oppbyggingen
+oppdage
+oppdager
+oppdaterte
+oppdeling
+oppdelingen
+oppdelt
+oppdrag
+oppdraget
+oppdragsavtale
+oppdragsgivere
+oppdragstakaren
+oppe
+oppebærer
+oppfarende
+oppfatning
+
+opning
+opning
+opningsbalans
+opningsbalans
+opp
+oppad
+opparbeid
+opparbeid
+opparbeid
+opparbeid
+opparbeid
+opparbeid
+opparbeiding
+oppattbygging
+oppbevar
+oppbevaring
+oppblåst
+oppblåst
+oppbrent
+oppbygd
+oppbygd
+oppbygg
+oppbygging
+oppbygging
+oppbygging
+oppdag
+oppdag
+oppdater
+oppdeling
+oppdeling
+oppdelt
+oppdrag
+oppdrag
+oppdragsavtal
+oppdragsgiver
+oppdragstakar
+opp
+oppebær
+oppfar
+oppfatning
+
+
+ +

The stemming algorithm

+ +

+The Norwegian alphabet includes the following additional letters, +

+ +
+ æ   å   ø +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   æ   å   ø +
+ +

+R2 is not used: R1 is defined in the same way as in the +German stemmer. +(See the note on R1 and R2.) +

+ +

+Define a valid s-ending as one of +

+ +
+b   c   d   f   g   h   j   +l   m   n   o   p   r   t   v   +y   z, +
or k not preceded by a vowel. +
+ +

+Do each of steps 1, 2 and 3. +

+ +

+Step 1: +

+ +
+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +
+
(a) + a   e   ede   ande   ende   ane   ene   hetene   en +   heten   ar   er   heter   as   es   edes   endes   + enes   hetenes   ens   hetens   ers   ets   et   het   + ast +
delete +
(b) + s +
delete if preceded by a valid s-ending +
(c) + erte   ert +
replace with er +
+

+ (Of course the letter of the valid s-ending is + not necessarily in R1) +

+
+ +

+Step 2: +

+ +
+

+ If the word ends dt or vt in R1, delete the t. +

+ +

+ (For example, meldtmeld, operativtoperativ) +

+
+ +

+Step 3: +

+ +
+ Search for the longest among the following suffixes in R1, and if found, + delete. +
+ leg   eleg   ig   eig   lig   elig   els +   lov   elov   slov   hetslov +
+
+ +

The same algorithm in Snowball

+ +
routines (
+           mark_regions
+           main_suffix
+           consonant_pair
+           other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef ae   '{U+00E6}'
+stringdef ao   '{U+00E5}'
+stringdef o/   '{U+00F8}'
+
+define v 'aeiouy{ae}{ao}{o/}'
+
+define s_ending  'bcdfghjlmnoprtvyz'
+
+define mark_regions as (
+
+    // Default: p1 at the limit, i.e. R1 is empty.
+    $p1 = limit
+
+    // x = the position after the first three characters ('test' restores
+    // the cursor afterwards).
+    test ( hop 3 setmark x )
+    // p1 = the position after the first non-vowel following a vowel.
+    goto v  gopast non-v  setmark p1
+    // Adjust so R1 never starts before x, i.e. at least three characters
+    // precede it ('try' makes the adjustment optional).
+    try ( $p1 < x  $p1 = x )
+)
+
+backwardmode (
+
+    define main_suffix as (
+        // Step 1: restrict matching to R1 and take the longest of the
+        // suffixes below.
+        setlimit tomark p1 for ([substring])
+        among(
+
+            'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
+            'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
+            'hetens' 'ers' 'ets' 'et' 'het' 'ast'
+                (delete)
+            's'
+                // Delete s only after a "valid s-ending": one of the
+                // s_ending letters, or k not preceded by a vowel.
+                (s_ending or ('k' non-v) delete)
+            'erte' 'ert'
+                (<-'er')
+        )
+    )
+
+    define consonant_pair as (
+        // Step 2: if the word ends dt or vt in R1, delete the final t.
+        test (
+            // '[' before substring opens the bracketed region; 'test'
+            // resets the cursor after the match has been confirmed.
+            setlimit tomark p1 for ([substring])
+            among(
+                'dt' 'vt'
+            )
+        )
+        // Advance one character (past the t, since we are in backwards
+        // mode) and close the bracket, so only the t is deleted.
+        next] delete
+    )
+
+    define other_suffix as (
+        // Step 3: delete the longest matching derivational suffix in R1.
+        setlimit tomark p1 for ([substring])
+        among(
+            'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
+            'hetslov'
+                (delete)
+        )
+    )
+)
+
+define stem as (
+
+    // Compute R1 going forwards, then apply the three suffix-removal
+    // steps from the end of the word.  'do' ignores failure, so each
+    // step is optional.
+    do mark_regions
+    backwards (
+        do main_suffix
+        do consonant_pair
+        do other_suffix
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/norwegian/stemmer.tt b/algorithms/norwegian/stemmer.tt new file mode 100644 index 0000000..30578d1 --- /dev/null +++ b/algorithms/norwegian/stemmer.tt @@ -0,0 +1,109 @@ +[% header('Norwegian stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'havnedistrikt', 'opning']) %] + +

The stemming algorithm

+ +

+The Norwegian alphabet includes the following additional letters, +

+ +
+ æ   å   ø +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   æ   å   ø +
+ +

+R2 is not used: R1 is defined in the same way as in the +German stemmer. +(See the note on R1 and R2.) +

+ +

+Define a valid s-ending as one of +

+ +
+b   c   d   f   g   h   j   +l   m   n   o   p   r   t   v   +y   z, +
or k not preceded by a vowel. +
+ +

+Do each of steps 1, 2 and 3. +

+ +

+Step 1: +

+ +
+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +
+
(a) + a   e   ede   ande   ende   ane   ene   hetene   en +   heten   ar   er   heter   as   es   edes   endes   + enes   hetenes   ens   hetens   ers   ets   et   het   + ast +
delete +
(b) + s +
delete if preceded by a valid s-ending +
(c) + erte   ert +
replace with er +
+

+ (Of course the letter of the valid s-ending is + not necessarily in R1) +

+
+ +

+Step 2: +

+ +
+

+ If the word ends dt or vt in R1, delete the t. +

+ +

+ (For example, meldtmeld, operativtoperativ) +

+
+ +

+Step 3: +

+ +
+ Search for the longest among the following suffixes in R1, and if found, + delete. +
+ leg   eleg   ig   eig   lig   elig   els +   lov   elov   slov   hetslov +
+
+ +

The same algorithm in Snowball

+ +[% highlight_file('norwegian') %] + +[% footer %] diff --git a/algorithms/norwegian/stop.txt b/algorithms/norwegian/stop.txt new file mode 100644 index 0000000..df1c509 --- /dev/null +++ b/algorithms/norwegian/stop.txt @@ -0,0 +1,182 @@ + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/which/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | 
as/because +før | before +mange | many +også | also +slik | just +vært | been +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) * +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/algorithms/porter/stemmer.html b/algorithms/porter/stemmer.html new file mode 100644 index 0000000..f68dbc7 --- /dev/null +++ b/algorithms/porter/stemmer.html @@ -0,0 +1,863 @@ + + + + + + + + + + The Porter stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

The Porter stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a case study on how to code up a stemming algorithm in Snowball. First, +the definition of the Porter stemmer, as it appeared in Program, Vol 14 no. 3 pp +130-137, July 1980. +

+ +
+

THE ALGORITHM

+ +

+A consonant in a word is a letter other than A, E, I, O or U, and other +than Y preceded by a consonant. (The fact that the term ‘consonant’ is +defined to some extent in terms of itself does not make it ambiguous.) So in +TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a +letter is not a consonant it is a vowel. +

+ +

+A consonant will be denoted by c, a vowel by v. A list ccc... of length +greater than 0 will be denoted by C, and a list vvv... of length greater +than 0 will be denoted by V. Any word, or part of a word, therefore has one +of the four forms: +

+ +
+
CVCV ... C +
CVCV ... V +
VCVC ... C +
VCVC ... V +
+ +

+These may all be represented by the single form +

+ +
+ [C]VCVC ... [V] +
+ +

+where the square brackets denote arbitrary presence of their contents. +Using (VC)m to denote VC repeated m times, this may again be written as +

+ +
+ [C](VC)m[V]. +
+ +

+m will be called the measure of any word or word part when represented in +this form. The case m = 0 covers the null word. Here are some examples: +

+ +
+
m=0 TR,   EE,   TREE,   Y,   BY. +
m=1 TROUBLE,   OATS,   TREES,   IVY. +
m=2 TROUBLES,   PRIVATE,   OATEN,   ORRERY. +
+ +

+The rules for removing a suffix will be given in the form +

+ +
+ (condition) S1 → S2 +
+ +

+This means that if a word ends with the suffix S1, and the stem before S1 +satisfies the given condition, S1 is replaced by S2. The condition is +usually given in terms of m, e.g. +

+ +
+ (m > 1) EMENT → +
+ +

+Here S1 is ‘EMENT’ and S2 is null. This would map REPLACEMENT to REPLAC, +since REPLAC is a word part for which m = 2. +

+ +

+The ‘condition’ part may also contain the following: +

+ +
+
*S - the stem ends with S (and similarly for the other letters). + +
*v* - the stem contains a vowel. + +
*d - the stem ends with a double consonant (e.g. -TT, -SS). + +
*o - the stem ends cvc, where the second c is not W, X or Y (e.g. + -WIL, -HOP). +
+ +

+And the condition part may also contain expressions with and, or and +not, so that +

+ +
+ (m>1 and (*S or *T)) +
+ +

+tests for a stem with m>1 ending in S or T, while +

+ +
+ (*d and not (*L or *S or *Z)) +
+ +

+tests for a stem ending with a double consonant other than L, S or Z. +Elaborate conditions like this are required only rarely. +

+ +

+In a set of rules written beneath each other, only one is obeyed, and this +will be the one with the longest matching S1 for the given word. For +example, with +

+ +
+
SSES SS +
IES I +
SS SS +
S +
+ +

+(here the conditions are all null) CARESSES maps to CARESS since SSES is +the longest match for S1. Equally CARESS maps to CARESS (S1=‘SS’) and CARES +to CARE (S1=‘S’). +

+ +

+In the rules below, examples of their application, successful or otherwise, +are given on the right in lower case. The algorithm now follows: +

+ +

+Step 1a +

+ +
+
SSES SS      caresses caress +
IES I      ponies poni +
     ties ti +
SS SS      caress caress +
S      cats cat +
+ +

+Step 1b +

+ +
+
(m>0) EED EE      feed feed +
     agreed agree +
(*v*) ED      plastered plaster +
     bled bled +
(*v*) ING      motoring motor +
sing sing +
+ +

+If the second or third of the rules in Step 1b is successful, the following +is done: +

+ +
+
AT ATE      conflat(ed) conflate +
BL BLE      troubl(ed) trouble +
IZ IZE      siz(ed) size +
(*d and not (*L or *S or *Z)) + single letter      hopp(ing) hop +
     tann(ed) tan +
     fall(ing) fall +
     hiss(ing) hiss +
     fizz(ed) fizz +
(m=1 and *o) + E      fail(ing) fail +
     fil(ing) file +
+ +

+The rule to map to a single letter causes the removal of one of the double +letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes +-ATE, -BLE and -IZE can be recognised later. This E may be removed in step +4. +

+ +

+Step 1c +

+ +
+
(*v*) Y I      happy happi +
     sky sky +
+ +

+Step 1 deals with plurals and past participles. The subsequent steps are +much more straightforward. +

+ +

+Step 2 +

+ +
+
(m>0) ATIONAL ATE      relational relate +
(m>0) TIONAL TION      conditional condition +
     rational rational +
(m>0) ENCI ENCE      valenci valence +
(m>0) ANCI ANCE      hesitanci hesitance +
(m>0) IZER IZE      digitizer digitize +
(m>0) ABLI ABLE      conformabli conformable +
(m>0) ALLI AL      radicalli radical +
(m>0) ENTLI ENT      differentli different +
(m>0) ELI E      vileli vile +
(m>0) OUSLI OUS      analogousli analogous +
(m>0) IZATION IZE      vietnamization vietnamize +
(m>0) ATION ATE      predication predicate +
(m>0) ATOR ATE      operator operate +
(m>0) ALISM AL      feudalism feudal +
(m>0) IVENESS IVE      decisiveness decisive +
(m>0) FULNESS FUL      hopefulness hopeful +
(m>0) OUSNESS OUS      callousness callous +
(m>0) ALITI AL      formaliti formal +
(m>0) IVITI IVE      sensitiviti sensitive +
(m>0) BILITI BLE      sensibiliti sensible +
+ +

+The test for the string S1 can be made fast by doing a program switch on +the penultimate letter of the word being tested. This gives a fairly even +breakdown of the possible values of the string S1. It will be seen in fact +that the S1-strings in step 2 are presented here in the alphabetical order +of their penultimate letter. Similar techniques may be applied in the other +steps. +

+ +

+Step 3 +

+ +
+
(m>0) ICATE IC      triplicate triplic +
(m>0) ATIVE      formative form +
(m>0) ALIZE AL      formalize formal +
(m>0) ICITI IC      electriciti electric +
(m>0) ICAL IC      electrical electric +
(m>0) FUL      hopeful hope +
(m>0) NESS      goodness good +
+ +

+Step 4 +

+ +
+
(m>1) AL      revival reviv +
(m>1) ANCE      allowance allow +
(m>1) ENCE      inference infer +
(m>1) ER      airliner airlin +
(m>1) IC      gyroscopic gyroscop +
(m>1) ABLE      adjustable adjust +
(m>1) IBLE      defensible defens +
(m>1) ANT      irritant irrit +
(m>1) EMENT      replacement replac +
(m>1) MENT      adjustment adjust +
(m>1) ENT      dependent depend +
(m>1 and (*S or *T)) ION +      adoption adopt +
(m>1) OU      homologou homolog +
(m>1) ISM      communism commun +
(m>1) ATE      activate activ +
(m>1) ITI      angulariti angular +
(m>1) OUS      homologous homolog +
(m>1) IVE      effective effect +
(m>1) IZE      bowdlerize bowdler +
+ +

+The suffixes are now removed. All that remains is a little tidying up. +

+ +

+Step 5a +

+ +
+
(m>1) E      probate probat +
     rate rate +
(m=1 and not *o) E +      cease ceas +
+ +

+Step 5b +

+ +
+
(m > 1 and *d and *L) + single letter      controll control +
     roll roll +
+
+ +

+Now, turning it into Snowball. +

+ +

+The Porter stemmer makes a use of a measure, m, of the length of a word or +word part. If C is a sequence of one or more consonants, and V a sequence +of one or more vowels, any word part has the form +

+ +
+ [C](VC)m[V], +
+ +

+which is to be read as an optional C, followed by m repetitions of VC, +followed by an optional V. This defines m. So for crepuscular the +measure would be 4. +

+ +
+    c r e p u s c u l a r
+       |   |     |   |   |
+    [C] V C  V C  V C V C
+         1    2    3   4
+
+ +

+Most of the rules for suffix removal involve leaving behind a stem whose +measure exceeds some value, for example, +

+ +
+ (m > 0) eedee +
+ +

+means ‘replace eed with ee if the stem before eed has measure +m > 0’. Implementations of the Porter stemmer usually have a routine that +computes m each time there is a possible candidate for removal. +

+ +

+In fact the only tests on m in the Porter stemmer are m > 0, m > 1, and, +at two interesting points, m = 1. This suggests that there are two +critical positions in a word: the point at which, going from left to +right, m > 0 becomes true, and then the point at which m > 1 becomes true. +It turns out that m > 0 becomes true at the point after the first consonant +following a vowel, and m > 1 becomes true at the point after the first +consonant following a vowel following a consonant following a vowel. +Calling these positions p1 and p2, we can determine them quite simply in +Snowball: +

+ +
    define v 'aeiouy'
+
+    /* ... */
+
+    do(
+        gopast v  gopast non-v  setmark p1
+        gopast v  gopast non-v  setmark p2
+    )
+
+ + +

+The region to the right of p1 will be denoted by R1, the region to the +right of p2 by R2: +

+ +
+    c r e p u s c u l a r
+           |   |
+           p1  p2
+           <---  R1  --->
+               <-- R2 -->
+
+ +

+We can test for being in these regions with calls to  R1  and  R2, defined by, +

+ +
    define R1 as  <= cursor
+    define R2 as  <= cursor
+
+ + +

+and using these tests instead of computing m is acceptable, so long as the +stemming process never alters the p1 and p2 positions, which is indeed true +in the Porter stemmer. +

+ +

+A particularly interesting feature of the stemmers presented here is the +common use they make of the positions p1 and p2. The details of marking +p1 +and p2 vary between the languages because the definitions of vowel and +consonant vary. For example, French i preceded and followed by vowel +should be treated as a consonant (inquiétude); Portuguese ã and õ +should be treated as a vowel-consonant pair (São João). A third +important position is pV, which tries to mark the position of the shortest +acceptable verb stem. Its definition varies somewhat between languages. +The Porter stemmer does not use a pV explicitly, but the idea appears when +the verb endings ing and ed are removed only when preceded by a vowel. +In English therefore pV would be defined as the position after the first +vowel. +

+ +

+The Porter stemmer is divided into five steps, step 1 is divided further +into steps 1a, 1b and 1c, and step 5 into steps 5a and 5b. Step 1 removes +the i-suffixes, and steps 2 to 4 the d-suffixes (*). Composite d-suffixes are +reduced to single d-suffixes one at a time. So for example if a word ends +icational, step 2 reduces it to icate and step 3 to ic. Three steps are +sufficient for this process in English. Step 5 does some tidying up. +

+ +

+One can see how easily the stemming rules translate into Snowball by +comparing the definition of Step 1a from the 1980 paper, +

+ +
+    Step 1a:
+        SSES → SS                         caresses  →  caress
+        IES  → I                          ponies    →  poni
+                                           ties      →  ti
+        SS   → SS                         caress    →  caress
+        S    →                            cats      →  cat
+
+ +

+with its Snowball equivalent, +

+ +
    define Step_1a as (
+        [substring] among (
+            'sses' (<-'ss')
+            'ies'  (<-'i')
+            'ss'   ()
+            's'    (delete)
+        )
+    )
+
+ + +

+The word to be stemmed is being scanned right to left from the end. The +longest of  'sses',  'ies',  'ss'  or  's'  is searched for and defined as the +slice. (If none are found, Step_1a signals f.) If  'sses'  is found, it is +replaced by  'ss', and so on. Of course, replacing  'ss'  by  'ss'  is a dummy +action, so we can write +

+ +
            'ss'   ()
+
+ + +

+instead of +

+ +
            'ss'   (<-'ss')
+
+ + +

+Remember that  delete  just means  <- ''. +

+ +

+The really tricky part of the whole algorithm is step 1b, +which may be worth looking at in detail. Here it is, without the +example words on the far right, +

+ +
+    Step 1b:
+        (m > 0) EED → EE
+        (*v*)   ED  →
+        (*v*)   ING →
+
+    If the second or third of the rules in Step 1b is successful, the
+    following is done:
+
+        AT → ATE
+        BL → BLE
+        IZ → IZE
+        (*d and not (*L or *S or *Z)) → single letter
+        (m = 1 and *o) → E
+
+ +

+The first part of the rule means that eed maps to ee if eed is in R1 +(which is equivalent to m > 0), or ed and ing are removed if they are +preceded by a vowel. In Snowball this is simply, +

+ +
    define Step_1b as (
+        [substring] among (
+            'eed'  (R1 <-'ee')
+            'ed'
+            'ing'  (test gopast v  delete)
+        )
+    )
+
+ + +

+But this must be modified by the second part of the rule. *d indicates a +test for double letter consonant — bb, dd etc. *L, *S, *Z are tests +for l, s, z. *o is a short vowel test — it is matched by +consonant-vowel-consonant, where the consonant on the right is not w, x +or y. If the short vowel test is satisfied, m = 1 is equivalent to the +cursor being at p1. So the second part of the rule means, map at, bl, iz +to ate, ble, ize; map certain double letters to single letters; and +add e after a short vowel in words of one syllable. +

+ +

+We first need two extra groupings, +

+ +
    define v        'aeiouy'
+    define v_WXY    v + 'wxY'   // v with 'w', 'x' and 'y'-consonant
+    define v_LSZ    v + 'lsz'   // v with 'l', 's', 'z'
+
+ + +

+and a test for a short vowel, +

+ +
    define shortv as ( non-v_WXY v non-v )
+
+ + +

+(The  v_WXY  test comes first because we are scanning backwards, from right to +left.) +

+ +

+The double to single letter map can be done as follows: first define the +slice as the next  non-v_LSZ  and copy it to a string,  ch, as a single +character, +

+ +
    strings ( ch )
+
+    /* ... */
+
+    [non-v_LSZ] ->ch
+
+ + +

+A further test,  ch, tests that the next letter of the string is the same +as the one in  ch, and if this gives signal t,  delete  deletes the slice, +

+ +
    [non-v_LSZ] ->ch  ch  delete
+
+ + +

+Step_1b  can then be written like this, +

+ +
    define Step_1b as (
+        [substring] among (
+            'eed'  (R1 <-'ee')
+            'ed'
+            'ing' (
+                test gopast v  delete
+                (test among('at' 'bl' 'iz')  <+ 'e')
+                or
+                ([non-v_LSZ]->ch  ch  delete)
+                or
+                (atmark p1  test shortv  <+ 'e')
+            )
+        )
+    )
+
+ + +

+But we can improve the appearance, and speed, of this by turning the +second part of the rule into another  among  command, noting that the only +letters that need undoubling are b, d, f, g, m, n, p, r +and t, +

+ +
    define Step_1b as (
+        [substring] among (
+            'eed'  (R1 <-'ee')
+            'ed'
+            'ing' (
+                test gopast v  delete
+                test substring among(
+                    'at' 'bl' 'iz'
+                         (<+ 'e')
+                    'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+                    // ignoring double c, h, j, k, q, v, w, and x
+                         ([next]  delete)
+                    ''   (atmark p1  test shortv  <+ 'e')
+                )
+            )
+        )
+    )
+
+ + +

+Note the null string in the second  among, which acts as a default case. +

+ +

+The Porter stemmer in Snowball is given below. This is an exact +implementation of the algorithm described in the 1980 paper, unlike the +other implementations distributed by the author, which have, and have +always had, three small points of difference (clearly indicated) from the +original algorithm. Since all other implementations of the algorithm seen +by the author are in some degree inexact, this may well be the first ever +correct implementation. +

+ +

The full algorithm in Snowball

+ +
integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+   shortv
+   R1 R2
+   Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
+)
+
+externals ( stem )
+
+groupings ( v v_WXY )
+
+define v        'aeiouy'
+define v_WXY    v + 'wxY'
+
+backwardmode (
+
+    define shortv as ( non-v_WXY v non-v )
+
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define Step_1a as (
+        [substring] among (
+            'sses' (<-'ss')
+            'ies'  (<-'i')
+            'ss'   ()
+            's'    (delete)
+        )
+    )
+
+    define Step_1b as (
+        [substring] among (
+            'eed'  (R1 <-'ee')
+            'ed'
+            'ing' (
+                test gopast v  delete
+                test substring among(
+                    'at' 'bl' 'iz'
+                         (<+ 'e')
+                    'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+                    // ignoring double c, h, j, k, q, v, w, and x
+                         ([next]  delete)
+                    ''   (atmark p1  test shortv  <+ 'e')
+                )
+            )
+        )
+    )
+
+    define Step_1c as (
+        ['y' or 'Y']
+        gopast v
+        <-'i'
+    )
+
+    define Step_2 as (
+        [substring] R1 among (
+            'tional'  (<-'tion')
+            'enci'    (<-'ence')
+            'anci'    (<-'ance')
+            'abli'    (<-'able')
+            'entli'   (<-'ent')
+            'eli'     (<-'e')
+            'izer' 'ization'
+                      (<-'ize')
+            'ational' 'ation' 'ator'
+                      (<-'ate')
+            'alli'    (<-'al')
+            'alism' 'aliti'
+                      (<-'al')
+            'fulness' (<-'ful')
+            'ousli' 'ousness'
+                      (<-'ous')
+            'iveness' 'iviti'
+                      (<-'ive')
+            'biliti'  (<-'ble')
+        )
+    )
+
+    define Step_3 as (
+        [substring] R1 among (
+            'alize'   (<-'al')
+            'icate' 'iciti' 'ical'
+                      (<-'ic')
+            'ative' 'ful' 'ness'
+                      (delete)
+        )
+    )
+
+    define Step_4 as (
+        [substring] R2 among (
+            'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
+            'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
+                      (delete)
+            'ion'     ('s' or 't' delete)
+        )
+    )
+
+    define Step_5a as (
+        ['e']
+        R2 or (R1 not shortv)
+        delete
+    )
+
+    define Step_5b as (
+        ['l']
+        R2 'l'
+        delete
+    )
+)
+
+define stem as (
+
+    unset Y_found
+    do ( ['y'] <-'Y' set Y_found)
+    do repeat(goto (v ['y']) <-'Y' set Y_found)
+
+    $p1 = limit
+    $p2 = limit
+    do(
+        gopast v  gopast non-v  setmark p1
+        gopast v  gopast non-v  setmark p2
+    )
+
+    backwards (
+        do Step_1a
+        do Step_1b
+        do Step_1c
+        do Step_2
+        do Step_3
+        do Step_4
+        do Step_5a
+        do Step_5b
+    )
+
+    do(Y_found  repeat(goto (['Y']) <-'y'))
+
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/porter/stemmer.tt b/algorithms/porter/stemmer.tt new file mode 100644 index 0000000..c345a88 --- /dev/null +++ b/algorithms/porter/stemmer.tt @@ -0,0 +1,656 @@ +[% header('The Porter stemming algorithm') %] + +

Links to resources

+ + + +

+Here is a case study on how to code up a stemming algorithm in Snowball. First, +the definition of the Porter stemmer, as it appeared in Program, Vol 14 no. 3 pp +130-137, July 1980. +

+ +
+

THE ALGORITHM

+ +

+A consonant in a word is a letter other than A, E, I, O or U, and other +than Y preceded by a consonant. (The fact that the term ‘consonant’ is +defined to some extent in terms of itself does not make it ambiguous.) So in +TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a +letter is not a consonant it is a vowel. +

+ +

+A consonant will be denoted by c, a vowel by v. A list ccc... of length +greater than 0 will be denoted by C, and a list vvv... of length greater +than 0 will be denoted by V. Any word, or part of a word, therefore has one +of the four forms: +

+ +
+
CVCV ... C +
CVCV ... V +
VCVC ... C +
VCVC ... V +
+ +

+These may all be represented by the single form +

+ +
+ [C]VCVC ... [V] +
+ +

+where the square brackets denote arbitrary presence of their contents. +Using (VC)m to denote VC repeated m times, this may again be written as +

+ +
+ [C](VC)m[V]. +
+ +

+m will be called the measure of any word or word part when represented in +this form. The case m = 0 covers the null word. Here are some examples: +

+ +
+
m=0 TR,   EE,   TREE,   Y,   BY. +
m=1 TROUBLE,   OATS,   TREES,   IVY. +
m=2 TROUBLES,   PRIVATE,   OATEN,   ORRERY. +
+ +

+The rules for removing a suffix will be given in the form +

+ +
+ (condition) S1 → S2 +
+ +

+This means that if a word ends with the suffix S1, and the stem before S1 +satisfies the given condition, S1 is replaced by S2. The condition is +usually given in terms of m, e.g. +

+ +
+ (m > 1) EMENT → +
+ +

+Here S1 is ‘EMENT’ and S2 is null. This would map REPLACEMENT to REPLAC, +since REPLAC is a word part for which m = 2. +

+ +

+The ‘condition’ part may also contain the following: +

+ +
+
*S - the stem ends with S (and similarly for the other letters). + +
*v* - the stem contains a vowel. + +
*d - the stem ends with a double consonant (e.g. -TT, -SS). + +
*o - the stem ends cvc, where the second c is not W, X or Y (e.g. + -WIL, -HOP). +
+ +

+And the condition part may also contain expressions with and, or and +not, so that +

+ +
+ (m>1 and (*S or *T)) +
+ +

+tests for a stem with m>1 ending in S or T, while +

+ +
+ (*d and not (*L or *S or *Z)) +
+ +

+tests for a stem ending with a double consonant other than L, S or Z. +Elaborate conditions like this are required only rarely. +

+ +

+In a set of rules written beneath each other, only one is obeyed, and this +will be the one with the longest matching S1 for the given word. For +example, with +

+ +
+
SSES SS +
IES I +
SS SS +
S +
+ +

+(here the conditions are all null) CARESSES maps to CARESS since SSES is +the longest match for S1. Equally CARESS maps to CARESS (S1=‘SS’) and CARES +to CARE (S1=‘S’). +

+ +

+In the rules below, examples of their application, successful or otherwise, +are given on the right in lower case. The algorithm now follows: +

+ +

+Step 1a +

+ +
+
SSES SS      caresses caress +
IES I      ponies poni +
     ties ti +
SS SS      caress caress +
S      cats cat +
+ +

+Step 1b +

+ +
+
(m>0) EED EE      feed feed +
     agreed agree +
(*v*) ED      plastered plaster +
     bled bled +
(*v*) ING      motoring motor +
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;sing sing +
+ +

+If the second or third of the rules in Step 1b is successful, the following +is done: +

+ +
+
AT ATE      conflat(ed) conflate +
BL BLE      troubl(ed) trouble +
IZ IZE      siz(ed) size +
(*d and not (*L or *S or *Z)) + single letter      hopp(ing) hop +
     tann(ed) tan +
     fall(ing) fall +
     hiss(ing) hiss +
     fizz(ed) fizz +
(m=1 and *o) + E      fail(ing) fail +
     fil(ing) file +
+ +

+The rule to map to a single letter causes the removal of one of the double +letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes +-ATE, -BLE and -IZE can be recognised later. This E may be removed in step +4. +

+ +

+Step 1c +

+ +
+
(*v*) Y I      happy happi +
     sky sky +
+ +

+Step 1 deals with plurals and past participles. The subsequent steps are +much more straightforward. +

+ +

+Step 2 +

+ +
+
(m>0) ATIONAL ATE      relational relate +
(m>0) TIONAL TION      conditional condition +
     rational rational +
(m>0) ENCI ENCE      valenci valence +
(m>0) ANCI ANCE      hesitanci hesitance +
(m>0) IZER IZE      digitizer digitize +
(m>0) ABLI ABLE      conformabli conformable +
(m>0) ALLI AL      radicalli radical +
(m>0) ENTLI ENT      differentli different +
(m>0) ELI E      vileli vile +
(m>0) OUSLI OUS      analogousli analogous +
(m>0) IZATION IZE      vietnamization vietnamize +
(m>0) ATION ATE      predication predicate +
(m>0) ATOR ATE      operator operate +
(m>0) ALISM AL      feudalism feudal +
(m>0) IVENESS IVE      decisiveness decisive +
(m>0) FULNESS FUL      hopefulness hopeful +
(m>0) OUSNESS OUS      callousness callous +
(m>0) ALITI AL      formaliti formal +
(m>0) IVITI IVE      sensitiviti sensitive +
(m>0) BILITI BLE      sensibiliti sensible +
+ +

+The test for the string S1 can be made fast by doing a program switch on +the penultimate letter of the word being tested. This gives a fairly even +breakdown of the possible values of the string S1. It will be seen in fact +that the S1-strings in step 2 are presented here in the alphabetical order +of their penultimate letter. Similar techniques may be applied in the other +steps. +

+ +

+Step 3 +

+ +
+
(m>0) ICATE IC      triplicate triplic +
(m>0) ATIVE      formative form +
(m>0) ALIZE AL      formalize formal +
(m>0) ICITI IC      electriciti electric +
(m>0) ICAL IC      electrical electric +
(m>0) FUL      hopeful hope +
(m>0) NESS      goodness good +
+ +

+Step 4 +

+ +
+
(m>1) AL      revival reviv +
(m>1) ANCE      allowance allow +
(m>1) ENCE      inference infer +
(m>1) ER      airliner airlin +
(m>1) IC      gyroscopic gyroscop +
(m>1) ABLE      adjustable adjust +
(m>1) IBLE      defensible defens +
(m>1) ANT      irritant irrit +
(m>1) EMENT      replacement replac +
(m>1) MENT      adjustment adjust +
(m>1) ENT      dependent depend +
(m>1 and (*S or *T)) ION +      adoption adopt +
(m>1) OU      homologou homolog +
(m>1) ISM      communism commun +
(m>1) ATE      activate activ +
(m>1) ITI      angulariti angular +
(m>1) OUS      homologous homolog +
(m>1) IVE      effective effect +
(m>1) IZE      bowdlerize bowdler +
+ +

+The suffixes are now removed. All that remains is a little tidying up. +

+ +

+Step 5a +

+ +
+
(m>1) E      probate probat +
     rate rate +
(m=1 and not *o) E +      cease ceas +
+ +

+Step 5b +

+ +
+
(m > 1 and *d and *L) + single letter      controll control +
     roll roll +
+
+ +

+Now, turning it into Snowball. +

+ +

+The Porter stemmer makes use of a measure, m, of the length of a word or +word part. If C is a sequence of one or more consonants, and V a sequence +of one or more vowels, any word part has the form +

+ +
+ [C](VC)m[V], +
+ +

+which is to be read as an optional C, followed by m repetitions of VC, +followed by an optional V. This defines m. So for crepuscular the +measure would be 4. +

+ +
+    c r e p u s c u l a r
+       |   |     |   |   |
+    [C] V C  V C  V C V C
+         1    2    3   4
+
+ +

+Most of the rules for suffix removal involve leaving behind a stem whose +measure exceeds some value, for example, +

+ +
+ (m > 0) eedee +
+ +

+means ‘replace eed with ee if the stem before eed has measure +m > 0’. Implementations of the Porter stemmer usually have a routine that +computes m each time there is a possible candidate for removal. +

+ +

+In fact the only tests on m in the Porter stemmer are m > 0, m > 1, and, +at two interesting points, m = 1. This suggests that there are two +critical positions in a word: the point at which, going from left to +right, m > 0 becomes true, and then the point at which m > 1 becomes true. +It turns out that m > 0 becomes true at the point after the first consonant +following a vowel, and m > 1 becomes true at the point after the first +consonant following a vowel following a consonant following a vowel. +Calling these positions p1 and p2, we can determine them quite simply in +Snowball: +

+ +[% highlight(" + define v 'aeiouy' + + /* ... */ + + do( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +") %] + +

+The region to the right of p1 will be denoted by R1, the region to the +right of p2 by R2: +

+ +
+    c r e p u s c u l a r
+           |   |
+           p1  p2
+           <---  R1  --->
+               <-- R2 -->
+
+ +

+We can test for being in these regions with calls to  R1  and  R2, defined by, +

+ +[% highlight(" + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor +") %] + +

+and using these tests instead of computing m is acceptable, so long as the +stemming process never alters the p1 and p2 positions, which is indeed true +in the Porter stemmer. +

+ +

+A particularly interesting feature of the stemmers presented here is the +common use they make of the positions p1 and p2. The details of marking +p1 +and p2 vary between the languages because the definitions of vowel and +consonant vary. For example, French i preceded and followed by vowel +should be treated as a consonant (inquiétude); Portuguese ã and õ +should be treated as a vowel-consonant pair (São João). A third +important position is pV, which tries to mark the position of the shortest +acceptable verb stem. Its definition varies somewhat between languages. +The Porter stemmer does not use a pV explicitly, but the idea appears when +the verb endings ing and ed are removed only when preceded by a vowel. +In English therefore pV would be defined as the position after the first +vowel. +

+ +

+The Porter stemmer is divided into five steps, step 1 is divided further +into steps 1a, 1b and 1c, and step 5 into steps 5a and 5b. Step 1 removes +the i-suffixes, and steps 2 to 4 the d-suffixes (*). Composite d-suffixes are +reduced to single d-suffixes one at a time. So for example if a word ends +icational, step 2 reduces it to icate and step 3 to ic. Three steps are +sufficient for this process in English. Step 5 does some tidying up. +

+ +

+One can see how easily the stemming rules translate into Snowball by +comparing the definition of Step 1a from the 1980 paper, +

+ +
+    Step 1a:
+        SSES → SS                         caresses  →  caress
+        IES  → I                          ponies    →  poni
+                                           ties      →  ti
+        SS   → SS                         caress    →  caress
+        S    →                            cats      →  cat
+
+ +

+with its Snowball equivalent, +

+ +[% highlight(" + define Step_1a as ( + [substring] among ( + 'sses' (<-'ss') + 'ies' (<-'i') + 'ss' () + 's' (delete) + ) + ) +") %] + +

+The word to be stemmed is being scanned right to left from the end. The +longest of  'sses',  'ies',  'ss'  or  's'  is searched for and defined as the +slice. (If none are found, Step_1a signals f.) If  'sses'  is found, it is +replaced by  'ss', and so on. Of course, replacing  'ss'  by  'ss'  is a dummy +action, so we can write +

+ +[% highlight(" + 'ss' () +") %] + +

+instead of +

+ +[% highlight(" + 'ss' (<-'ss') +") %] + +

+Remember that  delete  just means  <- ''. +

+ +

+The really tricky part of the whole algorithm is step 1b, +which may be worth looking at in detail. Here it is, without the +example words on the far right, +

+ +
+    Step 1b:
+        (m > 0) EED → EE
+        (*v*)   ED  →
+        (*v*)   ING →
+
+    If the second or third of the rules in Step 1b is successful, the
+    following is done:
+
+        AT → ATE
+        BL → BLE
+        IZ → IZE
+        (*d and not (*L or *S or *Z)) → single letter
+        (m = 1 and *o) → E
+
+ +

+The first part of the rule means that eed maps to ee if eed is in R1 +(which is equivalent to m > 0), or ed and ing are removed if they are +preceded by a vowel. In Snowball this is simply, +

+ +[% highlight(" + define Step_1b as ( + [substring] among ( + 'eed' (R1 <-'ee') + 'ed' + 'ing' (test gopast v delete) + ) + ) +") %] + +

+But this must be modified by the second part of the rule. *d indicates a +test for double letter consonant — bb, dd etc. *L, *S, *Z are tests +for l, s, z. *o is a short vowel test — it is matched by +consonant-vowel-consonant, where the consonant on the right is not w, x +or y. If the short vowel test is satisfied, m = 1 is equivalent to the +cursor being at p1. So the second part of the rule means, map at, bl, iz +to ate, ble, ize; map certain double letters to single letters; and +add e after a short vowel in words of one syllable. +

+ +

+We first need two extra groupings, +

+ +[% highlight(" + define v 'aeiouy' + define v_WXY v + 'wxY' // v with 'w', 'x' and 'y'-consonant + define v_LSZ v + 'lsz' // v with 'l', 's', 'z' +") %] + +

+and a test for a short vowel, +

+ +[% highlight(" + define shortv as ( non-v_WXY v non-v ) +") %] + +

+(The  v_WXY  test comes first because we are scanning backwards, from right to +left.) +

+ +

+The double to single letter map can be done as follows: first define the +slice as the next  non-v_LSZ  and copy it to a string,  ch, as a single +character, +

+ +[% highlight(" + strings ( ch ) + + /* ... */ + + [non-v_LSZ] ->ch +") %] + +

+A further test,  ch, tests that the next letter of the string is the same +as the one in  ch, and if this gives signal t,  delete  deletes the slice, +

+ +[% highlight(" + [non-v_LSZ] ->ch ch delete +") %] + +

+Step_1b  can then be written like this, +

+ +[% highlight(" + define Step_1b as ( + [substring] among ( + 'eed' (R1 <-'ee') + 'ed' + 'ing' ( + test gopast v delete + (test among('at' 'bl' 'iz') <+ 'e') + or + ([non-v_LSZ]->ch ch delete) + or + (atmark p1 test shortv <+ 'e') + ) + ) + ) +") %] + +

+But we can improve the appearance, and speed, of this by turning the +second part of the rule into another  among  command, noting that the only +letters that need undoubling are b, d, f, g, m, n, p, r +and t, +

+ +[% highlight(" + define Step_1b as ( + [substring] among ( + 'eed' (R1 <-'ee') + 'ed' + 'ing' ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) +") %] + +

+Note the null string in the second  among, which acts as a default case. +

+ +

+The Porter stemmer in Snowball is given below. This is an exact +implementation of the algorithm described in the 1980 paper, unlike the +other implementations distributed by the author, which have, and have +always had, three small points of difference (clearly indicated) from the +original algorithm. Since all other implementations of the algorithm seen +by the author are in some degree inexact, this may well be the first ever +correct implementation. +

+ +

The full algorithm in Snowball

+ +[% highlight_file('porter') %] + +[% footer %] diff --git a/algorithms/portuguese/stemmer.html b/algorithms/portuguese/stemmer.html new file mode 100644 index 0000000..62f9c72 --- /dev/null +++ b/algorithms/portuguese/stemmer.html @@ -0,0 +1,650 @@ + + + + + + + + + + Portuguese stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Portuguese stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Portuguese vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+boa
+boainain
+boas
+bôas
+boassu
+boataria
+boate
+boates
+boatos
+bob
+boba
+bobagem
+bobagens
+bobalhões
+bobear
+bobeira
+bobinho
+bobinhos
+bobo
+bobs
+boca
+bocadas
+bocadinho
+bocado
+bocaiúva
+boçal
+bocarra
+bocas
+bode
+bodoque
+body
+boeing
+boem
+boemia
+boêmio
+boêmios
+bogotá
+boi
+bóia
+boiando
+
+boa
+boainain
+boas
+bôas
+boassu
+boat
+boat
+boat
+boat
+bob
+bob
+bobag
+bobagens
+bobalhõ
+bob
+bobeir
+bobinh
+bobinh
+bob
+bobs
+boc
+boc
+bocadinh
+boc
+bocaiúv
+boçal
+bocarr
+boc
+bod
+bodoqu
+body
+boeing
+boem
+boem
+boêmi
+boêmi
+bogot
+boi
+bói
+boi
+
+quiabo
+quicaram
+quickly
+quieto
+quietos
+quilate
+quilates
+quilinhos
+quilo
+quilombo
+quilométricas
+quilométricos
+quilômetro
+quilômetros
+quilos
+química
+químicas
+químico
+químicos
+quimioterapia
+quimioterápicos
+quimono
+quincas
+quinhão
+quinhentos
+quinn
+quino
+quinta
+quintal
+quintana
+quintanilha
+quintão
+quintessência
+quintino
+quinto
+quintos
+quintuplicou
+quinze
+quinzena
+quiosque
+
+quiab
+quic
+quickly
+quiet
+quiet
+quilat
+quilat
+quilinh
+quil
+quilomb
+quilométr
+quilométr
+quilômetr
+quilômetr
+quil
+químic
+químic
+químic
+químic
+quimioterap
+quimioteráp
+quimon
+quinc
+quinhã
+quinhent
+quinn
+quin
+quint
+quintal
+quintan
+quintanilh
+quintã
+quintessent
+quintin
+quint
+quint
+quintuplic
+quinz
+quinzen
+quiosqu
+
+
+ +

The stemming algorithm

+ +

+Letters in Portuguese include the following accented forms, +

+ +
+ á   é   í   ó   ú   â   ê   ô   ç   ã   õ   ü +
+The following letters are vowels: +
+ a   e   i   o   u   á   é   í   ó   ú   â   ê   ô +
+And the two nasalised vowel forms, +
+ ã   õ +
+ +

+should be treated as a vowel followed by a consonant. +

+ +

+ã and õ are therefore replaced by a~ and o~ in the word, where ~ is a +separate character to be treated as a consonant. And then — +

+ +

+R2 +(see the note on R1 and R2) +and RV have the same definition as in the + Spanish stemmer. +

+ +

+Always do step 1. +

+ +

+Step 1: Standard suffix removal +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
eza   ezas   ico   ica   icos   icas   ismo   ismos   + ável   ível   ista   istas   oso   osa   + osos   osas   amento   amentos   imento   imentos   + adora   ador   aça~o   adoras   adores   aço~es   + ante   antes   ância +
delete if in R2 +
logia   logias +
replace with log if in R2 +
ução   uções +
replace with u if in R2 +
ência   ências +
replace with ente if in R2 +
amente +
delete if in R1 +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by os, ic or ad, delete if in R2 +
mente +
delete if in R2 +
if preceded by ante, avel or ível, delete if in R2 +
idade   idades +
delete if in R2 +
if preceded by abil, ic or iv, delete if in R2 +
iva   ivo   ivas   ivos +
delete if in R2 +
if preceded by at, delete if in R2 +
ira   iras +
replace with ir if in RV and preceded by e +
+
+ +

+Do step 2 if no ending was removed by step 1. +

+ +

+Step 2: Verb suffixes +

+ +
+ Search for the longest among the following suffixes in RV, and if found, + delete. +
+ ada   ida   ia   aria   eria   iria   ará   ara   erá   era   irá   ava   asse   esse +   isse   aste   este   iste   ei   arei   erei   irei   am   iam   ariam   eriam   iriam +   aram   eram   iram   avam   em   arem   erem   irem   assem   essem   issem   ado   ido +   ando   endo   indo   ara~o   era~o   ira~o   ar   er   ir   as   adas   idas   ias   arias +   erias   irias   arás   aras   erás   eras   irás   avas   es   ardes   erdes +   irdes   ares   eres   ires   asses   esses   isses   astes   estes   istes   is   ais +   eis   íeis   aríeis   eríeis   iríeis   áreis   areis   éreis   ereis +   íreis   ireis   ásseis   ésseis   ísseis   áveis   ados   idos   ámos +   amos   íamos   aríamos   eríamos   iríamos   áramos   éramos +   íramos   ávamos   emos   aremos   eremos   iremos   ássemos   êssemos +   íssemos   imos   armos   ermos   irmos   eu   iu   ou   ira   + iras +

+If the last step to be obeyed — either step 1 or 2 — altered the word, +do step 3 +
+Step 3 +
+ Delete suffix i if in RV and preceded by c +
+ +

+Alternatively, if neither step 1 nor 2 altered the word, do step 4 +

+ +

+Step 4: Residual suffix +

+ +
+ If the word ends with one of the suffixes +
+ os   a   i   o   á   í   ó +

+ in RV, delete it +
+ +

+Always do step 5 +

+ +

+Step 5: +

+ +
+

+ If the word ends with one of +

+
+ e   é   ê +
+

+ in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, + delete the u (or i). +

+ +

+ Or if the word ends ç remove the cedilla +

+
+ +

+And finally: +

+ +
+ Turn a~, o~ back into ã, õ +
+ +

The same algorithm in Snowball

+ +
routines (
+           prelude postlude mark_regions
+           RV R1 R2
+           standard_suffix
+           verb_suffix
+           residual_suffix
+           residual_form
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a'   '{U+00E1}'  // a-acute
+stringdef a^   '{U+00E2}'  // a-circumflex e.g. 'bota^nico
+stringdef e'   '{U+00E9}'  // e-acute
+stringdef e^   '{U+00EA}'  // e-circumflex
+stringdef i'   '{U+00ED}'  // i-acute
+stringdef o^   '{U+00F4}'  // o-circumflex
+stringdef o'   '{U+00F3}'  // o-acute
+stringdef u'   '{U+00FA}'  // u-acute
+stringdef cc   '{U+00E7}'  // c-cedilla
+
+stringdef a~   '{U+00E3}'  // a-tilde
+stringdef o~   '{U+00F5}'  // o-tilde
+
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
+
+define prelude as repeat (
+    [substring] among(
+        '{a~}' (<- 'a~')
+        '{o~}' (<- 'o~')
+        ''     (next)
+    ) //or next
+)
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit  // defaults
+
+    do (
+        ( v (non-v gopast v) or (v gopast non-v) )
+        or
+        ( non-v (non-v gopast v) or (v next) )
+        setmark pV
+    )
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define postlude as repeat (
+    [substring] among(
+        'a~' (<- '{a~}')
+        'o~' (<- '{o~}')
+        ''   (next)
+    ) //or next
+)
+
+backwardmode (
+
+    define RV as $pV <= cursor
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define standard_suffix as (
+        [substring] among(
+
+            'eza' 'ezas'
+            'ico' 'ica' 'icos' 'icas'
+            'ismo' 'ismos'
+            '{a'}vel'
+            '{i'}vel'
+            'ista' 'istas'
+            'oso' 'osa' 'osos' 'osas'
+            'amento' 'amentos'
+            'imento' 'imentos'
+
+           'adora' 'ador' 'a{cc}a~o'
+           'adoras' 'adores' 'a{cc}o~es'  // no -ic test
+           'ante' 'antes' '{a^}ncia' // Note 1
+            (
+                R2 delete
+            )
+            'logia'
+            'logias'
+            (
+                R2 <- 'log'
+            )
+            'u{cc}a~o' 'u{cc}o~es'
+            (
+                R2 <- 'u'
+            )
+            '{e^}ncia' '{e^}ncias'
+            (
+                R2 <- 'ente'
+            )
+            'amente'
+            (
+                R1 delete
+                try (
+                    [substring] R2 delete among(
+                        'iv' (['at'] R2 delete)
+                        'os'
+                        'ic'
+                        'ad'
+                    )
+                )
+            )
+            'mente'
+            (
+                R2 delete
+                try (
+                    [substring] among(
+                        'ante' // Note 1
+                        'avel'
+                        '{i'}vel' (R2 delete)
+                    )
+                )
+            )
+            'idade'
+            'idades'
+            (
+                R2 delete
+                try (
+                    [substring] among(
+                        'abil'
+                        'ic'
+                        'iv'   (R2 delete)
+                    )
+                )
+            )
+            'iva' 'ivo'
+            'ivas' 'ivos'
+            (
+                R2 delete
+                try (
+                    ['at'] R2 delete // but not a further   ['ic'] R2 delete
+                )
+            )
+            'ira' 'iras'
+            (
+                RV 'e'  // -eira -eiras usually non-verbal
+                <- 'ir'
+            )
+        )
+    )
+
+    define verb_suffix as setlimit tomark pV for (
+        [substring] among(
+            'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
+            'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
+            'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
+            'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
+            'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
+            'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
+            'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
+            'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
+            'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
+            'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
+            '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
+            '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
+            '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
+            'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
+            'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
+            '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
+
+            'ira' 'iras'
+                (delete)
+        )
+    )
+
+    define residual_suffix as (
+        [substring] among(
+            'os'
+            'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
+                ( RV delete )
+        )
+    )
+
+    define residual_form as (
+        [substring] among(
+            'e' '{e'}' '{e^}'
+                ( RV delete [('u'] test 'g') or
+                             ('i'] test 'c') RV delete )
+            '{cc}' (<-'c')
+        )
+    )
+)
+
+define stem as (
+    do prelude
+    do mark_regions
+    backwards (
+        do (
+            ( ( standard_suffix or verb_suffix )
+              and do ( ['i'] test 'c' RV delete )
+            )
+            or residual_suffix
+        )
+        do residual_form
+    )
+    do postlude
+)
+
+/*
+    Note 1: additions of 15 Jun 2005
+*/
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/portuguese/stemmer.tt b/algorithms/portuguese/stemmer.tt new file mode 100644 index 0000000..945a37e --- /dev/null +++ b/algorithms/portuguese/stemmer.tt @@ -0,0 +1,175 @@ +[% header('Portuguese stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'boa', 'quiabo']) %] + +

The stemming algorithm

+ +

+Letters in Portuguese include the following accented forms, +

+ +
+ á   é   í   ó   ú   â   ê   ô   ç   ã   õ   ü +
+The following letters are vowels: +
+ a   e   i   o   u   á   é   í   ó   ú   â   ê   ô +
+And the two nasalised vowel forms, +
+ ã   õ +
+ +

+should be treated as a vowel followed by a consonant. +

+ +

+ã and õ are therefore replaced by a~ and o~ in the word, where ~ is a +separate character to be treated as a consonant. And then — +

+ +

+R2 +(see the note on R1 and R2) +and RV have the same definition as in the + Spanish stemmer. +

+ +

+Always do step 1. +

+ +

+Step 1: Standard suffix removal +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
eza   ezas   ico   ica   icos   icas   ismo   ismos   + ável   ível   ista   istas   oso   osa   + osos   osas   amento   amentos   imento   imentos   + adora   ador   aça~o   adoras   adores   aço~es   + ante   antes   ância +
delete if in R2 +
logia   logias +
replace with log if in R2 +
ução   uções +
replace with u if in R2 +
ência   ências +
replace with ente if in R2 +
amente +
delete if in R1 +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by os, ic or ad, delete if in R2 +
mente +
delete if in R2 +
if preceded by ante, avel or ível, delete if in R2 +
idade   idades +
delete if in R2 +
if preceded by abil, ic or iv, delete if in R2 +
iva   ivo   ivas   ivos +
delete if in R2 +
if preceded by at, delete if in R2 +
ira   iras +
replace with ir if in RV and preceded by e +
+
+ +

+Do step 2 if no ending was removed by step 1. +

+ +

+Step 2: Verb suffixes +

+ +
+ Search for the longest among the following suffixes in RV, and if found, + delete. +
+ ada   ida   ia   aria   eria   iria   ará   ara   erá   era   irá   ava   asse   esse +   isse   aste   este   iste   ei   arei   erei   irei   am   iam   ariam   eriam   iriam +   aram   eram   iram   avam   em   arem   erem   irem   assem   essem   issem   ado   ido +   ando   endo   indo   ara~o   era~o   ira~o   ar   er   ir   as   adas   idas   ias   arias +   erias   irias   arás   aras   erás   eras   irás   avas   es   ardes   erdes +   irdes   ares   eres   ires   asses   esses   isses   astes   estes   istes   is   ais +   eis   íeis   aríeis   eríeis   iríeis   áreis   areis   éreis   ereis +   íreis   ireis   ásseis   ésseis   ísseis   áveis   ados   idos   ámos +   amos   íamos   aríamos   eríamos   iríamos   áramos   éramos +   íramos   ávamos   emos   aremos   eremos   iremos   ássemos   êssemos +   íssemos   imos   armos   ermos   irmos   eu   iu   ou   ira   + iras +

+If the last step to be obeyed — either step 1 or 2 — altered the word, +do step 3 +
+Step 3 +
+ Delete suffix i if in RV and preceded by c +
+ +

+Alternatively, if neither steps 1 nor 2 altered the word, do step 4 +

+ +

+Step 4: Residual suffix +

+ +
+ If the word ends with one of the suffixes +
+ os   a   i   o   á   í   ó +

+ in RV, delete it +
+ +

+Always do step 5 +

+ +

+Step 5: +

+ +
+

+ If the word ends with one of +

+
+ e   é   ê +
+

+ in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, + delete the u (or i). +

+ +

+ Or if the word ends ç remove the cedilla +

+
+ +

+And finally: +

+ +
+ Turn a~, o~ back into ã, õ +
+ +

The same algorithm in Snowball

+ +[% highlight_file('portuguese') %] + +[% footer %] diff --git a/algorithms/portuguese/stop.txt b/algorithms/portuguese/stop.txt new file mode 100644 index 0000000..9c3c9ac --- /dev/null +++ b/algorithms/portuguese/stop.txt @@ -0,0 +1,245 @@ + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. + +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/algorithms/romance.html b/algorithms/romance.html new file mode 100644 index 0000000..8a64be3 --- /dev/null +++ b/algorithms/romance.html @@ -0,0 +1,170 @@ + + + + + + + + + + Romance language stemmers - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Romance language stemmers

+ + +

Links to resources

+ + + +

+The Romance languages have a wealth of different i-suffixes (*) among the verb +forms, and relatively few for the other parts of speech. In addition to +this, many verbs exhibit irregularities. Many also have short stems, +leading to dangers of over-stemming. The verb, therefore, tends to +dominate initial thinking about stemming in these languages. +

+ +

+An algorithmic stemmer can usually reduce the multiple forms of a verb to at +most two or three, and often just one. This is probably +adequate for standard IR use, where the verb is used rather less than other +parts of speech in short queries. +

+ +

+In French the verb endings ent and ons cannot be removed without +unacceptable overstemming. The ons form is rarer, but ent forms +are quite common, and will appear regularly throughout a stemmed vocabulary. +

+ +

+In Italian, the final vowel of nouns and adjectives indicates number and +gender (amico is male friend, amica is female friend) and its removal is a +necessary part of stemming, but the final vowel sometimes separates words +of different meanings (banco is bench, banca is bank), which leads to some +over-stemming. +

+ +

+The d-suffixes of all four languages follow a similar pattern. They can be +tabulated as follows, +

+ +
+ +
French Spanish Portug. Italian +
+
noun ANCE ance anza eza anza +
adjective IC ique ico ico ico +
noun ISM isme ismo ismo ismo +
adjective ABLE able able ável abile +
adjective IBLE - ible ível ibile +
noun IST iste ista ista ista +
adjective OUS eux oso oso oso +
noun MENT ment amiento amento mente +
noun ATOR ateur ador ador attore +
noun ATRESS atrice - - atrice +
noun ATION ation ación ação azione +
noun LOGY logie logía logía logia +
noun USION usion ución ución uzione +
noun ENCE ence encia ência enza +
adjective ENT ent ente ente ente + +
noun ANCE ance ancia ância anza +
noun ANT ant ante ante ante + +
adverb LY (e)ment (a)mente (a)mente +(a)mente +
noun ITY ité idad idade ità +
adjective IVE if ive ivo ivo +
verb ATE at at at at +
+ +

+Equivalent English forms are shown in upper case. In English, ATE is a valid ending, but +in the Romance languages it only exists in combinations. The endings can appear in a +number of styles. In Italian, oso can also be osa, osi or ose, French +ique becomes ic in combinations. +

+ +

+The important combining forms are summarised in the following picture: +

+ +Graph showing important combining forms in English + +

+In English, ABLE combines with LY to form ABLY. So in French, for example, +able combines with (e)ment to form ablement. +In some languages particular combinations are rare. In Italian, for example, +ANT + LY, which would be the ending antemente, is so rare that it does not +figure in the stemming algorithm. +According to the picture, we +should encounter the forms ICATIVELY and ICATIVITY, and dictionaries +instance a few English words with these endings (communicatively for +example). +But in practice three is the maximum number of derivational +suffixes that one need consider in combination. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/romance.tt b/algorithms/romance.tt new file mode 100644 index 0000000..2c1d703 --- /dev/null +++ b/algorithms/romance.tt @@ -0,0 +1,106 @@ +[% header('Romance language stemmers') %] + +

Links to resources

+ + + +

+The Romance languages have a wealth of different i-suffixes (*) among the verb +forms, and relatively few for the other parts of speech. In addition to +this, many verbs exhibit irregularities. Many also have short stems, +leading to dangers of over-stemming. The verb, therefore, tends to +dominate initial thinking about stemming in these languages. +

+ +

+An algorithmic stemmer can usually reduce the multiple forms of a verb to at +most two or three, and often just one. This is probably +adequate for standard IR use, where the verb is used rather less than other +parts of speech in short queries. +

+ +

+In French the verb endings ent and ons cannot be removed without +unacceptable overstemming. The ons form is rarer, but ent forms +are quite common, and will appear regularly throughout a stemmed vocabulary. +

+ +

+In Italian, the final vowel of nouns and adjectives indicates number and +gender (amico is male friend, amica is female friend) and its removal is a +necessary part of stemming, but the final vowel sometimes separates words +of different meanings (banco is bench, banca is bank), which leads to some +over-stemming. +

+ +

+The d-suffixes of all four languages follow a similar pattern. They can be +tabulated as follows, +

+ +
+ +
French Spanish Portug. Italian +
+
noun ANCE ance anza eza anza +
adjective IC ique ico ico ico +
noun ISM isme ismo ismo ismo +
adjective ABLE able able ável abile +
adjective IBLE - ible ível ibile +
noun IST iste ista ista ista +
adjective OUS eux oso oso oso +
noun MENT ment amiento amento mente +
noun ATOR ateur ador ador attore +
noun ATRESS atrice - - atrice +
noun ATION ation ación ação azione +
noun LOGY logie logía logía logia +
noun USION usion ución ución uzione +
noun ENCE ence encia ência enza +
adjective ENT ent ente ente ente + +
noun ANCE ance ancia ância anza +
noun ANT ant ante ante ante + +
adverb LY (e)ment (a)mente (a)mente +(a)mente +
noun ITY ité idad idade ità +
adjective IVE if ive ivo ivo +
verb ATE at at at at +
+ +

+Equivalent English forms are shown in upper case. In English, ATE is a valid ending, but +in the Romance languages it only exists in combinations. The endings can appear in a +number of styles. In Italian, oso can also be osa, osi or ose, French +ique becomes ic in combinations. +

+ +

+The important combining forms are summarised in the following picture: +

+ +Graph showing important combining forms in English + +

+In English, ABLE combines with LY to form ABLY. So in French, for example, +able combines with (e)ment to form ablement. +In some languages particular combinations are rare. In Italian, for example, +ANT + LY, which would be the ending antemente, is so rare that it does not +figure in the stemming algorithm. +According to the picture, we +should encounter the forms ICATIVELY and ICATIVITY, and dictionaries +instance a few English words with these endings (communicatively for +example). +But in practice three is the maximum number of derivational +suffixes that one need consider in combination. +

+ +[% footer %] diff --git a/algorithms/romanian/stemmer.html b/algorithms/romanian/stemmer.html new file mode 100644 index 0000000..eacfc68 --- /dev/null +++ b/algorithms/romanian/stemmer.html @@ -0,0 +1,728 @@ + + + + + + + + + + Romanian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Romanian stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Romanian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+abruptă
+absent
+absentă
+absente
+absenţa
+absența
+absenţă
+absență
+absenţi
+absenți
+absolut
+absoluta
+absolută
+absolute
+absolutul
+absolutului
+absoluţi
+absoluți
+absolve
+absolvenţi
+absolvenți
+absolvenţii
+absolvenții
+absolvi
+absolvire
+absolvit
+absolvită
+absolviţi
+absolviți
+absorbant
+absorbantă
+absorbi
+absorbit
+absorbite
+absorbiţi
+absorbiți
+absorbţia
+absorbția
+abstinent
+abstract
+
+abrupt
+absent
+absent
+absent
+absenț
+absenț
+absenț
+absenț
+absenț
+absenț
+absol
+absol
+absol
+absol
+absol
+absol
+absoluț
+absoluț
+absolv
+absolvenț
+absolvenț
+absolvenț
+absolvenț
+absolv
+absolv
+absolv
+absolv
+absolv
+absolv
+absorb
+absorb
+absorb
+absorb
+absorb
+absorb
+absorb
+absorbț
+absorbț
+abstinent
+abstract
+
+ocol
+ocolea
+ocolesc
+ocoleşte
+ocolește
+ocoleşti
+ocolești
+ocoli
+ocolim
+ocolind
+ocolire
+ocolişuri
+ocolișuri
+ocolit
+ocolită
+ocoliţi
+ocoliți
+ocolul
+ocoluri
+ocolurile
+ocrotit
+ocrotitoare
+ocrotitor
+ocrotiţi
+ocrotiți
+octavă
+octavian
+octet
+octeţi
+octeți
+octogenarul
+octombrie
+ocular
+ocult
+ocultarea
+ocultat
+ocultă
+ocultării
+oculţi
+oculți
+
+ocol
+ocol
+ocol
+ocol
+ocol
+ocol
+ocol
+ocol
+ocol
+ocol
+ocol
+ocolișur
+ocolișur
+ocol
+ocol
+ocol
+ocol
+ocol
+ocolur
+ocolur
+ocrot
+ocrot
+ocrot
+ocrot
+ocrot
+octav
+octavian
+octet
+octeț
+octeț
+octogenar
+octombr
+ocular
+ocult
+ocult
+ocult
+ocult
+ocultăr
+oculț
+oculț
+
+
+ +

+(For the background to this work, see the
+<a href="/credits.html">credits page</a>. Following earlier misgivings on the wisdom
+of removing IST/ISM endings, in this stemmer they are now conflated to a single
+form. It can easily be modified to bring it in line with the other Romance
+stemmers: see the internal comments marked ‘IST’.
+</p>

+ +

+It is assumed that hyphenated forms are split into separate words prior to +stemming.) +

+ +

The stemming algorithm

+ +

+Letters in Romanian include the following accented forms, +

+ +
+ ă   â   î   ș   ț +
+ +

+The following letters are vowels: +

+ +
+ a   ă   â   e   i   î   o   u +
+ +

+Before full Unicode support was widespread it was common to use ş and +ţ (cedilla instead of comma-below) in Romanian text as these characters +were more readily available in 8-bit character sets. The original version of +this algorithm only recognised the cedilla forms, but the current version +instead normalises the old forms as a first step: replace ş by +ș and ţ by ț. +

+ +

+Then, i and u between vowels are put into upper case +(so that they are treated as consonants). +

+ +

+R1, R2 +(see the note on R1 and R2) +and RV then have the same definition as in the + Spanish stemmer. +

+ +

+Always do steps 0, 1, 2 and 4. (Step 3 is conditional on steps 1 and 2.) +

+ +

+Step 0: Removal of plurals (and other simplifications) +

+ +
+ Search for the longest among the following suffixes, and, if + it is in R1, perform the + action indicated. +
+
ul   ului +
delete +
aua +
replace with a +
ea   ele   elor +
replace with e +
ii   iua   iei   iile   iilor   ilor +
replace with i +
ile +
replace with i if not preceded by ab +
atei +
replace with at +
ație   ația +
replace with ați +
+
+ +

+Step 1: Reduction of combining suffixes +

+ +
+ Search for the longest among the following suffixes, and, if
+ it is in R1, perform the replacement action indicated.
+ Then repeat this step until no replacement occurs.
+
abilitate   abilitati   abilităi   abilități +
replace with abil +
ibilitate +
replace with ibil +
ivitate   ivitati   ivităi   ivități +
replace with iv +
icitate   icitati   icităi   icități   + icator   icatori   + iciv   iciva   icive   icivi   icivă   + ical   icala   icale   icali   icală +
replace with ic +
ativ   ativa   ative   ativi   ativă   ațiune   + atoare   ator   atori   + ătoare   ător   ători +
replace with at +
itiv   itiva   itive   itivi   itivă   ițiune   + itoare   itor   itori +
replace with it +
+
+ +

+Step 2: Removal of ‘standard’ suffixes +

+ +
+ Search for the longest among the following suffixes, and, if + it is in R2, perform the action indicated. +
+
at   ata   ată   ati   ate   + ut   uta   ută   uti   ute   + it   ita   ită   iti   ite   + ic   ica   ice   ici   ică   + abil   abila   abile   abili   abilă   + ibil   ibila   ibile   ibili   ibilă   + oasa   oasă   oase   os   osi   oși   + ant   anta   ante   anti   antă   + ator   atori   + itate   itati   ităi   ități   + iv   iva   ive   ivi   ivă +
delete +
iune   iuni +
delete if preceded by ț, and replace the ț by t. +
ism   isme   + ist   ista   iste   isti   istă   iști +
replace with ist +
+
+ +

+Do step 3 if no suffix was removed either by step 1 or step 2. +

+ +

+Step 3: Removal of verb suffixes +

+ +
+ Search for the longest suffix in region RV among the following, + and perform the action indicated. +
+
are   ere   ire   âre   + ind   ând   + indu   ându   + eze   + ească   + ez   ezi   ează   esc   ești   + ește   + ăsc   ăști   + ăște   + am   ai   au   + eam   eai   ea   eați   eau   + iam   iai   ia   iați   iau   + ui   + ași   arăm   arăți   ară   + uși   urăm   urăți   ură   + iși   irăm   irăți   iră   + âi   âși   ârăm   ârăți   âră   + asem   aseși   ase   aserăm   aserăți   aseră   + isem   iseși   ise   iserăm   iserăți   iseră   + âsem   âseși   âse   âserăm   âserăți   âseră   + usem   useși   use   userăm   userăți   useră + +
delete if preceded in RV by a consonant or u +
ăm   ați   + em   eți   + im   iți   + âm   âți   + seși   serăm   serăți   seră   + sei   se   + sesem   seseși   sese   seserăm   seserăți   seseră +
delete +
+
+ +

+Step 4: Removal of final vowel +

+ +
+Search for the longest among the suffixes +
+ a   e   i   ie   ă +
+and, if it is in RV, delete it. +
+ +

+And finally: +

+ +
+ Turn I, U back into i, u. +
+ +

The same algorithm in Snowball

+ +
routines (
+           norm
+           prelude postlude mark_regions
+           RV R1 R2
+           step_0
+           standard_suffix combo_suffix
+           verb_suffix
+           vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+booleans  ( standard_suffix_removed )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a^   '{U+00E2}'  // a circumflex
+stringdef i^   '{U+00EE}'  // i circumflex
+stringdef a+   '{U+0103}'  // a breve
+stringdef sc   '{U+015F}'  // s cedilla
+stringdef tc   '{U+0163}'  // t cedilla
+stringdef s,   '{U+0219}'  // s comma
+stringdef t,   '{U+021B}'  // t comma
+
+define v 'aeiou{a^}{i^}{a+}'
+
+// Normalize old cedilla forms to correct comma-below forms.
+define norm as (
+    do repeat goto (
+        [substring] among (
+            '{sc}'    (<- '{s,}')
+            '{tc}'    (<- '{t,}')
+        )
+    )
+)
+
+define prelude as (
+    repeat goto (
+        v [ ('u' ] v <- 'U') or
+            ('i' ] v <- 'I')
+    )
+)
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit // defaults
+
+    do (
+        ( v (non-v gopast v) or (v gopast non-v) )
+        or
+        ( non-v (non-v gopast v) or (v next) )
+        setmark pV
+    )
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define postlude as repeat (
+
+    [substring] among(
+        'I'  (<- 'i')
+        'U'  (<- 'u')
+        ''   (next)
+    )
+
+)
+
+backwardmode (
+
+    define RV as $pV <= cursor
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define step_0 as (
+        [substring] R1 among(
+            'ul' 'ului'
+                ( delete )
+            'aua'
+                ( <-'a' )
+            'ea' 'ele' 'elor'
+                ( <-'e' )
+            'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
+                ( <-'i')
+            'ile'
+                ( not 'ab' <- 'i' )
+            'atei'
+                ( <- 'at' )
+            'a{t,}ie' 'a{t,}ia'
+                ( <- 'a{t,}i' )
+        )
+    )
+
+    define combo_suffix as test (
+        [substring] R1 (
+            among(
+            /* 'IST'. alternative: include the following
+                'alism' 'alisme'
+                'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
+                    <- 'al'
+                )
+            */
+                'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
+                    <- 'abil'
+                )
+                'ibilitate' (
+                    <- 'ibil'
+                )
+                'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
+                    <- 'iv'
+                )
+                'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
+                'icator' 'icatori'
+                'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
+                'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
+                    <- 'ic'
+                )
+                'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
+                'atoare' 'ator' 'atori'
+                '{a+}toare' '{a+}tor' '{a+}tori' (
+                    <- 'at'
+                )
+                'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
+                'itoare' 'itor' 'itori' (
+                    <- 'it'
+                )
+            )
+            set standard_suffix_removed
+        )
+    )
+
+    define standard_suffix as (
+        unset standard_suffix_removed
+        repeat combo_suffix
+        [substring] R2 (
+            among(
+
+                // past participle is treated here, rather than
+                // as a verb ending:
+                'at' 'ata' 'at{a+}' 'ati' 'ate'
+                'ut' 'uta' 'ut{a+}' 'uti' 'ute'
+                'it' 'ita' 'it{a+}' 'iti' 'ite'
+
+                'ic' 'ica' 'ice' 'ici' 'ic{a+}'
+                'abil' 'abila' 'abile' 'abili' 'abil{a+}'
+                'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
+                'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
+                'ant' 'anta' 'ante' 'anti' 'ant{a+}'
+                'ator' 'atori'
+                'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
+                'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
+                    delete
+                )
+                'iune' 'iuni' (
+                    '{t,}'] <- 't'
+                )
+                'ism' 'isme'
+                'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
+                    <- 'ist'
+                    /* 'IST'. alternative: remove with <- '' */
+                )
+            )
+            set standard_suffix_removed
+        )
+    )
+
+    define verb_suffix as setlimit tomark pV for (
+        [substring] among(
+            // 'long' infinitive:
+            'are' 'ere' 'ire' '{a^}re'
+
+            // gerund:
+            'ind' '{a^}nd'
+            'indu' '{a^}ndu'
+
+            'eze'
+            'easc{a+}'
+            // present:
+            'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
+            'e{s,}te'
+            '{a+}sc' '{a+}{s,}ti'
+            '{a+}{s,}te'
+
+            // imperfect:
+            'am' 'ai' 'au'
+            'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
+            'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
+
+            // past: // (not 'ii')
+            'ui'
+            'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
+            'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
+            'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
+            '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
+
+            // pluperfect:
+            'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
+            'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
+            '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
+            '{a^}ser{a+}'
+            'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
+
+                ( non-v or 'u'  delete )
+
+            // present:
+            '{a+}m' 'a{t,}i'
+            'em' 'e{t,}i'
+            'im' 'i{t,}i'
+            '{a^}m' '{a^}{t,}i'
+
+            // past:
+            'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
+            'sei' 'se'
+
+            // pluperfect:
+            'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
+                (delete)
+        )
+    )
+
+    define vowel_suffix as (
+        [substring] RV among (
+            'a' 'e' 'i' 'ie' '{a+}' ( delete )
+        )
+    )
+)
+
+define stem as (
+    do norm
+    do prelude
+    do mark_regions
+    backwards (
+        do step_0
+        do standard_suffix
+        do ( standard_suffix_removed or verb_suffix )
+        do vowel_suffix
+    )
+    do postlude
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/romanian/stemmer.tt b/algorithms/romanian/stemmer.tt new file mode 100644 index 0000000..3e70031 --- /dev/null +++ b/algorithms/romanian/stemmer.tt @@ -0,0 +1,224 @@ +[% header('Romanian stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'abruptă', 'ocol']) %] + +

+(For the background to this work, see the +credits page. Following earlier misgivings on the wisdom +of removing IST/ISM endings, in this stemmer they are now conflated to a single +form. It can easily be modified it to bring it in line with the other Romance +stemmers: see the internal comments marked ‘IST’. +

+ +

+It is assumed that hyphenated forms are split into separate words prior to +stemming.) +

+ +

The stemming algorithm

+ +

+Letters in Romanian include the following accented forms, +

+ +
+ ă   â   î   ș   ț +
+ +

+The following letters are vowels: +

+ +
+ a   ă   â   e   i   î   o   u +
+ +

+Before full Unicode support was widespread it was common to use ş and +ţ (cedilla instead of comma-below) in Romanian text as these characters +were more readily available in 8-bit character sets. The original version of +this algorithm only recognised the cedilla forms, but the current version +instead normalises the old forms as a first step: replace ş by +ș and ţ by ț. +

+ +

+Then, i and u between vowels are put into upper case +(so that they are treated as consonants). +

+ +

+R1, R2 +(see the note on R1 and R2) +and RV then have the same definition as in the + Spanish stemmer. +

+ +

+Always do steps 0, 1, 2 and 4. (Step 3 is conditional on steps 1 and 2.) +

+ +

+Step 0: Removal of plurals (and other simplifications) +

+ +
+ Search for the longest among the following suffixes, and, if + it is in R1, perform the + action indicated. +
+
ul   ului +
delete +
aua +
replace with a +
ea   ele   elor +
replace with e +
ii   iua   iei   iile   iilor   ilor +
replace with i +
ile +
replace with i if not preceded by ab +
atei +
replace with at +
ație   ația +
replace with ați +
+
+ +

+Step 1: Reduction of combining suffixes +

+ +
+ Search for the longest among the following suffixes, and, if
+ it is in R1, perform the replacement action indicated.
+ Then repeat this step until no replacement occurs.
+
abilitate   abilitati   abilităi   abilități +
replace with abil +
ibilitate +
replace with ibil +
ivitate   ivitati   ivităi   ivități +
replace with iv +
icitate   icitati   icităi   icități   + icator   icatori   + iciv   iciva   icive   icivi   icivă   + ical   icala   icale   icali   icală +
replace with ic +
ativ   ativa   ative   ativi   ativă   ațiune   + atoare   ator   atori   + ătoare   ător   ători +
replace with at +
itiv   itiva   itive   itivi   itivă   ițiune   + itoare   itor   itori +
replace with it +
+
+ +

+Step 2: Removal of ‘standard’ suffixes +

+ +
+ Search for the longest among the following suffixes, and, if + it is in R2, perform the action indicated. +
+
at   ata   ată   ati   ate   + ut   uta   ută   uti   ute   + it   ita   ită   iti   ite   + ic   ica   ice   ici   ică   + abil   abila   abile   abili   abilă   + ibil   ibila   ibile   ibili   ibilă   + oasa   oasă   oase   os   osi   oși   + ant   anta   ante   anti   antă   + ator   atori   + itate   itati   ităi   ități   + iv   iva   ive   ivi   ivă +
delete +
iune   iuni +
delete if preceded by ț, and replace the ț by t. +
ism   isme   + ist   ista   iste   isti   istă   iști +
replace with ist +
+
+ +

+Do step 3 if no suffix was removed either by step 1 or step 2. +

+ +

+Step 3: Removal of verb suffixes +

+ +
+ Search for the longest suffix in region RV among the following, + and perform the action indicated. +
+
are   ere   ire   âre   + ind   ând   + indu   ându   + eze   + ească   + ez   ezi   ează   esc   ești   + ește   + ăsc   ăști   + ăște   + am   ai   au   + eam   eai   ea   eați   eau   + iam   iai   ia   iați   iau   + ui   + ași   arăm   arăți   ară   + uși   urăm   urăți   ură   + iși   irăm   irăți   iră   + âi   âși   ârăm   ârăți   âră   + asem   aseși   ase   aserăm   aserăți   aseră   + isem   iseși   ise   iserăm   iserăți   iseră   + âsem   âseși   âse   âserăm   âserăți   âseră   + usem   useși   use   userăm   userăți   useră + +
delete if preceded in RV by a consonant or u +
ăm   ați   + em   eți   + im   iți   + âm   âți   + seși   serăm   serăți   seră   + sei   se   + sesem   seseși   sese   seserăm   seserăți   seseră +
delete +
+
+ +

+Step 4: Removal of final vowel +

+ +
+Search for the longest among the suffixes +
+ a   e   i   ie   ă +
+and, if it is in RV, delete it. +
+ +

+And finally: +

+ +
+ Turn I, U back into i, u. +
+ +

The same algorithm in Snowball

+ +[% highlight_file('romanian') %] + +[% footer %] diff --git a/algorithms/russian/stemmer.html b/algorithms/russian/stemmer.html new file mode 100644 index 0000000..d8388fd --- /dev/null +++ b/algorithms/russian/stemmer.html @@ -0,0 +1,885 @@ + + + + + + + + + + Russian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Russian stemming algorithm

+ + +

Links to resources

+ + + +

+The Snowball stemmer represents the Cyrillic alphabet with ASCII characters, +following the standard Library of Congress transliteration scheme. + +

+ +

+Here is a sample of Russian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem

+вавиловка
+вагнера
+вагон
+вагона
+вагоне
+вагонов
+вагоном
+вагоны
+важная
+важнее
+важнейшие
+важнейшими
+важничал
+важно
+важного
+важное
+важной
+важном
+важному
+важности
+важностию
+важность
+важностью
+важную
+важны
+важные
+важный
+важным
+важных
+вазах
+вазы
+вакса
+вакханка
+вал
+валандался
+валентина
+валериановых
+валерию
+валетами
+вали
+валил
+валился
+валится
+валов
+вальдшнепа
+вальс
+вальса
+вальсе
+вальсишку
+вальтера
+валяется
+валялась
+валялись
+валялось
+валялся
+валять
+валяются
+вам
+вами
+

+вавиловк
+вагнер
+вагон
+вагон
+вагон
+вагон
+вагон
+вагон
+важн
+важн
+важн
+важн
+важнича
+важн
+важн
+важн
+важн
+важн
+важн
+важност
+важност
+важност
+важност
+важн
+важн
+важн
+важн
+важн
+важн
+ваз
+ваз
+вакс
+вакханк
+вал
+валанда
+валентин
+валерианов
+валер
+валет
+вал
+вал
+вал
+вал
+вал
+вальдшнеп
+вальс
+вальс
+вальс
+вальсишк
+вальтер
+валя
+валя
+валя
+валя
+валя
+валя
+валя
+вам
+вам
+
+п
+па
+пава
+павел
+павильон
+павильонам
+павла
+павлиний
+павлиньи
+павлиньим
+павлович
+павловна
+павловне
+павловной
+павловну
+павловны
+павловцы
+павлыч
+павлыча
+пагубная
+падает
+падай
+падал
+падала
+падаль
+падать
+падаю
+падают
+падающего
+падающие
+падеж
+падение
+падением
+падении
+падений
+падения
+паденье
+паденья
+падет
+падут
+падучая
+падчерицей
+падчерицы
+падшая
+падшей
+падшему
+падший
+падшим
+падших
+падшую
+паек
+пазухи
+пазуху
+пай
+пакет
+пакетом
+пакеты
+пакостей
+пакостно
+пал
+
+п
+па
+пав
+павел
+павильон
+павильон
+павл
+павлин
+павлин
+павлин
+павлович
+павловн
+павловн
+павловн
+павловн
+павловн
+павловц
+павлыч
+павлыч
+пагубн
+пада
+пада
+пада
+пада
+падал
+пада
+пада
+пада
+пада
+пада
+падеж
+паден
+паден
+паден
+паден
+паден
+паден
+паден
+падет
+падут
+падуч
+падчериц
+падчериц
+падш
+падш
+падш
+падш
+падш
+падш
+падш
+паек
+пазух
+пазух
+па
+пакет
+пакет
+пакет
+пакост
+пакостн
+пал
+
+
+ +

The stemming algorithm

+ +

+i-suffixes (*) of Russian tend to be quite regular, with irregularities of +declension involving a change to the stem. Irregular forms therefore +usually just generate two or more possible stems. Stems in Russian can +be very short, and many of the suffixes are also particle words that make +‘natural stopwords’, so a tempting way of running the stemmer is to set a +minimum stem length of zero, and thereby reduce to null all words which +are made up entirely of suffix parts. We have been a little more cautious, +and have insisted that a minimum stem contains one vowel. +

+ +

+The 32 letters of the Russian alphabet are as follows, with the +transliterated forms that we will use here shown in brackets: +

+ +
+
а (a) + б (b) + в (v) + г (g) + д (d) + е (e) + ж (zh) + з (z) + +
и (i) + й (ì) + к (k) + л (l) + м (m) + н (n) + о (o) + п (p) + +
р (r) + с (s) + т (t) + у (u) + ф (f) + х (kh) + ц (ts) + ч (ch) + +
ш (sh) + щ (shch) + ъ (") + ы (y) + ь (') + э (è) + ю (iu) + я (ia) + +
+ +

+There is a 33rd letter, ё (e"), but it is rarely used and often +replaced by е in informal writing. The original algorithm here assumed it +had already been mapped to е (e); since 2018-03-16 the Snowball +implementation we provide performs this mapping for you. +

+ +

+The following are vowels: +

+ +
+ а (a)   е (e)   и (i)   о (o)   у (u)   ы (y)   + э (è)   ю (iu)   я (ia) +
+ +

+In any word, RV is the region after the first vowel, or the end of the word +if it contains no vowel. +

+ +

+R1 is the region after the first non-vowel following a vowel, or the end of +the word if there is no such non-vowel. +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or the +end of the word if there is no such non-vowel. +

+ +

+For example: +

+ +
+    p r o t i v o e s t e s t v e n n o m
+         |<------       RV        ------>|
+           |<-----       R1       ------>|
+               |<-----     R2     ------>|
+
+ +

+(See note on R1 and R2.) +

+ +

+We now define the following classes of ending: +

+ +

+PERFECTIVE GERUND: +

+ +
+

+ group 1:   в (v)   вши (vshi)   вшись (vshis') +

+ +

+ group 2:   ив (iv)   ивши (ivshi)   ившись (ivshis')   + ыв (yv)   ывши (yvshi)   ывшись (yvshis') +

+
+ +

+group 1 endings must follow а (a) or я (ia) +

+ +

+ADJECTIVE: +

+ +
+ ее (ee)   ие (ie)   ые (ye)   ое (oe)   ими (imi)   ыми + (ymi)   ей ()   ий ()   ый ()   ой ()   ем + (em)   им (im)   ым (ym)   ом (om)   его (ego)   ого (ogo) +   ему (emu)   ому (omu)   их (ikh)   ых (ykh)   ую (uiu) +   юю (iuiu)   ая (aia)   яя (iaia) +   ою (oiu) +   ею (eiu) +
+ +

+PARTICIPLE: +

+ +
+

+ group 1:   ем (em)   нн (nn)   вш (vsh)   ющ (iushch)   щ (shch) +

+ +

+ group 2:   ивш (ivsh)   ывш (yvsh)   ующ (uiushch) +

+
+ +

+group 1 endings must follow а (a) or я (ia) +

+ +

+REFLEXIVE: +

+ +
+

+ ся (sia)   сь (s') +

+
+ +

+VERB: +

+ +
+

+ group 1: ла (la)   на (na)   ете (ete)   йте (ìte)   ли (li) +   й (ì)   л (l)   ем (em)   н (n)   ло (lo)   но (no)   ет + (et)   ют (iut)   ны (ny)   ть (t')   ешь (esh')   нно (nno) +

+ +

+ group 2: ила (ila)   ыла (yla)   ена (ena)   ейте (eìte)   + уйте (uìte)   ите (ite)   или (ili)   ыли + (yli)   ей ()   уй ()   ил (il)   ыл (yl)   им (im)   + ым (ym)   ен (en)   ило (ilo)   ыло (ylo)   ено (eno)   ят + (iat)   ует (uet)   уют (uiut)   ит (it)   ыт (yt)   ены + (eny)   ить (it')   ыть (yt')   ишь (ish')   + ую (uiu)   ю (iu) +

+
+ +

+group 1 endings must follow а (a) or я (ia) +

+ +

+NOUN: +

+ +
+

+а (a)   ев (ev)   ов (ov)   ие (ie)   ье ('e)   е (e)   иями +(iiami)   ями (iami)   ами (ami)   еи (ei)   ии (ii)   и (i)   +ией (ieì)   ей ()   ой ()   ий ()   й (ì)   +иям (iiam)   ям (iam)   ием (iem)   ем (em)   ам (am)   ом +(om)   о (o)   у (u)   ах (akh)   иях (iiakh)   ях (iakh)   ы +(y)   ь (')   ию (iiu)   ью ('iu)   ю (iu)   ия (iia)   ья +('ia)   я (ia) +

+
+ +

+SUPERLATIVE: +

+ +
+

+ ейш (eìsh)   ейше (eìshe) +

+
+ +

+These are all i-suffixes. The list of d-suffixes is very short, +

+ +

+DERIVATIONAL: +

+ +
+

+ ост (ost)   ость (ost') +

+
+ +

+Define an ADJECTIVAL ending as an ADJECTIVE ending optionally preceded +by a PARTICIPLE ending. +

+ +
+ For example, in +
+
бегавшая = бега + вш + ая +
(begavshaia = bega + vsh + aia) +
+ ая (aia) is an adjective ending, and вш (vsh) a participle ending of group 1 + (preceded by the final а (a) of бега (bega)), so вшая (vshaia) is an + adjectival ending. +
+ +

+In searching for an ending in a class, always choose the longest one +from the class. +

+ +
+ So in searching for a NOUN ending for величие (velichie), choose ие (ie) rather than
+ е (e).
+ +

+Undouble н (n) means, if the word ends нн (nn), remove the last letter. +

+ +

+Here now are the stemming rules. +

+ +

+All tests take place in the RV part of the word. +

+ +
+ So in the test for perfective gerund, the а (a) or я (ia) which the group 1 + endings must follow must itself be in RV. In other words the letters + before the RV region are never examined in the stemming process. +
+ +

+Do each of steps 1, 2, 3 and 4. +

+ +

+Step 1: +Search for a PERFECTIVE GERUND ending. If one is found remove it, and that +is then the end of step 1. Otherwise try and remove a REFLEXIVE ending, +and then search in turn for (1) an ADJECTIVAL, (2) a VERB or (3) a +NOUN ending. As soon as one of the endings (1) to (3) is found remove it, +and terminate step 1. +

+ +

+Step 2: If the word ends with и (i), remove it. +

+ +

+Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending +must lie in R2), and if one is found, remove it. +

+ +

+Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE ending, +remove it and undouble н (n), or (3) if the word ends ь (') (soft sign) remove it. +

+ +

The same algorithm in Snowball

+ +
stringescapes {}
+
+/* the 33 Cyrillic letters represented in ASCII characters following the
+ * conventions of the standard Library of Congress transliteration: */
+
+stringdef a    '{U+0430}'
+stringdef b    '{U+0431}'
+stringdef v    '{U+0432}'
+stringdef g    '{U+0433}'
+stringdef d    '{U+0434}'
+stringdef e    '{U+0435}'
+stringdef e"   '{U+0451}'
+stringdef zh   '{U+0436}'
+stringdef z    '{U+0437}'
+stringdef i    '{U+0438}'
+stringdef i`   '{U+0439}'
+stringdef k    '{U+043A}'
+stringdef l    '{U+043B}'
+stringdef m    '{U+043C}'
+stringdef n    '{U+043D}'
+stringdef o    '{U+043E}'
+stringdef p    '{U+043F}'
+stringdef r    '{U+0440}'
+stringdef s    '{U+0441}'
+stringdef t    '{U+0442}'
+stringdef u    '{U+0443}'
+stringdef f    '{U+0444}'
+stringdef kh   '{U+0445}'
+stringdef ts   '{U+0446}'
+stringdef ch   '{U+0447}'
+stringdef sh   '{U+0448}'
+stringdef shch '{U+0449}'
+stringdef "    '{U+044A}'
+stringdef y    '{U+044B}'
+stringdef '    '{U+044C}'
+stringdef e`   '{U+044D}'
+stringdef iu   '{U+044E}'
+stringdef ia   '{U+044F}'
+
+routines ( mark_regions R2
+           perfective_gerund
+           adjective
+           adjectival
+           reflexive
+           verb
+           noun
+           derivational
+           tidy_up
+)
+
+externals ( stem )
+
+integers ( pV p2 )
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
+
+define mark_regions as (
+
+    $pV = limit
+    $p2 = limit
+    do (
+        gopast v  setmark pV  gopast non-v
+        gopast v  gopast non-v  setmark p2
+       )
+)
+
+backwardmode (
+
+    define R2 as $p2 <= cursor
+
+    define perfective_gerund as (
+        [substring] among (
+            '{v}'
+            '{v}{sh}{i}'
+            '{v}{sh}{i}{s}{'}'
+                ('{a}' or '{ia}' delete)
+            '{i}{v}'
+            '{i}{v}{sh}{i}'
+            '{i}{v}{sh}{i}{s}{'}'
+            '{y}{v}'
+            '{y}{v}{sh}{i}'
+            '{y}{v}{sh}{i}{s}{'}'
+                (delete)
+        )
+    )
+
+    define adjective as (
+        [substring] among (
+            '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
+            '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
+            '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
+            '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
+            '{ia}{ia}'
+                        // and -
+            '{o}{iu}'   // - which is somewhat archaic
+            '{e}{iu}'   // - soft form of {o}{iu}
+                (delete)
+        )
+    )
+
+    define adjectival as (
+        adjective
+
+        /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
+           nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
+           errors. Removing im, uem, enn creates too many errors.
+        */
+
+        try (
+            [substring] among (
+                '{e}{m}'                  // present passive participle
+                '{n}{n}'                  // adjective from past passive participle
+                '{v}{sh}'                 // past active participle
+                '{iu}{shch}' '{shch}'     // present active participle
+                    ('{a}' or '{ia}' delete)
+
+     //but not  '{i}{m}' '{u}{e}{m}'      // present passive participle
+     //or       '{e}{n}{n}'               // adjective from past passive participle
+
+                '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
+                '{u}{iu}{shch}'          // present active participle
+                    (delete)
+            )
+        )
+
+    )
+
+    define reflexive as (
+        [substring] among (
+            '{s}{ia}'
+            '{s}{'}'
+                (delete)
+        )
+    )
+
+    define verb as (
+        [substring] among (
+            '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
+            '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
+            '{n}{y}' '{t}{'}' '{e}{sh}{'}'
+
+            '{n}{n}{o}'
+                ('{a}' or '{ia}' delete)
+
+            '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
+            '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
+            '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
+            '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
+            '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
+            '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
+                (delete)
+            /* note the short passive participle tests:
+               '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
+               '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
+            */
+        )
+    )
+
+    define noun as (
+        [substring] among (
+            '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
+            '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
+            '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
+            '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
+            '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
+            '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
+                (delete)
+            /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
+               '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}'
+               omitted - they only occur on 12 words.
+            */
+        )
+    )
+
+    define derivational as (
+        [substring] R2 among (
+            '{o}{s}{t}'
+            '{o}{s}{t}{'}'
+                (delete)
+        )
+    )
+
+    define tidy_up as (
+        [substring] among (
+
+            '{e}{i`}{sh}'
+            '{e}{i`}{sh}{e}'  // superlative forms
+               (delete
+                ['{n}'] '{n}' delete
+               )
+            '{n}'
+               ('{n}' delete) // e.g. -nno endings
+            '{'}'
+               (delete)  // with some slight false conflations
+        )
+    )
+)
+
+define stem as (
+
+    // Normalise {e"} to {e}.  The documentation has long suggested the user
+    // should do this before calling the stemmer - we now do it for them.
+    do repeat ( goto (['{e"}']) <- '{e}' )
+
+    do mark_regions
+    backwards setlimit tomark pV for (
+        do (
+             perfective_gerund or
+             ( try reflexive
+               adjectival or verb or noun
+             )
+        )
+        try([ '{i}' ] delete)
+        // because noun ending -i{iu} is being treated as verb ending -{iu}
+
+        do derivational
+        do tidy_up
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/russian/stemmer.tt b/algorithms/russian/stemmer.tt new file mode 100644 index 0000000..9e7b408 --- /dev/null +++ b/algorithms/russian/stemmer.tt @@ -0,0 +1,327 @@ +[% header('Russian stemming algorithm') %] + +

Links to resources

+ + + +

+The Snowball stemmer represents the Cyrillic alphabet with ASCII characters, +following the standard Library of Congress transliteration scheme. + +

+ +[% algorithm_vocab([60, 'в', 'п']) %] + +

The stemming algorithm

+ +

+i-suffixes (*) of Russian tend to be quite regular, with irregularities of +declension involving a change to the stem. Irregular forms therefore +usually just generate two or more possible stems. Stems in Russian can +be very short, and many of the suffixes are also particle words that make +‘natural stopwords’, so a tempting way of running the stemmer is to set a +minimum stem length of zero, and thereby reduce to null all words which +are made up entirely of suffix parts. We have been a little more cautious, +and have insisted that a minimum stem contains one vowel. +

+ +

+The 32 letters of the Russian alphabet are as follows, with the +transliterated forms that we will use here shown in brackets: +

+ +
+
а (a) + б (b) + в (v) + г (g) + д (d) + е (e) + ж (zh) + з (z) + +
и (i) + й (ì) + к (k) + л (l) + м (m) + н (n) + о (o) + п (p) + +
р (r) + с (s) + т (t) + у (u) + ф (f) + х (kh) + ц (ts) + ч (ch) + +
ш (sh) + щ (shch) + ъ (") + ы (y) + ь (') + э (è) + ю (iu) + я (ia) + +
+ +

+There is a 33rd letter, ё (e"), but it is rarely used and often +replaced by е in informal writing. The original algorithm here assumed it +had already been mapped to е (e); since 2018-03-16 the Snowball +implementation we provide performs this mapping for you. +

+ +

+The following are vowels: +

+ +
+ а (a)   е (e)   и (i)   о (o)   у (u)   ы (y)   + э (è)   ю (iu)   я (ia) +
+ +

+In any word, RV is the region after the first vowel, or the end of the word +if it contains no vowel. +

+ +

+R1 is the region after the first non-vowel following a vowel, or the end of +the word if there is no such non-vowel. +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or the +end of the word if there is no such non-vowel. +

+ +

+For example: +

+ +
+    p r o t i v o e s t e s t v e n n o m
+         |<------       RV        ------>|
+           |<-----       R1       ------>|
+               |<-----     R2     ------>|
+
+ +

+(See note on R1 and R2.) +

+ +

+We now define the following classes of ending: +

+ +

+PERFECTIVE GERUND: +

+ +
+

+ group 1:   в (v)   вши (vshi)   вшись (vshis') +

+ +

+ group 2:   ив (iv)   ивши (ivshi)   ившись (ivshis')   + ыв (yv)   ывши (yvshi)   ывшись (yvshis') +

+
+ +

+group 1 endings must follow а (a) or я (ia) +

+ +

+ADJECTIVE: +

+ +
+ ее (ee)   ие (ie)   ые (ye)   ое (oe)   ими (imi)   ыми + (ymi)   ей ()   ий ()   ый ()   ой ()   ем + (em)   им (im)   ым (ym)   ом (om)   его (ego)   ого (ogo) +   ему (emu)   ому (omu)   их (ikh)   ых (ykh)   ую (uiu) +   юю (iuiu)   ая (aia)   яя (iaia) +   ою (oiu) +   ею (eiu) +
+ +

+PARTICIPLE: +

+ +
+

+ group 1:   ем (em)   нн (nn)   вш (vsh)   ющ (iushch)   щ (shch) +

+ +

+ group 2:   ивш (ivsh)   ывш (yvsh)   ующ (uiushch) +

+
+ +

+group 1 endings must follow а (a) or я (ia) +

+ +

+REFLEXIVE: +

+ +
+

+ ся (sia)   сь (s') +

+
+ +

+VERB: +

+ +
+

+ group 1: ла (la)   на (na)   ете (ete)   йте (ìte)   ли (li) +   й (ì)   л (l)   ем (em)   н (n)   ло (lo)   но (no)   ет + (et)   ют (iut)   ны (ny)   ть (t')   ешь (esh')   нно (nno) +

+ +

+ group 2: ила (ila)   ыла (yla)   ена (ena)   ейте (eìte)   + уйте (uìte)   ите (ite)   или (ili)   ыли + (yli)   ей ()   уй ()   ил (il)   ыл (yl)   им (im)   + ым (ym)   ен (en)   ило (ilo)   ыло (ylo)   ено (eno)   ят + (iat)   ует (uet)   уют (uiut)   ит (it)   ыт (yt)   ены + (eny)   ить (it')   ыть (yt')   ишь (ish')   + ую (uiu)   ю (iu) +

+
+ +

+group 1 endings must follow а (a) or я (ia) +

+ +

+NOUN: +

+ +
+

+а (a)   ев (ev)   ов (ov)   ие (ie)   ье ('e)   е (e)   иями +(iiami)   ями (iami)   ами (ami)   еи (ei)   ии (ii)   и (i)   +ией (ieì)   ей ()   ой ()   ий ()   й (ì)   +иям (iiam)   ям (iam)   ием (iem)   ем (em)   ам (am)   ом +(om)   о (o)   у (u)   ах (akh)   иях (iiakh)   ях (iakh)   ы +(y)   ь (')   ию (iiu)   ью ('iu)   ю (iu)   ия (iia)   ья +('ia)   я (ia) +

+
+ +

+SUPERLATIVE: +

+ +
+

+ ейш (eìsh)   ейше (eìshe) +

+
+ +

+These are all i-suffixes. The list of d-suffixes is very short, +

+ +

+DERIVATIONAL: +

+ +
+

+ ост (ost)   ость (ost') +

+
+ +

+Define an ADJECTIVAL ending as an ADJECTIVE ending optionally preceded +by a PARTICIPLE ending. +

+ +
+ For example, in +
+
бегавшая = бега + вш + ая +
(begavshaia = bega + vsh + aia) +
+ ая (aia) is an adjective ending, and вш (vsh) a participle ending of group 1 + (preceded by the final а (a) of бега (bega)), so вшая (vshaia) is an + adjectival ending. +
+ +

+In searching for an ending in a class, always choose the longest one +from the class. +

+ +
+ So in searching for a NOUN ending for величие (velichie), choose ие (ie) rather than
+ е (e).
+ +

+Undouble н (n) means, if the word ends нн (nn), remove the last letter. +

+ +

+Here now are the stemming rules. +

+ +

+All tests take place in the RV part of the word. +

+ +
+ So in the test for perfective gerund, the а (a) or я (ia) which the group 1 + endings must follow must itself be in RV. In other words the letters + before the RV region are never examined in the stemming process. +
+ +

+Do each of steps 1, 2, 3 and 4. +

+ +

+Step 1: +Search for a PERFECTIVE GERUND ending. If one is found remove it, and that +is then the end of step 1. Otherwise try and remove a REFLEXIVE ending, +and then search in turn for (1) an ADJECTIVAL, (2) a VERB or (3) a +NOUN ending. As soon as one of the endings (1) to (3) is found remove it, +and terminate step 1. +

+ +

+Step 2: If the word ends with и (i), remove it. +

+ +

+Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending +must lie in R2), and if one is found, remove it. +

+ +

+Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE ending, +remove it and undouble н (n), or (3) if the word ends ь (') (soft sign) remove it. +

+ +

The same algorithm in Snowball

+ +[% highlight_file('russian') %] + +[% footer %] diff --git a/algorithms/russian/stop.txt b/algorithms/russian/stop.txt new file mode 100644 index 0000000..54fcc3d --- /dev/null +++ b/algorithms/russian/stop.txt @@ -0,0 +1,236 @@ + + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/algorithms/scandinavian.html b/algorithms/scandinavian.html new file mode 100644 index 0000000..2ec48b9 --- /dev/null +++ b/algorithms/scandinavian.html @@ -0,0 +1,98 @@ + + + + + + + + + + Scandinavian language stemmers - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Scandinavian language stemmers

+ + +

Links to resources

+ + + +

+The stemmers for these three Scandinavian languages are all very simple, +and quite similar to each other. But between the languages there is a difference +in which endings can be removed without difficulty, even though the endings +are very similar. For example, in Norwegian +the ending ede can be removed safely, but not in Danish. +

+ +

+To the definite article (the in English, der etc in German) there +corresponds +a noun ending in the Scandinavian languages. This ending cannot always be removed +with certainty. In Swedish, for example, the en form is removed, but not the +t or n form, +

+ +
+
husen hus +
flickan   →   flickan +
äpplet äpplet + +
+ +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/scandinavian.tt b/algorithms/scandinavian.tt new file mode 100644 index 0000000..94f6d13 --- /dev/null +++ b/algorithms/scandinavian.tt @@ -0,0 +1,34 @@ +[% header('Scandinavian language stemmers') %] + +

Links to resources

+ + + +

+The stemmers for these three Scandinavian languages are all very simple, +and quite similar to each other. But between the languages there is a difference +in which endings can be removed without difficulty, even though the endings +are very similar. For example, in Norwegian +the ending ede can be removed safely, but not in Danish. +

+ +

+To the definite article (the in English, der etc in German) there +corresponds +a noun ending in the Scandinavian languages. This ending cannot always be removed +with certainty. In Swedish, for example, the en form is removed, but not the +t or n form, +

+ +
+
husen hus +
flickan   →   flickan +
äpplet äpplet + +
+ +[% footer %] diff --git a/algorithms/serbian/stemmer.html b/algorithms/serbian/stemmer.html new file mode 100644 index 0000000..a64acf2 --- /dev/null +++ b/algorithms/serbian/stemmer.html @@ -0,0 +1,2828 @@ + + + + + + + + + + Serbian stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Serbian stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Serbian vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+ambasade
+ambasadi
+abdominalna
+abdominalno
+abdominalnih
+abdominalnim
+abdominalnog
+abdominalnoj
+abdominalnom
+abdominalnu
+abeceda
+abecede
+abecedi
+abecedni
+abecednih
+abecednim
+abecedno
+abecednog
+abecednom
+abecedom
+abecedu
+abecendom
+ablendovanje
+ablenduje
+ablenduju
+ablendujući
+abnormalan
+abnormalije
+abnormalijom
+abnormalna
+abnormalne
+abnormalni
+abnormalnih
+abnormalnim
+abnormalno
+abnormalnog
+abnormalnom
+abnormalnost
+abnormalnosti
+abnormalnostima
+abnormalnu
+
+ambasad
+ambasad
+abdominaln
+abdominaln
+abdominaln
+abdominaln
+abdominaln
+abdominaln
+abdominaln
+abdominaln
+abeced
+abeced
+abeced
+abecedn
+abecedn
+abecedn
+abecedn
+abecedn
+abecedn
+abeced
+abeced
+abecend
+ablendovanj
+ablenduj
+ablenduj
+ablenduj
+abnormaln
+abnormalij
+abnormalij
+abnormaln
+abnormaln
+abnormaln
+abnormaln
+abnormaln
+abnormaln
+abnormaln
+abnormaln
+abnormaln
+abnormalnost
+abnormalnost
+abnormaln
+
+obustavila
+obustavile
+obustavili
+obustavilo
+obustavio
+ocenjivala
+ocenjivali
+ocenjivan
+ocenjivana
+ocenjivane
+ocenjivani
+ocenjivano
+ocenjivanja
+ocenjivanje
+ocenjivanju
+ocenjivao
+ocenjivati
+ocenjuje
+ocenjujem
+ocenjujemo
+ocenjuješ
+ocenjujete
+ocenjuju
+ocenjujući
+očajan
+očajna
+očajne
+očajni
+očajno
+padobrana
+padobranaca
+padobranci
+padobrancima
+padobranom
+padobranskim
+padobransku
+padobranu
+paganska
+paganske
+paganski
+paganskih
+
+obustavi
+obustavi
+obustavi
+obustavi
+obustavi
+ocenjiva
+ocenjiva
+ocenjiva
+ocenjiva
+ocenjiva
+ocenjiva
+ocenjiva
+ocenjivanj
+ocenjivanj
+ocenjivanj
+ocenjiva
+ocenjiva
+ocenjuj
+ocenjuj
+ocenjuj
+ocenjuj
+ocenjuj
+ocenjuj
+ocenjuj
+očajn
+očajn
+očajn
+očajn
+očajn
+padobran
+padobranc
+padobranc
+padobranc
+padobran
+padobransk
+padobransk
+padobran
+pagansk
+pagansk
+pagansk
+pagansk
+
+
+ +

The stemming algorithm

+ +

+The Serbian language is a Slavic language (Indo-European) of the South Slavic
+subgroup. It is highly inflected and uses similar rules for morphological
+derivation and flexion to other Slavic languages, especially ones derived from
+the Serbo-Croatian language used in the former Yugoslavia. Because of this
+highly inflected characteristic a stemmer for the Serbian language will have
+many more rules than stemmers for less inflected languages.

+ +

+Serbian Stemmer described in this document is based on the Croatian +Stemmer which is published under the GNU Lesser General Public License. +Mark Regions, Morphological Changes (Step_1) and Stemming +(Step_2) routines are based on the Croatian Stemming Algorithm. In +addition, some of the existing rules for Morphological Changes and Stemming +(Step_1 and Step_2 among lists) have been modified and new rules have +been added for the needs of the Serbian Stemmer. +

+ +

+Latin alphabet in Serbian includes the following letters with diacritics: +

+ +
+ č   ć   đ   š   ž +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u +
+ +

+There is also the letter - r - which isn't a vowel but is sometimes used for syllabification.

+ +

Main Routines of Serbian Stemming Algorithm are:

+ +
    +
  1. Conversion of Cyrillic alphabet to Latin alphabet +

    +The Serbian language uses both Cyrillic and Latin alphabets, but
+these days most people use the Latin alphabet on their PCs, Phones, etc. This
+algorithm is developed mostly for the purposes of Information Retrieval,
+therefore the first thing it does is to convert Cyrillic letters to
+Latin.

    +
  2. + +
  3. Prelude +

    +In Serbian language there are two dialects: Ekavian and +Ijekavian. For example words: +

      +
    • senka (Ekavian) +
    • sjenka (Ijekavian) +
    +have the same meaning (Shadow), also words: +
      +
    • mleko (Ekavian) +
    • mlijeko (Ijekavian) +
    +have the same meaning (Milk) but are spelled differently and because +mostly used dialect in Serbia is Ekavian the next thing to do is to +replace Ijekavian dialect with it. +

    + +

    +These days it is also common, although not valid, to use the combination of
+letters "d" and "j" instead of the single letter "đ". For example
+people will more often write "Novak Djoković" instead of "Novak
+Đoković" and because this algorithm is developed with Information Retrieval
+in mind they should be treated as the same terms.

    +
  4. +
  5. Mark Regions +

    +R1 is either: +

      +
    1. a region after the first vowel if there are at least two letters outside +of it, otherwise it is a region after the first non-vowel following a vowel, +
    2. a region after the first "r" if there are at least two letters +outside of it, otherwise it is a region after the first non-"r" +following an "r". +
    +

    +Note that every suffix which the stemmer can remove contains at least one +vowel, so in the degenerate case of an input which contains no vowels there +is nothing to be done. The Snowball implementation of this stemmer sets +R1 to be a zero length region at the end of the word if the input +contains no vowels and no "r". +

    +

    +In the Serbian language there are some words in which the letter "r" is used
+for syllabification, and in such words vowels can appear at the very end - for
+example the word "grmlje".

    + +

    +So before the algorithm decides what R1 will be, it needs to check if
+and where the letter "r" occurs and where the first vowel is. If it finds an
+"r" that occurs before the first vowel and there is at least one letter between
+them, this means that "r" is used for syllabification and R1 is
+2), otherwise R1 is 1).

    + +

    +For example: +

      +
    • "tr|go|va|čki" - in this word "tr" is the first syllable + which means that "r" is used for syllabification and R1 = + "govački" + +
    • "tre|ne|rka" - in this word there is a letter "r" before the + first vowel but there aren't any letters between them which means that + "r" isn't used for syllabification and R1 = "nerka". + +
    • "r|ta|njski" - in this word "r" is the first syllable but if
+ we use "tanjski" as R1 it won't leave enough letters outside
+ of it, so we need to shrink it down to a region after the first
+ non-"r" following an "r" which is in this case =
+ "anjski".
    • "a|vi|on" - similar to the previous case but with a vowel instead + of an "r". +
    + +Inside Mark Regions routine there is a test routine that is used to +check for letters with diacritics and is used later to apply certain rules in +stemming. Result of this test routine is stored inside no_diacritics flag. +This test routine is used because people these days tend to use letters without +diacritics (instead of the proper ones with diacritics) and we need to take +this into account also. +

    +
  6. + +
  7. Morphological Changes +

    +The very last thing to do, before any stemming is done, is to apply
+morphological changes. These changes are applied so that we get the same stems
+for different forms of a word.

    + +

    +For example words: +

      +
    • "pravilan" (Masculine, Singular) +
    • "pravilna" (Feminine, Singular) +
    • "pravilno" (Neuter, Singular) +
    +should have the same stem. To get that result the algorithm will first change +word "pravilan" (Masculine, Singular) to "pravilni" (Masculine, +Plural) and after that the word will be stemmed. +

    +
  8. + +
  9. Stemming +

    +There are two steps for stemming. The first contains most of the rules and is +the primary stemming routine and the second one will try to stem the word only +if the first one failed to do so - whether it was because there were no rules +that could be applied or the rule overlapped the R1 region. The second +step contains a few rules that will do proper stemming for most words that +couldn't be stemmed using the rules from the first step. +

    +
  10. +
+ +

The full algorithm in Snowball

+ +
/* Stemmer for Serbian language, based on:
+ *
+ * Ljubesic, Nikola. Pandzic, Ivan. Stemmer for Croatian
+ * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/
+ *
+ * authors: Stefan Petkovic and Dragan Ivanovic
+ * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs
+ * version: 1.0 (20.04.2019)
+*/
+
+routines (
+    cyr_to_lat
+    prelude
+    mark_regions
+    R1
+    Step_1
+    Step_2
+    Step_3
+)
+
+externals ( stem )
+
+booleans ( no_diacritics )
+
+integers ( p1 )
+
+groupings ( v ca sa rg )
+
+stringescapes {}
+
+/* special characters - Unicode codepoints */
+
+/* serbian cyrillic */
+
+stringdef cyrA    '{U+0430}'
+stringdef cyrB    '{U+0431}'
+stringdef cyrV    '{U+0432}'
+stringdef cyrG    '{U+0433}'
+stringdef cyrD    '{U+0434}'
+stringdef cyrDx   '{U+0452}'
+stringdef cyrE    '{U+0435}'
+stringdef cyrZh   '{U+0436}'
+stringdef cyrZ    '{U+0437}'
+stringdef cyrI    '{U+0438}'
+stringdef cyrJ    '{U+0458}'
+stringdef cyrK    '{U+043A}'
+stringdef cyrL    '{U+043B}'
+stringdef cyrLJ   '{U+0459}'
+stringdef cyrM    '{U+043C}'
+stringdef cyrN    '{U+043D}'
+stringdef cyrNJ   '{U+045A}'
+stringdef cyrO    '{U+043E}'
+stringdef cyrP    '{U+043F}'
+stringdef cyrR    '{U+0440}'
+stringdef cyrS    '{U+0441}'
+stringdef cyrT    '{U+0442}'
+stringdef cyrCy   '{U+045B}'
+stringdef cyrU    '{U+0443}'
+stringdef cyrF    '{U+0444}'
+stringdef cyrH    '{U+0445}'
+stringdef cyrC    '{U+0446}'
+stringdef cyrCx   '{U+0447}'
+stringdef cyrDzx  '{U+045F}'
+stringdef cyrSx   '{U+0448}'
+
+/* serbian latin with diacritics */
+
+stringdef cx    '{U+010D}' // small c with caron
+stringdef cy    '{U+0107}' // small c with acute
+stringdef zx    '{U+017E}' // small z with caron
+stringdef sx    '{U+0161}' // small s with caron
+stringdef dx    '{U+0111}' // small d with stroke
+
+define v 'aeiou'
+define sa '{cx}{cy}{zx}{sx}{dx}'
+define ca 'bvgdzjklmnprstfhc' + sa
+define rg 'r'
+
+
+define cyr_to_lat as (
+
+    do repeat goto (
+        [substring] among (
+            '{cyrA}'    (<- 'a')
+            '{cyrB}'    (<- 'b')
+            '{cyrV}'    (<- 'v')
+            '{cyrG}'    (<- 'g')
+            '{cyrD}'    (<- 'd')
+            '{cyrDx}'   (<- '{dx}')
+            '{cyrE}'    (<- 'e')
+            '{cyrZh}'   (<- '{zx}')
+            '{cyrZ}'    (<- 'z')
+            '{cyrI}'    (<- 'i')
+            '{cyrJ}'    (<- 'j')
+            '{cyrK}'    (<- 'k')
+            '{cyrL}'    (<- 'l')
+            '{cyrLJ}'   (<- 'lj')
+            '{cyrM}'    (<- 'm')
+            '{cyrN}'    (<- 'n')
+            '{cyrNJ}'   (<- 'nj')
+            '{cyrO}'    (<- 'o')
+            '{cyrP}'    (<- 'p')
+            '{cyrR}'    (<- 'r')
+            '{cyrS}'    (<- 's')
+            '{cyrT}'    (<- 't')
+            '{cyrCy}'   (<- '{cy}')
+            '{cyrU}'    (<- 'u')
+            '{cyrF}'    (<- 'f')
+            '{cyrH}'    (<- 'h')
+            '{cyrC}'    (<- 'c')
+            '{cyrCx}'   (<- '{cx}')
+            '{cyrDzx}'  (<- 'd{zx}')
+            '{cyrSx}'   (<- '{sx}')
+        )
+    )
+
+)
+
+define prelude as (
+
+    do repeat goto (
+        ca ['ije'] ca <- 'e'
+    )
+
+    do repeat goto (
+        ca ['je'] ca <- 'e'
+    )
+
+    do repeat goto (
+        ['dj'] <- '{dx}'
+    )
+
+)
+
+define mark_regions as (
+
+    set no_diacritics
+
+    do (
+        gopast sa unset no_diacritics
+    )
+
+    $p1 = limit
+
+    do (
+        gopast v setmark p1
+        ($p1 < 2) (
+            gopast non-v
+            setmark p1
+        )
+    )
+    do (
+        gopast 'r'
+        $(cursor >= 2) or (gopast non-rg)
+        $(p1 - cursor > 1) setmark p1
+    )
+
+)
+
+backwardmode (
+
+    define R1 as $p1 <= cursor
+
+    define Step_1 as (
+        [substring] among (
+            'lozi'
+            'lozima'        (<-'loga')
+            'pesi'
+            'pesima'        (<-'peh')
+            'vojci'         (<-'vojka')
+            'bojci'         (<-'bojka')
+            'jaci'
+            'jacima'        (<-'jak')
+            '{cx}ajan'      (<-'{cx}ajni')
+            'cajan'         (no_diacritics <-'cajni')
+            'eran'          (<-'erni')
+            'laran'         (<-'larni')
+            'esan'          (<-'esni')
+            'anjac'         (<-'anjca')
+            'ajac'
+            'ajaca'         (<-'ajca')
+            'ljaca'
+            'ljac'          (<-'ljca')
+            'ejac'
+            'ejaca'         (<-'ejca')
+            'ojac'
+            'ojaca'         (<-'ojca')
+            'ajaka'         (<-'ajka')
+            'ojaka'         (<-'ojka')
+            '{sx}aca'
+            '{sx}ac'        (<-'{sx}ca')
+            'inzima'
+            'inzi'          (<-'ing')
+            'tvenici'       (<-'tvenik')
+            'tetici'
+            'teticima'      (<-'tetika')
+            'nstava'        (<-'nstva')
+            'nicima'        (<-'nik')
+            'ticima'        (<-'tik')
+            'zicima'        (<-'zik')
+            'snici'         (<-'snik')
+            'kuse'          (<-'kusi')
+            'kusan'         (<-'kusni')
+            'kustava'       (<-'kustva')
+            'du{sx}an'      (<-'du{sx}ni')
+            'dusan'         (no_diacritics <-'dusni')
+            'antan'         (<-'antni')
+            'bilan'         (<-'bilni')
+            'tilan'         (<-'tilni')
+            'avilan'        (<-'avilni')
+            'silan'         (<-'silni')
+            'gilan'         (<-'gilni')
+            'rilan'         (<-'rilni')
+            'nilan'         (<-'nilni')
+            'alan'          (<-'alni')
+            'ozan'          (<-'ozni')
+            'rave'          (<-'ravi')
+            'stavan'        (<-'stavni')
+            'pravan'        (<-'pravni')
+            'tivan'         (<-'tivni')
+            'sivan'         (<-'sivni')
+            'atan'          (<-'atni')
+            'enat'          (<-'enta')
+            'tetan'         (<-'tetni')
+            'pletan'        (<-'pletni')
+            '{sx}ave'       (<-'{sx}avi')
+            'save'          (no_diacritics <-'savi')
+            'anata'         (<-'anta')
+            'a{cx}ak'
+            'a{cx}aka'      (<-'a{cx}ka')
+            'acak'
+            'acaka'         (no_diacritics <-'acka')
+            'u{sx}ak'       (<-'u{sx}ka')
+            'usak'          (no_diacritics <-'uska')
+            'atak'
+            'ataka'
+            'atci'
+            'atcima'        (<-'atka')
+            'etak'
+            'etaka'         (<-'etka')
+            'itak'
+            'itaka'
+            'itci'          (<-'itka')
+            'otak'
+            'otaka'         (<-'otka')
+            'utak'
+            'utaka'
+            'utci'
+            'utcima'        (<-'utka')
+            'eskan'         (<-'eskna')
+            'ti{cx}an'      (<-'ti{cx}ni')
+            'tican'         (no_diacritics <-'ticni')
+            'ojsci'         (<-'ojska')
+            'esama'         (<-'esma')
+            'metar'
+            'metara'        (<-'metra')
+            'centar'
+            'centara'       (<-'centra')
+            'istar'
+            'istara'        (<-'istra')
+            'o{sx}{cy}u'    (<-'osti')
+            'oscu'          (no_diacritics <-'osti')
+            'daba'          (<-'dba')
+            '{cx}cima'
+            '{cx}ci'        (<-'{cx}ka')
+            'mac'
+            'maca'          (<-'mca')
+            'naca'
+            'nac'           (<-'nca')
+            'voljan'        (<-'voljni')
+            'anaka'         (<-'anki')
+            'vac'
+            'vaca'          (<-'vca')
+            'saca'
+            'sac'           (<-'sca')
+            'raca'
+            'rac'           (<-'rca')
+            'aoca'
+            'alaca'
+            'alac'          (<-'alca')
+            'elaca'
+            'elac'          (<-'elca')
+            'olaca'
+            'olac'
+            'olce'          (<-'olca')
+            'njac'
+            'njaca'         (<-'njca')
+            'ekata'
+            'ekat'          (<-'ekta')
+            'izam'
+            'izama'         (<-'izma')
+            'jebe'          (<-'jebi')
+            'baci'          (<-'baci')
+            'a{sx}an'       (<-'a{sx}ni')
+            'asan'          (no_diacritics <-'asni')
+        )
+    )
+
+    define Step_2 as (
+        [substring] R1 among (
+            'skijima'
+            'skijega'
+            'skijemu'
+            'skijem'
+            'skega'
+            'skemu'
+            'skem'
+            'skijim'
+            'skijih'
+            'skijoj'
+            'skijeg'
+            'skiji'
+            'skije'
+            'skija'
+            'skoga'
+            'skome'
+            'skomu'
+            'skima'
+            'skog'
+            'skom'
+            'skim'
+            'skih'
+            'skoj'
+            'ski'
+            'ske'
+            'sko'
+            'ska'
+            'sku'           (<-'sk')
+            '{sx}kijima'
+            '{sx}kijega'
+            '{sx}kijemu'
+            '{sx}kijem'
+            '{sx}kega'
+            '{sx}kemu'
+            '{sx}kem'
+            '{sx}kijim'
+            '{sx}kijih'
+            '{sx}kijoj'
+            '{sx}kijeg'
+            '{sx}kiji'
+            '{sx}kije'
+            '{sx}kija'
+            '{sx}koga'
+            '{sx}kome'
+            '{sx}komu'
+            '{sx}kima'
+            '{sx}kog'
+            '{sx}kom'
+            '{sx}kim'
+            '{sx}kih'
+            '{sx}koj'
+            '{sx}ki'
+            '{sx}ke'
+            '{sx}ko'
+            '{sx}ka'
+            '{sx}ku'        (<-'{sx}k')
+            'stvima'
+            'stvom'
+            'stvo'
+            'stva'
+            'stvu'          (<-'stv')
+            '{sx}tvima'
+            '{sx}tvom'
+            '{sx}tvo'
+            '{sx}tva'
+            '{sx}tvu'       (<-'{sx}tv')
+            'tanijama'
+            'tanijima'
+            'tanijom'
+            'tanija'
+            'taniju'
+            'tanije'
+            'taniji'        (<-'tanij')
+            'manijama'
+            'manijima'
+            'manijom'
+            'manija'
+            'maniju'
+            'manije'
+            'maniji'        (<-'manij')
+            'panijama'
+            'panijima'
+            'panijom'
+            'panija'
+            'paniju'
+            'panije'
+            'paniji'        (<-'panij')
+            'ranijama'
+            'ranijima'
+            'ranijom'
+            'ranija'
+            'raniju'
+            'ranije'
+            'raniji'        (<-'ranij')
+            'ganijama'
+            'ganijima'
+            'ganijom'
+            'ganija'
+            'ganiju'
+            'ganije'
+            'ganiji'        (<-'ganij')
+            'aninom'
+            'anina'
+            'aninu'
+            'anine'
+            'anima'
+            'anin'
+            'anom'
+            'anu'
+            'ani'
+            'ana'
+            'ane'           (<-'an')
+            'inima'
+            'inama'
+            'inom'
+            'ina'
+            'ine'
+            'ini'
+            'inu'
+            'ino'           (<-'in')
+            'onovima'
+            'onova'
+            'onove'
+            'onovi'
+            'onima'
+            'onom'
+            'ona'
+            'one'
+            'oni'
+            'onu'           (<-'on')
+            'nijima'
+            'nijega'
+            'nijemu'
+            'nijeg'
+            'nijem'
+            'nega'
+            'nemu'
+            'neg'
+            'nem'
+            'nijim'
+            'nijih'
+            'nijoj'
+            'niji'
+            'nije'
+            'nija'
+            'niju'
+            'nima'
+            'nome'
+            'nomu'
+            'noga'
+            'noj'
+            'nom'
+            'nih'
+            'nim'
+            'nog'
+            'no'
+            'ne'
+            'na'
+            'nu'
+            'ni'            (<-'n')
+            'a{cy}oga'
+            'a{cy}ome'
+            'a{cy}omu'
+            'a{cy}ega'
+            'a{cy}emu'
+            'a{cy}ima'
+            'a{cy}oj'
+            'a{cy}ih'
+            'a{cy}om'
+            'a{cy}eg'
+            'a{cy}em'
+            'a{cy}og'
+            'a{cy}uh'
+            'a{cy}im'
+            'a{cy}e'
+            'a{cy}a'        (<-'a{cy}')
+            'e{cy}oga'
+            'e{cy}ome'
+            'e{cy}omu'
+            'e{cy}ega'
+            'e{cy}emu'
+            'e{cy}ima'
+            'e{cy}oj'
+            'e{cy}ih'
+            'e{cy}om'
+            'e{cy}eg'
+            'e{cy}em'
+            'e{cy}og'
+            'e{cy}uh'
+            'e{cy}im'
+            'e{cy}e'
+            'e{cy}a'        (<-'e{cy}')
+            'u{cy}oga'
+            'u{cy}ome'
+            'u{cy}omu'
+            'u{cy}ega'
+            'u{cy}emu'
+            'u{cy}ima'
+            'u{cy}oj'
+            'u{cy}ih'
+            'u{cy}om'
+            'u{cy}eg'
+            'u{cy}em'
+            'u{cy}og'
+            'u{cy}uh'
+            'u{cy}im'
+            'u{cy}e'
+            'u{cy}a'        (<-'u{cy}')
+            'ugovima'
+            'ugovi'
+            'ugove'
+            'ugova'         (<-'ugov')
+            'ugama'
+            'ugom'
+            'uga'
+            'uge'
+            'ugi'
+            'ugu'
+            'ugo'           (<-'ug')
+            'logama'
+            'logom'
+            'loga'
+            'logu'
+            'loge'          (<-'log')
+            'govima'
+            'gama'
+            'govi'
+            'gove'
+            'gova'
+            'gom'
+            'ga'
+            'ge'
+            'gi'
+            'gu'
+            'go'            (<-'g')
+            'rarijem'
+            'rarija'
+            'rariju'
+            'rario'         (<-'rari')
+            'otijem'
+            'otija'
+            'otiju'
+            'otio'          (<-'oti')
+            'sijem'
+            'sija'
+            'siju'
+            'sio'           (<-'si')
+            'lijem'
+            'lija'
+            'liju'
+            'lio'           (<-'li')
+            'uju{cy}i'
+            'ujemo'
+            'ujete'
+            'ujmo'
+            'ujem'
+            'uje{sx}'
+            'uje'
+            'uju'           (<-'uj')
+            'cajevima'
+            'cajevi'
+            'cajeva'
+            'cajeve'
+            'cajama'
+            'cajima'
+            'cajem'
+            'caja'
+            'caje'
+            'caji'
+            'caju'          (<-'caj')
+            '{cx}ajevima'
+            '{cx}ajevi'
+            '{cx}ajeva'
+            '{cx}ajeve'
+            '{cx}ajama'
+            '{cx}ajima'
+            '{cx}ajem'
+            '{cx}aja'
+            '{cx}aje'
+            '{cx}aji'
+            '{cx}aju'       (<-'{cx}aj')
+            '{cy}ajevima'
+            '{cy}ajevi'
+            '{cy}ajeva'
+            '{cy}ajeve'
+            '{cy}ajama'
+            '{cy}ajima'
+            '{cy}ajem'
+            '{cy}aja'
+            '{cy}aje'
+            '{cy}aji'
+            '{cy}aju'       (<-'{cy}aj')
+            '{dx}ajevima'
+            '{dx}ajevi'
+            '{dx}ajeva'
+            '{dx}ajeve'
+            '{dx}ajama'
+            '{dx}ajima'
+            '{dx}ajem'
+            '{dx}aja'
+            '{dx}aje'
+            '{dx}aji'
+            '{dx}aju'       (<-'{dx}aj')
+            'lajevima'
+            'lajevi'
+            'lajeva'
+            'lajeve'
+            'lajama'
+            'lajima'
+            'lajem'
+            'laja'
+            'laje'
+            'laji'
+            'laju'          (<-'laj')
+            'rajevima'
+            'rajevi'
+            'rajeva'
+            'rajeve'
+            'rajama'
+            'rajima'
+            'rajem'
+            'raja'
+            'raje'
+            'raji'
+            'raju'          (<-'raj')
+            'bijima'
+            'bijama'
+            'bijom'
+            'bija'
+            'bije'
+            'biji'
+            'biju'
+            'bijo'          (<-'bij')
+            'cijima'
+            'cijama'
+            'cijom'
+            'cija'
+            'cije'
+            'ciji'
+            'ciju'
+            'cijo'          (<-'cij')
+            'dijima'
+            'dijama'
+            'dijom'
+            'dija'
+            'dije'
+            'diji'
+            'diju'
+            'dijo'          (<-'dij')
+            'lijima'
+            'lijama'
+            'lijom'
+            'lije'
+            'liji'
+            'lijo'          (<-'lij')
+            'nijama'
+            'nijom'
+            'nijo'          (<-'nij')
+            'mijima'
+            'mijama'
+            'mijom'
+            'mija'
+            'mije'
+            'miji'
+            'miju'
+            'mijo'          (<-'mij')
+            '{zx}ijima'
+            '{zx}ijama'
+            '{zx}ijom'
+            '{zx}ija'
+            '{zx}ije'
+            '{zx}iji'
+            '{zx}iju'
+            '{zx}ijo'       (<-'{zx}ij')
+            'gijima'
+            'gijama'
+            'gijom'
+            'gija'
+            'gije'
+            'giji'
+            'giju'
+            'gijo'          (<-'gij')
+            'fijima'
+            'fijama'
+            'fijom'
+            'fija'
+            'fije'
+            'fiji'
+            'fiju'
+            'fijo'          (<-'fij')
+            'pijima'
+            'pijama'
+            'pijom'
+            'pija'
+            'pije'
+            'piji'
+            'piju'
+            'pijo'          (<-'pij')
+            'rijima'
+            'rijama'
+            'rijom'
+            'rija'
+            'rije'
+            'riji'
+            'riju'
+            'rijo'          (<-'rij')
+            'sijima'
+            'sijama'
+            'sijom'
+            'sije'
+            'siji'
+            'sijo'          (<-'sij')
+            'tijima'
+            'tijama'
+            'tijom'
+            'tija'
+            'tije'
+            'tiji'
+            'tiju'
+            'tijo'          (<-'tij')
+            'zijima'
+            'zijama'
+            'zijom'
+            'zija'
+            'zije'
+            'ziji'
+            'ziju'
+            'zijo'          (<-'zij')
+            'nalima'
+            'nalama'
+            'nalom'
+            'nala'
+            'nale'
+            'nali'
+            'nalu'
+            'nalo'          (<-'nal')
+            'ijalima'
+            'ijalama'
+            'ijalom'
+            'ijala'
+            'ijale'
+            'ijali'
+            'ijalu'
+            'ijalo'         (<-'ijal')
+            'ozilima'
+            'ozilom'
+            'ozila'
+            'ozile'
+            'ozilu'
+            'ozili'         (<-'ozil')
+            'olovima'
+            'olovi'
+            'olova'
+            'olove'         (<-'olov')
+            'olima'
+            'olom'
+            'ola'
+            'olu'
+            'ole'
+            'oli'           (<-'ol')
+            'lemama'
+            'lemima'
+            'lemom'
+            'lema'
+            'leme'
+            'lemi'
+            'lemu'
+            'lemo'          (<-'lem')
+            'ramama'
+            'ramom'
+            'rama'
+            'rame'
+            'rami'
+            'ramu'
+            'ramo'          (<-'ram')
+            'arama'
+            'arima'
+            'arom'
+            'aru'
+            'ara'
+            'are'
+            'ari'           (<-'ar')
+            'drama'
+            'drima'
+            'drom'
+            'dru'
+            'dra'
+            'dre'
+            'dri'           (<-'dr')
+            'erama'
+            'erima'
+            'erom'
+            'eru'
+            'era'
+            'ere'
+            'eri'           (<-'er')
+            'orama'
+            'orima'
+            'orom'
+            'oru'
+            'ora'
+            'ore'
+            'ori'           (<-'or')
+            'esima'
+            'esom'
+            'ese'
+            'esa'
+            'esu'           (<-'es')
+            'isima'
+            'isom'
+            'ise'
+            'isa'
+            'isu'           (<-'is')
+            'ta{sx}ama'
+            'ta{sx}ima'
+            'ta{sx}om'
+            'ta{sx}em'
+            'ta{sx}a'
+            'ta{sx}u'
+            'ta{sx}i'
+            'ta{sx}e'       (<-'ta{sx}')
+            'na{sx}ama'
+            'na{sx}ima'
+            'na{sx}om'
+            'na{sx}em'
+            'na{sx}a'
+            'na{sx}u'
+            'na{sx}i'
+            'na{sx}e'       (<-'na{sx}')
+            'ja{sx}ama'
+            'ja{sx}ima'
+            'ja{sx}om'
+            'ja{sx}em'
+            'ja{sx}a'
+            'ja{sx}u'
+            'ja{sx}i'
+            'ja{sx}e'       (<-'ja{sx}')
+            'ka{sx}ama'
+            'ka{sx}ima'
+            'ka{sx}om'
+            'ka{sx}em'
+            'ka{sx}a'
+            'ka{sx}u'
+            'ka{sx}i'
+            'ka{sx}e'       (<-'ka{sx}')
+            'ba{sx}ama'
+            'ba{sx}ima'
+            'ba{sx}om'
+            'ba{sx}em'
+            'ba{sx}a'
+            'ba{sx}u'
+            'ba{sx}i'
+            'ba{sx}e'       (<-'ba{sx}')
+            'ga{sx}ama'
+            'ga{sx}ima'
+            'ga{sx}om'
+            'ga{sx}em'
+            'ga{sx}a'
+            'ga{sx}u'
+            'ga{sx}i'
+            'ga{sx}e'       (<-'ga{sx}')
+            'va{sx}ama'
+            'va{sx}ima'
+            'va{sx}om'
+            'va{sx}em'
+            'va{sx}a'
+            'va{sx}u'
+            'va{sx}i'
+            'va{sx}e'       (<-'va{sx}')
+            'e{sx}ima'
+            'e{sx}ama'
+            'e{sx}om'
+            'e{sx}em'
+            'e{sx}i'
+            'e{sx}e'
+            'e{sx}a'
+            'e{sx}u'        (<-'e{sx}')
+            'i{sx}ima'
+            'i{sx}ama'
+            'i{sx}om'
+            'i{sx}em'
+            'i{sx}i'
+            'i{sx}e'
+            'i{sx}a'
+            'i{sx}u'        (<-'i{sx}')
+            'ikatima'
+            'ikatom'
+            'ikata'
+            'ikate'
+            'ikati'
+            'ikatu'
+            'ikato'         (<-'ikat')
+            'latima'
+            'latom'
+            'lata'
+            'late'
+            'lati'
+            'latu'
+            'lato'          (<-'lat')
+            'etama'
+            'etima'
+            'etom'
+            'eta'
+            'ete'
+            'eti'
+            'etu'
+            'eto'           (<-'et')
+            'estima'
+            'estama'
+            'estom'
+            'esta'
+            'este'
+            'esti'
+            'estu'
+            'esto'          (<-'est')
+            'istima'
+            'istama'
+            'istom'
+            'ista'
+            'iste'
+            'isti'
+            'istu'
+            'isto'          (<-'ist')
+            'kstima'
+            'kstama'
+            'kstom'
+            'ksta'
+            'kste'
+            'ksti'
+            'kstu'
+            'ksto'          (<-'kst')
+            'ostima'
+            'ostama'
+            'ostom'
+            'osta'
+            'oste'
+            'osti'
+            'ostu'
+            'osto'          (<-'ost')
+            'i{sx}tima'
+            'i{sx}tem'
+            'i{sx}ta'
+            'i{sx}te'
+            'i{sx}tu'       (<-'i{sx}t')
+            'ovasmo'
+            'ovaste'
+            'ovahu'
+            'ovati'
+            'ova{sx}e'
+            'ovali'
+            'ovala'
+            'ovale'
+            'ovalo'
+            'ovat'
+            'ovah'
+            'ovao'          (<-'ova')
+            'avijemu'
+            'avijima'
+            'avijega'
+            'avijeg'
+            'avijem'
+            'avemu'
+            'avega'
+            'aveg'
+            'avem'
+            'avijim'
+            'avijih'
+            'avijoj'
+            'avoga'
+            'avome'
+            'avomu'
+            'avima'
+            'avama'
+            'aviji'
+            'avije'
+            'avija'
+            'aviju'
+            'avim'
+            'avih'
+            'avoj'
+            'avom'
+            'avog'
+            'avi'
+            'ava'
+            'avu'
+            'ave'
+            'avo'           (<-'av')
+            'evijemu'
+            'evijima'
+            'evijega'
+            'evijeg'
+            'evijem'
+            'evemu'
+            'evega'
+            'eveg'
+            'evem'
+            'evijim'
+            'evijih'
+            'evijoj'
+            'evoga'
+            'evome'
+            'evomu'
+            'evima'
+            'evama'
+            'eviji'
+            'evije'
+            'evija'
+            'eviju'
+            'evim'
+            'evih'
+            'evoj'
+            'evom'
+            'evog'
+            'evi'
+            'eva'
+            'evu'
+            'eve'
+            'evo'           (<-'ev')
+            'ivijemu'
+            'ivijima'
+            'ivijega'
+            'ivijeg'
+            'ivijem'
+            'ivemu'
+            'ivega'
+            'iveg'
+            'ivem'
+            'ivijim'
+            'ivijih'
+            'ivijoj'
+            'ivoga'
+            'ivome'
+            'ivomu'
+            'ivima'
+            'ivama'
+            'iviji'
+            'ivije'
+            'ivija'
+            'iviju'
+            'ivim'
+            'ivih'
+            'ivoj'
+            'ivom'
+            'ivog'
+            'ivi'
+            'iva'
+            'ivu'
+            'ive'
+            'ivo'           (<-'iv')
+            'ovijemu'
+            'ovijima'
+            'ovijega'
+            'ovijeg'
+            'ovijem'
+            'ovemu'
+            'ovega'
+            'oveg'
+            'ovijim'
+            'ovijih'
+            'ovijoj'
+            'ovoga'
+            'ovome'
+            'ovomu'
+            'ovima'
+            'oviji'
+            'ovije'
+            'ovija'
+            'oviju'
+            'ovim'
+            'ovih'
+            'ovoj'
+            'ovom'
+            'ovog'
+            'ovi'
+            'ova'
+            'ovu'
+            'ove'
+            'ovo'           (<-'ov')
+            'movima'
+            'movom'
+            'mova'
+            'movu'
+            'move'
+            'movi'          (<-'mov')
+            'lovima'
+            'lovom'
+            'lova'
+            'lovu'
+            'love'
+            'lovi'          (<-'lov')
+            'elijemu'
+            'elijima'
+            'elijega'
+            'elijeg'
+            'elijem'
+            'elemu'
+            'elega'
+            'eleg'
+            'elem'
+            'elijim'
+            'elijih'
+            'elijoj'
+            'eloga'
+            'elome'
+            'elomu'
+            'elima'
+            'eliji'
+            'elije'
+            'elija'
+            'eliju'
+            'elim'
+            'elih'
+            'eloj'
+            'elom'
+            'elog'
+            'eli'
+            'ela'
+            'elu'
+            'ele'
+            'elo'           (<-'el')
+            'anjijemu'
+            'anjijima'
+            'anjijega'
+            'anjijeg'
+            'anjijem'
+            'anjemu'
+            'anjega'
+            'anjeg'
+            'anjem'
+            'anjijim'
+            'anjijih'
+            'anjijoj'
+            'anjoga'
+            'anjome'
+            'anjomu'
+            'anjima'
+            'anjiji'
+            'anjije'
+            'anjija'
+            'anjiju'
+            'anjim'
+            'anjih'
+            'anjoj'
+            'anjom'
+            'anjog'
+            'anja'
+            'anje'
+            'anji'
+            'anjo'
+            'anju'          (<-'anj')
+            'enjijemu'
+            'enjijima'
+            'enjijega'
+            'enjijeg'
+            'enjijem'
+            'enjemu'
+            'enjega'
+            'enjeg'
+            'enjem'
+            'enjijim'
+            'enjijih'
+            'enjijoj'
+            'enjoga'
+            'enjome'
+            'enjomu'
+            'enjima'
+            'enjiji'
+            'enjije'
+            'enjija'
+            'enjiju'
+            'enjim'
+            'enjih'
+            'enjoj'
+            'enjom'
+            'enjog'
+            'enja'
+            'enje'
+            'enji'
+            'enjo'
+            'enju'          (<-'enj')
+            '{sx}njijemu'
+            '{sx}njijima'
+            '{sx}njijega'
+            '{sx}njijeg'
+            '{sx}njijem'
+            '{sx}njemu'
+            '{sx}njega'
+            '{sx}njeg'
+            '{sx}njem'
+            '{sx}njijim'
+            '{sx}njijih'
+            '{sx}njijoj'
+            '{sx}njoga'
+            '{sx}njome'
+            '{sx}njomu'
+            '{sx}njima'
+            '{sx}njiji'
+            '{sx}njije'
+            '{sx}njija'
+            '{sx}njiju'
+            '{sx}njim'
+            '{sx}njih'
+            '{sx}njoj'
+            '{sx}njom'
+            '{sx}njog'
+            '{sx}nja'
+            '{sx}nje'
+            '{sx}nji'
+            '{sx}njo'
+            '{sx}nju'       (<-'{sx}nj')
+            'anemu'
+            'anega'
+            'aneg'
+            'anem'          (<-'an')
+            'enemu'
+            'enega'
+            'eneg'
+            'enem'          (<-'en')
+            '{sx}nemu'
+            '{sx}nega'
+            '{sx}neg'
+            '{sx}nem'       (<-'{sx}n')
+            '{cx}inama'
+            '{cx}inome'
+            '{cx}inomu'
+            '{cx}inoga'
+            '{cx}inima'
+            '{cx}inog'
+            '{cx}inom'
+            '{cx}inim'
+            '{cx}inih'
+            '{cx}inoj'
+            '{cx}ina'
+            '{cx}inu'
+            '{cx}ini'
+            '{cx}ino'
+            '{cx}ine'       (<-'{cx}in')
+            'ro{sx}iv{sx}i'
+            'ro{sx}ismo'
+            'ro{sx}iste'
+            'ro{sx}i{sx}e'
+            'ro{sx}imo'
+            'ro{sx}ite'
+            'ro{sx}iti'
+            'ro{sx}ili'
+            'ro{sx}ila'
+            'ro{sx}ilo'
+            'ro{sx}ile'
+            'ro{sx}im'
+            'ro{sx}i{sx}'
+            'ro{sx}it'
+            'ro{sx}ih'
+            'ro{sx}io'      (<-'ro{sx}i')
+            'o{sx}ijemu'
+            'o{sx}ijima'
+            'o{sx}ijega'
+            'o{sx}ijeg'
+            'o{sx}ijem'
+            'o{sx}emu'
+            'o{sx}ega'
+            'o{sx}eg'
+            'o{sx}em'
+            'o{sx}ijim'
+            'o{sx}ijih'
+            'o{sx}ijoj'
+            'o{sx}oga'
+            'o{sx}ome'
+            'o{sx}omu'
+            'o{sx}ima'
+            'o{sx}iji'
+            'o{sx}ije'
+            'o{sx}ija'
+            'o{sx}iju'
+            'o{sx}im'
+            'o{sx}ih'
+            'o{sx}oj'
+            'o{sx}om'
+            'o{sx}og'
+            'o{sx}i'
+            'o{sx}a'
+            'o{sx}u'
+            'o{sx}e'        (<-'o{sx}')
+            'evitijima'
+            'evitijega'
+            'evitijemu'
+            'evitijem'
+            'evitega'
+            'evitemu'
+            'evitem'
+            'evitijim'
+            'evitijih'
+            'evitijoj'
+            'evitijeg'
+            'evitiji'
+            'evitije'
+            'evitija'
+            'evitoga'
+            'evitome'
+            'evitomu'
+            'evitima'
+            'evitog'
+            'evitom'
+            'evitim'
+            'evitih'
+            'evitoj'
+            'eviti'
+            'evite'
+            'evito'
+            'evita'
+            'evitu'         (<-'evit')
+            'ovitijima'
+            'ovitijega'
+            'ovitijemu'
+            'ovitijem'
+            'ovitega'
+            'ovitemu'
+            'ovitem'
+            'ovitijim'
+            'ovitijih'
+            'ovitijoj'
+            'ovitijeg'
+            'ovitiji'
+            'ovitije'
+            'ovitija'
+            'ovitoga'
+            'ovitome'
+            'ovitomu'
+            'ovitima'
+            'ovitog'
+            'ovitom'
+            'ovitim'
+            'ovitih'
+            'ovitoj'
+            'oviti'
+            'ovite'
+            'ovito'
+            'ovita'
+            'ovitu'         (<-'ovit')
+            'astijima'
+            'astijega'
+            'astijemu'
+            'astijem'
+            'astega'
+            'astemu'
+            'astem'
+            'astijim'
+            'astijih'
+            'astijoj'
+            'astijeg'
+            'astiji'
+            'astije'
+            'astija'
+            'astoga'
+            'astome'
+            'astomu'
+            'astima'
+            'astog'
+            'astom'
+            'astim'
+            'astih'
+            'astoj'
+            'asti'
+            'aste'
+            'asto'
+            'asta'
+            'astu'          (<-'ast')
+            'kijemu'
+            'kijima'
+            'kijega'
+            'kijeg'
+            'kijem'
+            'kemu'
+            'kega'
+            'keg'
+            'kem'
+            'kijim'
+            'kijih'
+            'kijoj'
+            'koga'
+            'kome'
+            'komu'
+            'kima'
+            'kiji'
+            'kije'
+            'kija'
+            'kiju'
+            'kim'
+            'kih'
+            'koj'
+            'kom'
+            'kog'
+            'kov'
+            'ki'
+            'ka'
+            'ku'
+            'ke'
+            'ko'            (<-'k')
+            'evaju{cy}i'
+            'evasmo'
+            'evaste'
+            'evajmo'
+            'evajte'
+            'evaju'
+            'evala'
+            'evale'
+            'evali'
+            'evalo'
+            'evamo'
+            'evana'
+            'evane'
+            'evani'
+            'evano'
+            'evate'
+            'evati'
+            'eva{sx}e'
+            'evahu'
+            'evah'
+            'evaj'
+            'evam'
+            'evan'
+            'evao'
+            'evat'
+            'evav'
+            'eva{sx}'       (<-'eva')
+            'avaju{cy}i'
+            'avasmo'
+            'avaste'
+            'avajmo'
+            'avajte'
+            'avaju'
+            'avala'
+            'avale'
+            'avali'
+            'avalo'
+            'avamo'
+            'avana'
+            'avane'
+            'avani'
+            'avano'
+            'avate'
+            'avati'
+            'ava{sx}e'
+            'avahu'
+            'avah'
+            'avaj'
+            'avam'
+            'avan'
+            'avao'
+            'avat'
+            'avav'
+            'ava{sx}'       (<-'ava')
+            'ivaju{cy}i'
+            'ivasmo'
+            'ivaste'
+            'ivajmo'
+            'ivajte'
+            'ivaju'
+            'ivala'
+            'ivale'
+            'ivali'
+            'ivalo'
+            'ivamo'
+            'ivana'
+            'ivane'
+            'ivani'
+            'ivano'
+            'ivate'
+            'ivati'
+            'iva{sx}e'
+            'ivahu'
+            'ivah'
+            'ivaj'
+            'ivam'
+            'ivan'
+            'ivao'
+            'ivat'
+            'ivav'
+            'iva{sx}'       (<-'iva')
+            'uvaju{cy}i'
+            'uvasmo'
+            'uvaste'
+            'uvajmo'
+            'uvajte'
+            'uvaju'
+            'uvala'
+            'uvale'
+            'uvali'
+            'uvalo'
+            'uvamo'
+            'uvana'
+            'uvane'
+            'uvani'
+            'uvano'
+            'uvate'
+            'uvati'
+            'uva{sx}e'
+            'uvahu'
+            'uvah'
+            'uvaj'
+            'uvam'
+            'uvan'
+            'uvao'
+            'uvat'
+            'uvav'
+            'uva{sx}'       (<-'uva')
+            'irujemo'
+            'irujete'
+            'iruju{cy}i'
+            'iraju{cy}i'
+            'irivat'
+            'irujem'
+            'iruje{sx}'
+            'irujmo'
+            'irujte'
+            'irav{sx}i'
+            'irasmo'
+            'iraste'
+            'irati'
+            'iramo'
+            'irate'
+            'iraju'
+            'ira{sx}e'
+            'irahu'
+            'irala'
+            'iralo'
+            'irali'
+            'irale'
+            'iruje'
+            'iruju'
+            'iruj'
+            'iral'
+            'iran'
+            'iram'
+            'ira{sx}'
+            'irat'
+            'irah'
+            'irao'          (<-'ir')
+            'a{cx}ismo'
+            'a{cx}iste'
+            'a{cx}iti'
+            'a{cx}imo'
+            'a{cx}ite'
+            'a{cx}i{sx}e'
+            'a{cx}e{cy}i'
+            'a{cx}ila'
+            'a{cx}ilo'
+            'a{cx}ili'
+            'a{cx}ile'
+            'a{cx}ena'
+            'a{cx}eno'
+            'a{cx}eni'
+            'a{cx}ene'
+            'a{cx}io'
+            'a{cx}im'
+            'a{cx}i{sx}'
+            'a{cx}it'
+            'a{cx}ih'
+            'a{cx}en'
+            'a{cx}i'
+            'a{cx}e'        (<-'a{cx}')
+            'a{cx}av{sx}i'
+            'a{cx}asmo'
+            'a{cx}aste'
+            'a{cx}ahu'
+            'a{cx}ati'
+            'a{cx}amo'
+            'a{cx}ate'
+            'a{cx}a{sx}e'
+            'a{cx}ala'
+            'a{cx}alo'
+            'a{cx}ali'
+            'a{cx}ale'
+            'a{cx}aju'
+            'a{cx}ana'
+            'a{cx}ano'
+            'a{cx}ani'
+            'a{cx}ane'
+            'a{cx}ao'
+            'a{cx}am'
+            'a{cx}a{sx}'
+            'a{cx}at'
+            'a{cx}ah'
+            'a{cx}an'       (<-'a{cx}a')
+            'nuv{sx}i'
+            'nusmo'
+            'nuste'
+            'nu{cy}i'
+            'nimo'
+            'nite'
+            'nemo'
+            'nete'
+            'nula'
+            'nulo'
+            'nule'
+            'nuli'
+            'nuto'
+            'nuti'
+            'nuta'
+            'ne{sx}'
+            'nuo'
+            'nut'           (<-'n')
+            'niv{sx}i'
+            'nismo'
+            'niste'
+            'niti'
+            'nila'
+            'nilo'
+            'nile'
+            'nili'
+            'ni{sx}'
+            'nio'           (<-'ni')
+            'aju{cy}i'
+            'av{sx}i'
+            'asmo'
+            'ajmo'
+            'ajte'
+            'ajem'
+            'aloj'
+            'amo'
+            'ate'
+            'aje'
+            'aju'
+            'ati'
+            'a{sx}e'
+            'ahu'
+            'ala'
+            'ali'
+            'ale'
+            'alo'
+            'ano'
+            'at'
+            'ah'
+            'ao'
+            'aj'
+            'an'
+            'am'
+            'a{sx}'         (<-'a')
+            'uraju{cy}i'
+            'urasmo'
+            'uraste'
+            'urajmo'
+            'urajte'
+            'uramo'
+            'urate'
+            'uraju'
+            'urati'
+            'ura{sx}e'
+            'urahu'
+            'urala'
+            'urali'
+            'urale'
+            'uralo'
+            'urana'
+            'urano'
+            'urani'
+            'urane'
+            'ural'
+            'urat'
+            'urah'
+            'urao'
+            'uraj'
+            'uran'
+            'uram'
+            'ura{sx}'       (<-'ur')
+            'astajasmo'
+            'astajaste'
+            'astajahu'
+            'astajati'
+            'astajemo'
+            'astajete'
+            'astaja{sx}e'
+            'astajali'
+            'astaju{cy}i'
+            'astajala'
+            'astajalo'
+            'astajale'
+            'astajmo'
+            'astajao'
+            'astajem'
+            'astaje{sx}'
+            'astajat'
+            'astajah'
+            'astajte'
+            'astaje'
+            'astaju'        (<-'astaj')
+            'istajasmo'
+            'istajaste'
+            'istajahu'
+            'istajati'
+            'istajemo'
+            'istajete'
+            'istaja{sx}e'
+            'istajali'
+            'istaju{cy}i'
+            'istajala'
+            'istajalo'
+            'istajale'
+            'istajmo'
+            'istajao'
+            'istajem'
+            'istaje{sx}'
+            'istajat'
+            'istajah'
+            'istajte'
+            'istaje'
+            'istaju'        (<-'istaj')
+            'ostajasmo'
+            'ostajaste'
+            'ostajahu'
+            'ostajati'
+            'ostajemo'
+            'ostajete'
+            'ostaja{sx}e'
+            'ostajali'
+            'ostaju{cy}i'
+            'ostajala'
+            'ostajalo'
+            'ostajale'
+            'ostajmo'
+            'ostajao'
+            'ostajem'
+            'ostaje{sx}'
+            'ostajat'
+            'ostajah'
+            'ostajte'
+            'ostaje'
+            'ostaju'        (<-'ostaj')
+            'alama'
+            'alima'
+            'alom'
+            'alu'
+            'al'            (<-'a')
+            'ajevima'
+            'ajevi'
+            'ajeva'
+            'ajeve'
+            'ajama'
+            'ajima'
+            'aja'
+            'aji'           (<-'aj')
+            'astadosmo'
+            'astadoste'
+            'astado{sx}e'
+            'astanemo'
+            'astademo'
+            'astanete'
+            'astadete'
+            'astanimo'
+            'astanite'
+            'astanila'
+            'astav{sx}i'
+            'astanem'
+            'astadem'
+            'astane{sx}'
+            'astade{sx}'
+            'astadoh'
+            'astade'
+            'astati'
+            'astane'
+            'astanu'
+            'astadu'
+            'astala'
+            'astali'
+            'astalo'
+            'astale'
+            'astat'
+            'astao'         (<-'asta')
+            'istadosmo'
+            'istadoste'
+            'istado{sx}e'
+            'istanemo'
+            'istademo'
+            'istanete'
+            'istadete'
+            'istanimo'
+            'istanite'
+            'istanila'
+            'istav{sx}i'
+            'istanem'
+            'istadem'
+            'istane{sx}'
+            'istade{sx}'
+            'istadoh'
+            'istade'
+            'istati'
+            'istane'
+            'istanu'
+            'istadu'
+            'istala'
+            'istali'
+            'istalo'
+            'istale'
+            'istat'
+            'istao'         (<-'ista')
+            'ostadosmo'
+            'ostadoste'
+            'ostado{sx}e'
+            'ostanemo'
+            'ostademo'
+            'ostanete'
+            'ostadete'
+            'ostanimo'
+            'ostanite'
+            'ostanila'
+            'ostav{sx}i'
+            'ostanem'
+            'ostadem'
+            'ostane{sx}'
+            'ostade{sx}'
+            'ostadoh'
+            'ostade'
+            'ostati'
+            'ostane'
+            'ostanu'
+            'ostadu'
+            'ostala'
+            'ostali'
+            'ostalo'
+            'ostale'
+            'ostat'
+            'ostao'         (<-'osta')
+            'tasmo'
+            'taste'
+            'tajmo'
+            'tajte'
+            'tav{sx}i'
+            'tati'
+            'tamo'
+            'tate'
+            'taju'
+            'tala'
+            'talo'
+            'tale'
+            'tali'
+            'tana'
+            'tano'
+            'tani'
+            'tane'
+            'tan'
+            'taj'
+            'tao'
+            'tam'
+            'ta{sx}'
+            'tat'
+            'tah'           (<-'ta')
+            'injasmo'
+            'injaste'
+            'injati'
+            'injemo'
+            'injete'
+            'injali'
+            'injala'
+            'injalo'
+            'injale'
+            'inja{sx}e'
+            'injahu'
+            'injem'
+            'inje{sx}'
+            'injat'
+            'injah'
+            'injao'         (<-'inj')
+            'astemo'
+            'astete'
+            'astimo'
+            'astite'
+            'astu{cy}i'
+            'aste{sx}'
+            'asli'
+            'asla'
+            'aslo'
+            'asle'          (<-'as')
+            'iv{sx}i'
+            'ie{cy}i'
+            'ismo'
+            'imo'
+            'ite'
+            'iti'
+            'ili'
+            'ila'
+            'ilo'
+            'ile'
+            'im'
+            'i{sx}'
+            'it'
+            'ih'
+            'io'            (<-'i')
+            'ijemo'
+            'ijete'
+            'ijem'
+            'ije{sx}'
+            'ijmo'
+            'ijte'
+            'iju'
+            'ije'
+            'ij'
+            'ilu'           (<-'i')
+            'lu{cx}ujete'
+            'lu{cx}uju{cy}i'
+            'lu{cx}ujemo'
+            'lu{cx}ujem'
+            'lu{cx}uje{sx}'
+            'lu{cx}ismo'
+            'lu{cx}iste'
+            'lu{cx}ujmo'
+            'lu{cx}ujte'
+            'lu{cx}uje'
+            'lu{cx}uju'
+            'lu{cx}i{sx}e'
+            'lu{cx}iti'
+            'lu{cx}imo'
+            'lu{cx}ite'
+            'lu{cx}ila'
+            'lu{cx}ilo'
+            'lu{cx}ili'
+            'lu{cx}ile'
+            'lu{cx}ena'
+            'lu{cx}eno'
+            'lu{cx}eni'
+            'lu{cx}ene'
+            'lu{cx}uj'
+            'lu{cx}io'
+            'lu{cx}en'
+            'lu{cx}im'
+            'lu{cx}i{sx}'
+            'lu{cx}it'
+            'lu{cx}ih'
+            'lu{cx}e'
+            'lu{cx}i'       (<-'lu{cx}')
+            'jetismo'
+            'jetiste'
+            'jeti{sx}e'
+            'jetimo'
+            'jetite'
+            'jetiti'
+            'jetili'
+            'jetila'
+            'jetilo'
+            'jetile'
+            'jetim'
+            'jeti{sx}'
+            'jetit'
+            'jetih'
+            'jetio'         (<-'jeti')
+            'emo'
+            'em'
+            'e{sx}'
+            'elama'
+            'el'            (<-'e')
+            'ilama'
+            'ilima'
+            'ilom'
+            'il'            (<-'i')
+            'atijega'
+            'atijemu'
+            'atijima'
+            'atijeg'
+            'atijem'
+            'atega'
+            'atemu'
+            'ateg'
+            'atem'
+            'atijih'
+            'atijim'
+            'atima'
+            'atoga'
+            'atome'
+            'atomu'
+            'atiji'
+            'atije'
+            'atija'
+            'atiju'
+            'atoj'
+            'atog'
+            'atom'
+            'atim'
+            'atih'
+            'ata'
+            'atu'
+            'ato'           (<-'at')
+            'etav{sx}i'
+            'etu{cy}i'
+            'etemo'
+            'etimo'
+            'etem'
+            'ete{sx}'       (<-'et')
+            'lucujuci'
+            'lucujemo'
+            'lucujete'
+            'lucujem'
+            'lucujes'
+            'lucujmo'
+            'lucujte'
+            'lucismo'
+            'luciste'
+            'luciti'
+            'lucite'
+            'lucise'
+            'lucuje'
+            'lucuju'
+            'lucila'
+            'lucile'
+            'lucili'
+            'lucilo'
+            'lucena'
+            'luceni'
+            'lucene'
+            'luceno'
+            'lucimo'
+            'lucim'
+            'lucis'
+            'lucih'
+            'lucit'
+            'lucio'
+            'lucuj'
+            'lucen'
+            'luce'
+            'luci'          (no_diacritics <-'luc')
+            'snjijima'
+            'snjijemu'
+            'snjijega'
+            'snjijim'
+            'snjijih'
+            'snjijeg'
+            'snjijoj'
+            'snjiji'
+            'snjija'
+            'snjije'
+            'snjiju'
+            'snjima'
+            'snjemu'
+            'snjomu'
+            'snjome'
+            'snjega'
+            'snjoga'
+            'snjih'
+            'snjim'
+            'snjem'
+            'snjom'
+            'snjeg'
+            'snjog'
+            'snjoj'
+            'snja'
+            'snje'
+            'snji'
+            'snjo'
+            'snju'          (no_diacritics <-'snj')
+            'osijima'
+            'osijemu'
+            'osijega'
+            'snjijem'
+            'osijih'
+            'osijim'
+            'osijem'
+            'osijeg'
+            'osijoj'
+            'osima'
+            'osemu'
+            'osomu'
+            'osome'
+            'osega'
+            'osoga'
+            'osija'
+            'osije'
+            'osiji'
+            'osiju'
+            'osih'
+            'osim'
+            'osem'
+            'osom'
+            'oseg'
+            'osog'
+            'osoj'
+            'osa'
+            'ose'
+            'osi'
+            'osu'           (no_diacritics <-'os')
+            'acismo'
+            'aciste'
+            'acima'
+            'acimo'
+            'acome'
+            'acomu'
+            'acite'
+            'aciti'
+            'acise'
+            'acila'
+            'acile'
+            'acili'
+            'acilo'
+            'acega'
+            'acene'
+            'aceci'
+            'aceni'
+            'acemu'
+            'acena'
+            'aceno'
+            'acoga'
+            'acoj'
+            'acih'
+            'acem'
+            'acom'
+            'acen'
+            'acog'
+            'acit'
+            'acio'
+            'aceg'
+            'acim'
+            'acuh'
+            'acis'
+            'ace'
+            'aca'
+            'aci'           (no_diacritics <-'ac')
+            'ecome'
+            'ecoga'
+            'ecemu'
+            'ecima'
+            'ecega'
+            'ecomu'
+            'ecoj'
+            'ecuh'
+            'ecom'
+            'ecog'
+            'eceg'
+            'ecih'
+            'ecem'
+            'ecim'
+            'eca'
+            'ece'           (no_diacritics <-'ec')
+            'ucomu'
+            'ucome'
+            'ucima'
+            'ucoga'
+            'ucega'
+            'ucemu'
+            'ucih'
+            'ucog'
+            'uceg'
+            'ucom'
+            'ucem'
+            'ucim'
+            'ucuh'
+            'ucoj'
+            'uca'
+            'uce'           (no_diacritics <-'uc')
+            'rosismo'
+            'rosivsi'
+            'rosiste'
+            'rositi'
+            'rosili'
+            'rosise'
+            'rosite'
+            'rosilo'
+            'rosimo'
+            'rosile'
+            'rosila'
+            'rosit'
+            'rosis'
+            'rosio'
+            'rosim'
+            'rosih'         (no_diacritics <-'rosi')
+            'acavsi'
+            'acaste'
+            'acasmo'
+            'acaju'
+            'acane'
+            'acate'
+            'acali'
+            'acani'
+            'acati'
+            'acale'
+            'acahu'
+            'acase'
+            'acano'
+            'acamo'
+            'acalo'
+            'acana'
+            'acala'
+            'acam'
+            'acan'
+            'acao'
+            'acas'
+            'acat'
+            'acah'          (no_diacritics <-'aca')
+            'jasima'
+            'jasama'
+            'jasem'
+            'jasom'
+            'jase'
+            'jasi'
+            'jasa'
+            'jasu'          (no_diacritics <-'jas')
+            'tasima'
+            'tasama'
+            'tasem'
+            'tasom'
+            'tase'
+            'tasa'
+            'tasu'
+            'tasi'          (no_diacritics <-'tas')
+            'gasima'
+            'gasama'
+            'gasem'
+            'gasom'
+            'gasi'
+            'gasu'
+            'gase'
+            'gasa'          (no_diacritics <-'gas')
+            'nasama'
+            'nasima'
+            'nasem'
+            'nasom'
+            'nasu'
+            'nasi'
+            'nase'
+            'nasa'          (no_diacritics <-'nas')
+            'kasama'
+            'kasima'
+            'kasom'
+            'kasem'
+            'kasi'
+            'kasu'
+            'kase'
+            'kasa'          (no_diacritics <-'kas')
+            'vasama'
+            'vasima'
+            'vasom'
+            'vasem'
+            'vasi'
+            'vase'
+            'vasa'
+            'vasu'          (no_diacritics <-'vas')
+            'basama'
+            'basima'
+            'basom'
+            'basem'
+            'basi'
+            'base'
+            'basu'
+            'basa'          (no_diacritics <-'bas')
+            'astuci'
+            'astes'         (no_diacritics <-'as')
+            'cinima'
+            'cinome'
+            'cinama'
+            'cinomu'
+            'cinoga'
+            'cinom'
+            'cinih'
+            'cinim'
+            'cinog'
+            'cinoj'
+            'cino'
+            'cini'
+            'cinu'
+            'cine'
+            'cina'          (no_diacritics <-'cin')
+            'astajase'
+            'astajuci'
+            'astajes'       (no_diacritics <-'astaj')
+            'istajase'
+            'istajuci'
+            'istajes'       (no_diacritics <-'istaj')
+            'ostajase'
+            'ostajuci'
+            'ostajes'       (no_diacritics <-'ostaj')
+            'astadose'
+            'astades'
+            'astanes'
+            'astavsi'       (no_diacritics <-'asta')
+            'istadose'
+            'istades'
+            'istanes'
+            'istavsi'       (no_diacritics <-'ista')
+            'ostadose'
+            'ostades'
+            'ostanes'
+            'ostavsi'       (no_diacritics <-'osta')
+            'avajuci'
+            'avase'
+            'avas'          (no_diacritics <-'ava')
+            'evajuci'
+            'evase'
+            'evas'          (no_diacritics <-'eva')
+            'ivajuci'
+            'ivase'
+            'ivas'          (no_diacritics <-'iva')
+            'uvajuci'
+            'uvase'
+            'uvas'          (no_diacritics <-'uva')
+            'ovase'         (no_diacritics <-'ova')
+            'jetise'
+            'jetis'         (no_diacritics <-'jeti')
+            'injase'
+            'injes'         (no_diacritics <-'inj')
+            'istem'         (no_diacritics <-'ist')
+            'esama'
+            'esem'
+            'esi'           (no_diacritics <-'es')
+            'etavsi'
+            'etuci'
+            'etes'          (no_diacritics <-'et')
+            'isama'
+            'isem'
+            'isi'           (no_diacritics <-'is')
+            'irajuci'
+            'irujuci'
+            'irujes'
+            'iravsi'
+            'irase'
+            'iras'          (no_diacritics <-'ir')
+            'urajuci'
+            'urase'
+            'uras'          (no_diacritics <-'ur')
+            'ujuci'
+            'ujes'          (no_diacritics <-'uj')
+            'nivsi'
+            'nis'           (no_diacritics <-'ni')
+            'snega'
+            'snemu'
+            'snem'
+            'sneg'          (no_diacritics <-'sn')
+            'tavsi'
+            'tas'           (no_diacritics <-'ta')
+            'ajuci'
+            'avsi'
+            'ase'
+            'as'            (no_diacritics <-'a')
+            'ijes'
+            'ivsi'
+            'ieci'
+            'is'            (no_diacritics <-'i')
+            'es'            (no_diacritics <-'e')
+            'nuvsi'
+            'nuci'
+            'nes'           (no_diacritics <-'n')
+        )
+    )
+
+    define Step_3 as (
+        [substring] R1 among (
+            'enom'
+            'enoj'
+            'enog'
+            'enim'
+            'enih'
+            'anoj'
+            'anog'
+            'anim'
+            'anih'
+            'ost'
+            'eno'
+            'eni'
+            'oga'
+            'ima'
+            'enu'
+            'ena'
+            'ama'
+            'ano'
+            'ani'
+            'om'
+            'og'
+            'u'
+            'o'
+            'i'
+            'e'
+            'a'             (<-'')
+        )
+    )
+)
+
+define stem as (
+    do cyr_to_lat
+    do prelude
+    do mark_regions
+    backwards (
+        do Step_1
+        do (Step_2 or Step_3)
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/serbian/stemmer.tt b/algorithms/serbian/stemmer.tt new file mode 100644 index 0000000..c7298ba --- /dev/null +++ b/algorithms/serbian/stemmer.tt @@ -0,0 +1,273 @@ +[% header('Serbian stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([ +'ambasade', +'ambasadi', +'abdominalna', +'abdominalno', +'abdominalnih', +'abdominalnim', +'abdominalnog', +'abdominalnoj', +'abdominalnom', +'abdominalnu', +'abeceda', +'abecede', +'abecedi', +'abecedni', +'abecednih', +'abecednim', +'abecedno', +'abecednog', +'abecednom', +'abecedom', +'abecedu', +'abecendom', +'ablendovanje', +'ablenduje', +'ablenduju', +'ablendujući', +'abnormalan', +'abnormalije', +'abnormalijom', +'abnormalna', +'abnormalne', +'abnormalni', +'abnormalnih', +'abnormalnim', +'abnormalno', +'abnormalnog', +'abnormalnom', +'abnormalnost', +'abnormalnosti', +'abnormalnostima', +'abnormalnu', +], [ +'obustavila', +'obustavile', +'obustavili', +'obustavilo', +'obustavio', +'ocenjivala', +'ocenjivali', +'ocenjivan', +'ocenjivana', +'ocenjivane', +'ocenjivani', +'ocenjivano', +'ocenjivanja', +'ocenjivanje', +'ocenjivanju', +'ocenjivao', +'ocenjivati', +'ocenjuje', +'ocenjujem', +'ocenjujemo', +'ocenjuješ', +'ocenjujete', +'ocenjuju', +'ocenjujući', +'očajan', +'očajna', +'očajne', +'očajni', +'očajno', +'padobrana', +'padobranaca', +'padobranci', +'padobrancima', +'padobranom', +'padobranskim', +'padobransku', +'padobranu', +'paganska', +'paganske', +'paganski', +'paganskih', +]) %] + +

The stemming algorithm

+ +

+The Serbian language is a Slavic language (Indo-European) of the South Slavic
+subgroup. It is highly inflected and uses similar rules for morphological
+derivation and flexion as other Slavic languages, especially ones derived from
+the Serbo-Croatian language used in the former Yugoslavia. Because of this
+highly inflected characteristic a stemmer for the Serbian language will have many
+more rules than stemmers for less inflected languages.
+

+ +

+The Serbian Stemmer described in this document is based on the Croatian
+Stemmer which is published under the GNU Lesser General Public License.
+Mark Regions, Morphological Changes (Step_1) and Stemming
+(Step_2) routines are based on the Croatian Stemming Algorithm. In
+addition, some of the existing rules for Morphological Changes and Stemming
+(Step_1 and Step_2 among lists) have been modified and new rules have
+been added for the needs of the Serbian Stemmer.
+

+ +

+Latin alphabet in Serbian includes the following letters with diacritics: +

+ +
+ č   ć   đ   š   ž +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u +
+ +

+There is also the letter - r - which isn't a vowel but is sometimes used for syllabification.
+

+ +

Main Routines of Serbian Stemming Algorithm are:

+ +
    +
  1. Conversion of Cyrillic alphabet to Latin alphabet +

    +The Serbian language uses both Cyrillic and Latin alphabets, but +these days most people use the Latin alphabet on their PCs, Phones, etc. This +algorithm is developed mostly for the purposes of the Information Retrieval, +therefore the first thing it does is to convert Cyrillic letters to +Latin. +

    +
  2. + +
  3. Prelude +

    +In Serbian language there are two dialects: Ekavian and +Ijekavian. For example words: +

      +
    • senka (Ekavian) +
    • sjenka (Ijekavian) +
    +have the same meaning (Shadow), also words: +
      +
    • mleko (Ekavian) +
    • mlijeko (Ijekavian) +
+have the same meaning (Milk) but are spelled differently and because
+the most widely used dialect in Serbia is Ekavian the next thing to do is to
+replace the Ijekavian dialect with it.
+

    + +

+These days it is also common, although not valid, to use a combination of letters
+"d" and "j" instead of a single letter "đ". For example
+people will more often write "Novak Djoković" instead of "Novak
+Đoković" and because this algorithm is developed with Information Retrieval
+in mind they should be treated as the same terms.
+

    +
  4. +
  5. Mark Regions +

    +R1 is either: +

      +
    1. a region after the first vowel if there are at least two letters outside +of it, otherwise it is a region after the first non-vowel following a vowel, +
    2. a region after the first "r" if there are at least two letters +outside of it, otherwise it is a region after the first non-"r" +following an "r". +
    +

    +Note that every suffix which the stemmer can remove contains at least one +vowel, so in the degenerate case of an input which contains no vowels there +is nothing to be done. The Snowball implementation of this stemmer sets +R1 to be a zero length region at the end of the word if the input +contains no vowels and no "r". +

    +

    +In Serbian language there are some words in which "r" letter is used for +syllabification and in such words vowels can appear at the very end - for +example word "grmlje". +

    + +

+So before the algorithm decides what R1 will be, it needs to look if and
+where the "r" letter occurs and where the first vowel is. If it finds "r"
+that occurred before the first vowel and there is at least one letter between
+them this means that "r" is used for syllabification and R1 is
+2), otherwise R1 is 1).
+

    + +

    +For example: +

      +
    • "tr|go|va|čki" - in this word "tr" is the first syllable + which means that "r" is used for syllabification and R1 = + "govački" + +
    • "tre|ne|rka" - in this word there is a letter "r" before the + first vowel but there aren't any letters between them which means that + "r" isn't used for syllabification and R1 = "nerka". + +
• "r|ta|njski" - in this word "r" is the first syllable but if
+    we use "tanjski" as R1 it won't leave enough letters outside
+    of it, so we need to shrink it down to a region after the first
+    non-"r" following an "r" which is in this case =
+    "anjski".
    • "a|vi|on" - similar to the previous case but with a vowel instead + of an "r". +
+
+Inside the Mark Regions routine there is a test routine that is used to
+check for letters with diacritics and is used later to apply certain rules in
+stemming. The result of this test routine is stored in the no_diacritics flag.
+This test routine is used because people these days tend to use letters without
+diacritics (instead of the proper ones with diacritics) and we need to take
+this into account also.
+

    +
  6. + +
  7. Morphological Changes +

+The very last thing to do, before any stemming is done, is to apply morphological changes.
+These changes are applied so that we get the same stems for different forms of a
+word.
+

    + +

    +For example words: +

      +
    • "pravilan" (Masculine, Singular) +
    • "pravilna" (Feminine, Singular) +
    • "pravilno" (Neuter, Singular) +
    +should have the same stem. To get that result the algorithm will first change +word "pravilan" (Masculine, Singular) to "pravilni" (Masculine, +Plural) and after that the word will be stemmed. +

    +
  8. + +
  9. Stemming +

    +There are two steps for stemming. The first contains most of the rules and is +the primary stemming routine and the second one will try to stem the word only +if the first one failed to do so - whether it was because there were no rules +that could be applied or the rule overlapped the R1 region. The second +step contains a few rules that will do proper stemming for most words that +couldn't be stemmed using the rules from the first step. +

    +
  10. +
+ +

The full algorithm in Snowball

+ +[% highlight_file('serbian') %] + +[% footer %] diff --git a/algorithms/spanish/stemmer.html b/algorithms/spanish/stemmer.html new file mode 100644 index 0000000..bf754fb --- /dev/null +++ b/algorithms/spanish/stemmer.html @@ -0,0 +1,692 @@ + + + + + + + + + + Spanish stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Spanish stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Spanish vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+che
+checa
+checar
+checo
+checoslovaquia
+chedraoui
+chefs
+cheliabinsk
+chelo
+chemical
+chemicalweek
+chemise
+chepo
+cheque
+chequeo
+cheques
+cheraw
+chesca
+chester
+chetumal
+chetumaleños
+chevrolet
+cheyene
+cheyenne
+chi
+chía
+chiapaneca
+chiapas
+chiba
+chic
+chica
+chicago
+chicana
+chicano
+chicas
+chicharrones
+chichen
+chichimecas
+chicles
+chico
+
+che
+chec
+chec
+chec
+checoslovaqui
+chedraoui
+chefs
+cheliabinsk
+chel
+chemical
+chemicalweek
+chemis
+chep
+chequ
+cheque
+chequ
+cheraw
+chesc
+chest
+chetumal
+chetumaleñ
+chevrolet
+cheyen
+cheyenn
+chi
+chi
+chiapanec
+chiap
+chib
+chic
+chic
+chicag
+chican
+chican
+chic
+chicharron
+chich
+chichimec
+chicl
+chic
+
+torá
+tórax
+torcer
+toreado
+toreados
+toreándolo
+torear
+toreara
+torearlo
+toreó
+torero
+toreros
+torio
+tormenta
+tormentas
+tornado
+tornados
+tornar
+tornen
+torneo
+torneos
+tornillo
+tornillos
+torniquete
+torno
+toro
+toronto
+toros
+torpedearon
+torpeza
+torrado
+torralba
+torre
+torrencial
+torrenciales
+torrente
+torreon
+torreón
+torres
+torrescano
+
+tor
+torax
+torc
+tor
+tor
+tor
+tor
+tor
+tor
+tore
+torer
+torer
+tori
+torment
+torment
+torn
+torn
+torn
+torn
+torne
+torne
+tornill
+tornill
+torniquet
+torn
+tor
+toront
+tor
+torped
+torpez
+torr
+torralb
+torr
+torrencial
+torrencial
+torrent
+torreon
+torreon
+torr
+torrescan
+
+
+ +

The stemming algorithm

+ +

+Letters in Spanish include the following accented forms, +

+ +
+ á   é   í   ó   ú   ü   ñ +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   á   é   í   ó   ú   ü +
+ +

+R2 is defined in the usual way — +see the note on R1 and R2. +

+ +

+RV is defined as follows (and this is not the same as the + French stemmer +definition): +

+ +

+If the second letter is a consonant, RV is the region after the next +following vowel, or if the first two letters are vowels, RV is the region +after the next consonant, and otherwise (consonant-vowel case) RV is the +region after the third letter. But RV is the end of the word if these +positions cannot be found. +

+ +

+For example, +

+ +
+    m a c h o     o l i v a     t r a b a j o     á u r e o
+         |...|         |...|         |.......|         |...|
+
+ +

+Always do steps 0 and 1. +

+ +

+Step 0: Attached pronoun +

+ +
+ Search for the longest among the following suffixes +
+ me   se   sela   selo   selas   selos   la   le   lo   las   les   los   nos +

+ and delete it, if it comes after one of
+
+ (a) iéndo   ándo   ár   ér   ír
+ (b) ando   iendo   ar   er   ir
+ (c) yendo following u +
+

+ in RV. In the case of (c), yendo must lie in RV, but the preceding + u can be outside it. +

+ +

+ In the case of (a), deletion is followed by removing the acute accent + (for example, haciéndolahaciendo). +

+
+ +

+Step 1: Standard suffix removal +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
anza   anzas   ico   ica   icos   icas   ismo   ismos   able   ables   ible   ibles   ista +   istas   oso   osa   osos   osas   amiento   amientos   imiento   + imientos +
delete if in R2 +
adora   ador   ación   adoras   adores   aciones   ante   antes   ancia   ancias +
delete if in R2 +
if preceded by ic, delete if in R2 +
logía   logías +
replace with log if in R2 +
ución   uciones +
replace with u if in R2 +
encia   encias +
replace with ente if in R2 +
amente +
delete if in R1 +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by os, ic or ad, delete if in R2 +
mente +
delete if in R2 +
if preceded by ante, able or ible, delete if in R2 +
idad   idades +
delete if in R2 +
if preceded by abil, ic or iv, delete if in R2 +
iva   ivo   ivas   ivos +
delete if in R2 +
if preceded by at, delete if in R2 +
+
+ +

+Do step 2a if no ending was removed by step 1. +

+ +

+Step 2a: Verb suffixes beginning y +

+ +
+ Search for the longest among the following suffixes in RV, and if found, + delete if preceded by u. +
+ ya   ye   yan   yen   yeron   yendo   yo   yó   yas   yes   yais   + yamos +
+ (Note that the preceding u need not be in RV.) +
+ +

+Do Step 2b if step 2a was done, but failed to remove a suffix. +

+ +

+Step 2b: Other verb suffixes +

+ +
+ Search for the longest among the following suffixes in RV, and perform the + action indicated. +
+
en   es   éis   emos +
delete, and if preceded by gu delete the u (the gu need not be in + RV) +
arían   arías   arán   arás   aríais   aría   aréis   aríamos   aremos +   ará   aré +   erían   erías   erán   erás   eríais   ería   eréis   eríamos   eremos +   erá   eré +   irían   irías   irán   irás   iríais   iría   iréis   iríamos   iremos +   irá   iré +   aba   ada   ida   ía   ara   iera   ad   ed   id   ase   iese   aste   iste   an   aban   ían +   aran   ieran   asen   iesen   aron   ieron   ado   ido   ando   iendo   ió   ar   er   ir   as +   abas   adas   idas   ías   aras   ieras   ases   ieses   ís   áis   abais   íais +   arais   ierais     aseis   ieseis   asteis   isteis   ados   idos   amos   ábamos +   íamos   imos   áramos   iéramos   iésemos   ásemos +
delete +
+
+ +

+Always do step 3. +

+ +

+Step 3: residual suffix +

+ +
+ Search for the longest among the following suffixes in RV, and perform the + action indicated. +
+
os   a   o   á   í   ó +
delete if in RV +
e   é +
delete if in RV, and if preceded by gu with the u in RV delete the u +
+
+ +

+And finally: +

+ +
+ Remove acute accents +
+ +

The same algorithm in Snowball

+ +
routines (
+           postlude mark_regions
+           RV R1 R2
+           attached_pronoun
+           standard_suffix
+           y_verb_suffix
+           verb_suffix
+           residual_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a'   '{U+00E1}'  // a-acute
+stringdef e'   '{U+00E9}'  // e-acute
+stringdef i'   '{U+00ED}'  // i-acute
+stringdef o'   '{U+00F3}'  // o-acute
+stringdef u'   '{U+00FA}'  // u-acute
+stringdef u"   '{U+00FC}'  // u-diaeresis
+stringdef n~   '{U+00F1}'  // n-tilde
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+    $p2 = limit  // defaults
+
+    do (
+        ( v (non-v gopast v) or (v gopast non-v) )
+        or
+        ( non-v (non-v gopast v) or (v next) )
+        setmark pV
+    )
+    do (
+        gopast v gopast non-v setmark p1
+        gopast v gopast non-v setmark p2
+    )
+)
+
+define postlude as repeat (
+    [substring] among(
+        '{a'}' (<- 'a')
+        '{e'}' (<- 'e')
+        '{i'}' (<- 'i')
+        '{o'}' (<- 'o')
+        '{u'}' (<- 'u')
+        // and possibly {u"}->u here, or in prelude
+        ''     (next)
+    ) //or next
+)
+
+backwardmode (
+
+    define RV as $pV <= cursor
+    define R1 as $p1 <= cursor
+    define R2 as $p2 <= cursor
+
+    define attached_pronoun as (
+        [substring] among(
+            'me' 'se'  'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
+            'las' 'les' 'los' 'nos'
+        )
+        substring RV among(
+            'i{e'}ndo' (] <- 'iendo')
+            '{a'}ndo'  (] <- 'ando')
+            '{a'}r'    (] <- 'ar')
+            '{e'}r'    (] <- 'er')
+            '{i'}r'    (] <- 'ir')
+            'ando'
+            'iendo'
+            'ar' 'er' 'ir'
+                       (delete)
+            'yendo'    ('u' delete)
+        )
+    )
+
+    define standard_suffix as (
+        [substring] among(
+
+            'anza' 'anzas'
+            'ico' 'ica' 'icos' 'icas'
+            'ismo' 'ismos'
+            'able' 'ables'
+            'ible' 'ibles'
+            'ista' 'istas'
+            'oso' 'osa' 'osos' 'osas'
+            'amiento' 'amientos'
+            'imiento' 'imientos'
+            (
+                R2 delete
+            )
+            'adora' 'ador' 'aci{o'}n'
+            'adoras' 'adores' 'aciones'
+            'ante' 'antes' 'ancia' 'ancias'// Note 1
+            (
+                R2 delete
+                try ( ['ic'] R2 delete )
+            )
+            'log{i'}a'
+            'log{i'}as'
+            (
+                R2 <- 'log'
+            )
+            'uci{o'}n' 'uciones'
+            (
+                R2 <- 'u'
+            )
+            'encia' 'encias'
+            (
+                R2 <- 'ente'
+            )
+            'amente'
+            (
+                R1 delete
+                try (
+                    [substring] R2 delete among(
+                        'iv' (['at'] R2 delete)
+                        'os'
+                        'ic'
+                        'ad'
+                    )
+                )
+            )
+            'mente'
+            (
+                R2 delete
+                try (
+                    [substring] among(
+                        'ante' // Note 1
+                        'able'
+                        'ible' (R2 delete)
+                    )
+                )
+            )
+            'idad'
+            'idades'
+            (
+                R2 delete
+                try (
+                    [substring] among(
+                        'abil'
+                        'ic'
+                        'iv'   (R2 delete)
+                    )
+                )
+            )
+            'iva' 'ivo'
+            'ivas' 'ivos'
+            (
+                R2 delete
+                try (
+                    ['at'] R2 delete // but not a further   ['ic'] R2 delete
+                )
+            )
+        )
+    )
+
+    define y_verb_suffix as (
+        setlimit tomark pV for ([substring]) among(
+            'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
+            'yas' 'yes' 'yais' 'yamos'
+                ('u' delete)
+        )
+    )
+
+    define verb_suffix as (
+        setlimit tomark pV for ([substring]) among(
+
+            'en' 'es' '{e'}is' 'emos'
+                (try ('u' test 'g') ] delete)
+
+            'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+            'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
+            'ar{e'}'
+            'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+            'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+            'er{e'}'
+            'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+            'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+            'ir{e'}'
+
+            'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
+            'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+            'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+            'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
+            'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+            'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+            'ierais'  'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+            'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
+            '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+                (delete)
+        )
+    )
+
+    define residual_suffix as (
+        [substring] among(
+            'os'
+            'a' 'o' '{a'}' '{i'}' '{o'}'
+                ( RV delete )
+            'e' '{e'}'
+                ( RV delete try( ['u'] test 'g' RV delete ) )
+        )
+    )
+)
+
+define stem as (
+    do mark_regions
+    backwards (
+        do attached_pronoun
+        do ( standard_suffix or
+             y_verb_suffix or
+             verb_suffix
+           )
+        do residual_suffix
+    )
+    do postlude
+)
+
+/*
+    Note 1: additions of 15 Jun 2005
+*/
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/spanish/stemmer.tt b/algorithms/spanish/stemmer.tt new file mode 100644 index 0000000..2098c3b --- /dev/null +++ b/algorithms/spanish/stemmer.tt @@ -0,0 +1,205 @@ +[% header('Spanish stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'che', 'torá']) %] + +

The stemming algorithm

+ +

+Letters in Spanish include the following accented forms, +

+ +
+ á   é   í   ó   ú   ü   ñ +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   á   é   í   ó   ú   ü +
+ +

+R2 is defined in the usual way — +see the note on R1 and R2. +

+ +

+RV is defined as follows (and this is not the same as the + French stemmer +definition): +

+ +

+If the second letter is a consonant, RV is the region after the next +following vowel, or if the first two letters are vowels, RV is the region +after the next consonant, and otherwise (consonant-vowel case) RV is the +region after the third letter. But RV is the end of the word if these +positions cannot be found. +

+ +

+For example, +

+ +
+    m a c h o     o l i v a     t r a b a j o     á u r e o
+         |...|         |...|         |.......|         |...|
+
+ +

+Always do steps 0 and 1. +

+ +

+Step 0: Attached pronoun +

+ +
+ Search for the longest among the following suffixes +
+ me   se   sela   selo   selas   selos   la   le   lo   las   les   los   nos +

+ and delete it, if it comes after one of
+
+ (a) iéndo   ándo   ár   ér   ír
+ (b) ando   iendo   ar   er   ir
+ (c) yendo following u +
+

+ in RV. In the case of (c), yendo must lie in RV, but the preceding + u can be outside it. +

+ +

+ In the case of (a), deletion is followed by removing the acute accent + (for example, haciéndolahaciendo). +

+
+ +

+Step 1: Standard suffix removal +

+ +
+ Search for the longest among the following suffixes, and perform the + action indicated. +
+
anza   anzas   ico   ica   icos   icas   ismo   ismos   able   ables   ible   ibles   ista +   istas   oso   osa   osos   osas   amiento   amientos   imiento   + imientos +
delete if in R2 +
adora   ador   ación   adoras   adores   aciones   ante   antes   ancia   ancias +
delete if in R2 +
if preceded by ic, delete if in R2 +
logía   logías +
replace with log if in R2 +
ución   uciones +
replace with u if in R2 +
encia   encias +
replace with ente if in R2 +
amente +
delete if in R1 +
if preceded by iv, delete if in R2 (and if further preceded by at, + delete if in R2), otherwise, +
if preceded by os, ic or ad, delete if in R2 +
mente +
delete if in R2 +
if preceded by ante, able or ible, delete if in R2 +
idad   idades +
delete if in R2 +
if preceded by abil, ic or iv, delete if in R2 +
iva   ivo   ivas   ivos +
delete if in R2 +
if preceded by at, delete if in R2 +
+
+ +

+Do step 2a if no ending was removed by step 1. +

+ +

+Step 2a: Verb suffixes beginning y +

+ +
+ Search for the longest among the following suffixes in RV, and if found, + delete if preceded by u. +
+ ya   ye   yan   yen   yeron   yendo   yo   yó   yas   yes   yais   + yamos +
+ (Note that the preceding u need not be in RV.) +
+ +

+Do Step 2b if step 2a was done, but failed to remove a suffix. +

+ +

+Step 2b: Other verb suffixes +

+ +
+ Search for the longest among the following suffixes in RV, and perform the + action indicated. +
+
en   es   éis   emos +
delete, and if preceded by gu delete the u (the gu need not be in + RV) +
arían   arías   arán   arás   aríais   aría   aréis   aríamos   aremos +   ará   aré +   erían   erías   erán   erás   eríais   ería   eréis   eríamos   eremos +   erá   eré +   irían   irías   irán   irás   iríais   iría   iréis   iríamos   iremos +   irá   iré +   aba   ada   ida   ía   ara   iera   ad   ed   id   ase   iese   aste   iste   an   aban   ían +   aran   ieran   asen   iesen   aron   ieron   ado   ido   ando   iendo   ió   ar   er   ir   as +   abas   adas   idas   ías   aras   ieras   ases   ieses   ís   áis   abais   íais +   arais   ierais     aseis   ieseis   asteis   isteis   ados   idos   amos   ábamos +   íamos   imos   áramos   iéramos   iésemos   ásemos +
delete +
+
+ +

+Always do step 3. +

+ +

+Step 3: residual suffix +

+ +
+ Search for the longest among the following suffixes in RV, and perform the + action indicated. +
+
os   a   o   á   í   ó +
delete if in RV +
e   é +
delete if in RV, and if preceded by gu with the u in RV delete the u +
+
+ +

+And finally: +

+ +
+ Remove acute accents +
+ +

The same algorithm in Snowball

+ +[% highlight_file('spanish') %] + +[% footer %] diff --git a/algorithms/spanish/stop.txt b/algorithms/spanish/stop.txt new file mode 100644 index 0000000..fd323a4 --- /dev/null +++ b/algorithms/spanish/stop.txt @@ -0,0 +1,348 @@ + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. + +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/algorithms/swedish/stemmer.html b/algorithms/swedish/stemmer.html new file mode 100644 index 0000000..0ca315e --- /dev/null +++ b/algorithms/swedish/stemmer.html @@ -0,0 +1,436 @@ + + + + + + + + + + Swedish stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Swedish stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Swedish vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+jakt
+jaktbössa
+jakten
+jakthund
+jaktkarl
+jaktkarlar
+jaktkarlarne
+jaktkarlens
+jaktlöjtnant
+jaktlöjtnanten
+jaktlöjtnantens
+jalusi
+jalusien
+jalusier
+jalusierna
+jamaika
+jamat
+jamrande
+jamt
+jande
+januari
+japanska
+jaquette
+jaquettekappa
+jargong
+jasmin
+jasminen
+jasminer
+jasminhäck
+jaspis
+jaså
+javäl
+jazzvindens
+jcrn
+jcsus
+je
+jemföra
+jemföras
+jemförelse
+jemförelser
+
+jakt
+jaktböss
+jakt
+jakthund
+jaktkarl
+jaktkarl
+jaktkarl
+jaktkarl
+jaktlöjtnant
+jaktlöjtnant
+jaktlöjtnant
+jalusi
+jalusi
+jalusi
+jalusi
+jamaik
+jam
+jamr
+jamt
+jand
+januari
+japansk
+jaquet
+jaquettekapp
+jargong
+jasmin
+jasmin
+jasmin
+jasminhäck
+jaspis
+jaså
+javäl
+jazzvind
+jcrn
+jcsus
+je
+jemför
+jemför
+jemför
+jemför
+
+klo
+kloaken
+klock
+klocka
+klockan
+klockans
+klockare
+klockaren
+klockarens
+klockarfar
+klockarn
+klockarsonen
+klockas
+klockkedjan
+klocklikt
+klockor
+klockorna
+klockornas
+klockors
+klockringning
+kloekornas
+klok
+kloka
+klokare
+klokast
+klokaste
+kloke
+klokhet
+klokheten
+klokt
+kloliknande
+klor
+klorna
+kloroform
+kloster
+klostergården
+klosterlik
+klot
+klotb
+klotrund
+
+klo
+kloak
+klock
+klock
+klockan
+klockan
+klock
+klock
+klock
+klockarf
+klockarn
+klockarson
+klock
+klockkedjan
+klocklik
+klock
+klock
+klock
+klockor
+klockringning
+kloek
+klok
+klok
+klok
+klok
+klok
+klok
+klok
+klok
+klokt
+klolikn
+klor
+klorn
+kloroform
+klost
+klostergård
+klosterlik
+klot
+klotb
+klotrund
+
+
+ +

The stemming algorithm

+ +

+The Swedish alphabet includes the following additional letters, +

+ +
+ ä   å   ö +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   ä   å   ö +
+ +

+R2 is not used: R1 is defined in the same way as in the +German stemmer. +(See the note on R1 and R2.) +

+ +

+Define a valid s-ending as one of +

+ +
+b   c   d   f   g   h   j   k   +l   m   n   o   p   r   t   v   +y +
+ +

+Do each of steps 1, 2 and 3. +

+ +

+Step 1: +

+ +
+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +
+
(a) + a   arna   erna   heterna   orna   ad   e   ade   + ande   arne   are   aste   en   anden   aren   heten   + ern   ar   er   heter   or   as   arnas   ernas   + ornas   es   ades   andes   ens   arens   hetens   erns +   at   andet   het   ast +
delete +
(b) + s +
delete if preceded by a valid s-ending +
+ (Of course the letter of the valid s-ending is + not necessarily in R1) +
+ +

+Step 2: +

+ +
+ Search for one of the following suffixes in R1, and if found + delete the last letter. +
+ dd   gd   nn   dt   gt   kt   tt +
+ (For example, friskt → frisk, fröknarnn → fröknarn)
+ +

+Step 3: +

+ +
+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +
+
lig   ig   els +
delete +
löst +
replace with lös +
fullt +
replace with full +
+
+ +

The same algorithm in Snowball

+ +
routines (
+           mark_regions
+           main_suffix
+           consonant_pair
+           other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a"   '{U+00E4}'
+stringdef ao   '{U+00E5}'
+stringdef o"   '{U+00F6}'
+
+define v 'aeiouy{a"}{ao}{o"}'
+
+define s_ending  'bcdfghjklmnoprtvy'
+
+define mark_regions as (
+
+    $p1 = limit
+    test ( hop 3 setmark x )
+    goto v gopast non-v  setmark p1
+    try ( $p1 < x  $p1 = x )
+)
+
+backwardmode (
+
+    define main_suffix as (
+        setlimit tomark p1 for ([substring])
+        among(
+
+            'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
+            'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
+            'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
+            'hetens' 'erns' 'at' 'andet' 'het' 'ast'
+                (delete)
+            's'
+                (s_ending delete)
+        )
+    )
+
+    define consonant_pair as setlimit tomark p1 for (
+        among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')
+        and ([next] delete)
+    )
+
+    define other_suffix as setlimit tomark p1 for (
+        [substring] among(
+            'lig' 'ig' 'els' (delete)
+            'l{o"}st'        (<-'l{o"}s')
+            'fullt'          (<-'full')
+        )
+    )
+)
+
+define stem as (
+
+    do mark_regions
+    backwards (
+        do main_suffix
+        do consonant_pair
+        do other_suffix
+    )
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/swedish/stemmer.tt b/algorithms/swedish/stemmer.tt new file mode 100644 index 0000000..aaf8c82 --- /dev/null +++ b/algorithms/swedish/stemmer.tt @@ -0,0 +1,107 @@ +[% header('Swedish stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([40, 'jakt', 'klo']) %] + +

The stemming algorithm

+ +

+The Swedish alphabet includes the following additional letters, +

+ +
+ ä   å   ö +
+ +

+The following letters are vowels: +

+ +
+ a   e   i   o   u   y   ä   å   ö +
+ +

+R2 is not used: R1 is defined in the same way as in the +German stemmer. +(See the note on R1 and R2.) +

+ +

+Define a valid s-ending as one of +

+ +
+b   c   d   f   g   h   j   k   +l   m   n   o   p   r   t   v   +y +
+ +

+Do each of steps 1, 2 and 3. +

+ +

+Step 1: +

+ +
+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +
+
(a) + a   arna   erna   heterna   orna   ad   e   ade   + ande   arne   are   aste   en   anden   aren   heten   + ern   ar   er   heter   or   as   arnas   ernas   + ornas   es   ades   andes   ens   arens   hetens   erns +   at   andet   het   ast +
delete +
(b) + s +
delete if preceded by a valid s-ending +
+ (Of course the letter of the valid s-ending is + not necessarily in R1) +
+ +

+Step 2: +

+ +
+ Search for one of the following suffixes in R1, and if found + delete the last letter. +
+ dd   gd   nn   dt   gt   kt   tt +
+ (For example, friskt → frisk, fröknarnn → fröknarn)
+ +

+Step 3: +

+ +
+ Search for the longest among the following suffixes in R1, and + perform the action indicated. +
+
lig   ig   els +
delete +
löst +
replace with lös +
fullt +
replace with full +
+
+ +

The same algorithm in Snowball

+ +[% highlight_file('swedish') %] + +[% footer %] diff --git a/algorithms/swedish/stop.txt b/algorithms/swedish/stop.txt new file mode 100644 index 0000000..493b76a --- /dev/null +++ b/algorithms/swedish/stop.txt @@ -0,0 +1,125 @@ + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | 
within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitt | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/algorithms/turkish/accompanying_paper.doc b/algorithms/turkish/accompanying_paper.doc new file mode 100644 index 0000000..f0b325a Binary files /dev/null and b/algorithms/turkish/accompanying_paper.doc differ diff --git a/algorithms/turkish/stemmer.html b/algorithms/turkish/stemmer.html new file mode 100644 index 0000000..3a736ef --- /dev/null +++ b/algorithms/turkish/stemmer.html @@ -0,0 +1,585 @@ + + + + + + + + + + Turkish stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Turkish stemming algorithm

+ + +

Links to resources

+ + + +

+The Turkish stemming algorithm was provided by Evren Kapusuz Cilden. It stems +only noun and nominal verb suffixes because noun stems are more important for +information retrieval, and only handling these simplifies the algorithm +significantly. +

+ +

+In her paper (linked above) Evren explains +

+ +
+

+The stemmer can be enhanced to stem all kinds of verb suffixes. In Turkish, +there are over fifty suffixes that can be affixed to verbs [2]. The +morphological structure of verb suffixes is more complicated than noun +suffixes. Despite this, one can use the methodology presented in this paper to +enhance the stemmer to find stems of all kinds of Turkish words. +

+
+ +

where [2] is a reference to the following paper:

+ +
+

+Gulsen Eryigit and Esref Adali. +An Affix Stripping Morphological Analyzer for Turkish +Proceedings of the IAESTED International +Conference +ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, Innsbruck, Austria. +

+
+ +

The algorithm in Snowball

+ +
/* Stemmer for Turkish
+	* author: Evren (Kapusuz) Çilden
+	* email: evren.kapusuz at gmail.com
+	* version: 1.0 (15.01.2007)
+
+
+	* stems nominal verb suffixes
+	* stems nominal inflections
+	* more than one syllable word check
+	* (y,n,s,U) context check
+	* vowel harmony check
+	* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
+
+	* The stemming algorithm is based on the paper "An Affix Stripping
+	* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
+	* Eşref Adalı (Proceedings of the IAESTED International Conference
+	* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
+	* Innsbruck, Austria
+
+	* Turkish is an agglutinative language and has a very rich morphological
+	* structure. In Turkish, you can form many different words from a single stem
+	* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
+	* "You had been the doctor of him". The stem of the word is "doktor" and it
+	* takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
+	* the append order of suffixes can be clearly described as FSMs.
+	* The paper referenced above defines some FSMs for right to left
+	* morphological analysis. I generated a method for constructing snowball
+	* expressions from right to left FSMs for stemming suffixes.
+*/
+
+routines (
+	append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
+	check_vowel_harmony	// tests vowel harmony for suffixes
+	is_reserved_word	// tests whether current string is a reserved word ('ad','soyad')
+	mark_cAsInA		// nominal verb suffix
+	mark_DA			// noun suffix
+	mark_DAn		// noun suffix
+	mark_DUr		// nominal verb suffix
+	mark_ki			// noun suffix
+	mark_lAr		// noun suffix, nominal verb suffix
+	mark_lArI		// noun suffix
+	mark_nA			// noun suffix
+	mark_ncA		// noun suffix
+	mark_ndA		// noun suffix
+	mark_ndAn		// noun suffix
+	mark_nU			// noun suffix
+	mark_nUn		// noun suffix
+	mark_nUz		// nominal verb suffix
+	mark_sU			// noun suffix
+	mark_sUn		// nominal verb suffix
+	mark_sUnUz		// nominal verb suffix
+	mark_possessives	// -(U)m,-(U)n,-(U)mUz,-(U)nUz,
+	mark_yA			// noun suffix
+	mark_ylA		// noun suffix
+	mark_yU			// noun suffix
+	mark_yUm		// nominal verb suffix
+	mark_yUz		// nominal verb suffix
+	mark_yDU		// nominal verb suffix
+	mark_yken		// nominal verb suffix
+	mark_ymUs_		// nominal verb suffix
+	mark_ysA		// nominal verb suffix
+
+	mark_suffix_with_optional_y_consonant
+	mark_suffix_with_optional_U_vowel
+	mark_suffix_with_optional_n_consonant
+	mark_suffix_with_optional_s_consonant
+
+	more_than_one_syllable_word
+
+	post_process_last_consonants
+	postlude
+
+	stem_nominal_verb_suffixes
+	stem_noun_suffixes
+	stem_suffix_chain_before_ki
+)
+
+stringescapes	{ }
+
+/* Special characters in Unicode Latin-1 and Latin Extended-A */
+stringdef cc	'{U+00E7}'	// LATIN SMALL LETTER C WITH CEDILLA
+stringdef g~	'{U+011F}'	// LATIN SMALL LETTER G WITH BREVE
+stringdef i'	'{U+0131}'	// LATIN SMALL LETTER I WITHOUT DOT
+stringdef o"	'{U+00F6}'	// LATIN SMALL LETTER O WITH DIAERESIS
+stringdef s,	'{U+015F}'	// LATIN SMALL LETTER S WITH CEDILLA
+stringdef u"	'{U+00FC}'	// LATIN SMALL LETTER U WITH DIAERESIS
+
+booleans	( continue_stemming_noun_suffixes )
+
+groupings	( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
+
+define vowel	'ae{i'}io{o"}u{u"}'
+define U	'{i'}iu{u"}'
+
+// the vowel grouping definitions below are used for checking vowel harmony
+define vowel1	'a{i'}ou'		// vowels that can end with suffixes containing 'a'
+define vowel2	'ei{o"}{u"}'		// vowels that can end with suffixes containing 'e'
+define vowel3	'a{i'}'			// vowels that can end with suffixes containing 'i''
+define vowel4	'ei'			// vowels that can end with suffixes containing 'i'
+define vowel5	'ou'			// vowels that can end with suffixes containing 'o' or 'u'
+define vowel6	'{o"}{u"}'		// vowels that can end with suffixes containing 'o"' or 'u"'
+
+externals	( stem )
+
+backwardmode (
+	// checks vowel harmony for possible suffixes,
+	// helps to detect whether the candidate for suffix applies to vowel harmony
+	// this rule is added to prevent over stemming
+	define check_vowel_harmony as (
+		test
+		(
+			(goto vowel)   // if there is a vowel
+			(
+				('a' goto vowel1) or
+				('e' goto vowel2) or
+				('{i'}' goto vowel3) or
+				('i' goto vowel4) or
+				('o' goto vowel5) or
+				('{o"}' goto vowel6) or
+				('u' goto vowel5) or
+				('{u"}' goto vowel6)
+			)
+		)
+	)
+
+	// if the last consonant before suffix is vowel and n then advance and delete
+	// if the last consonant before suffix is non vowel and n do nothing
+	// if the last consonant before suffix is not n then only delete the suffix
+	// assumption: slice beginning is set correctly
+	define mark_suffix_with_optional_n_consonant as (
+		('n' (test vowel))
+		or
+		((not(test 'n')) test(next vowel))
+
+	)
+
+	// if the last consonant before suffix is vowel and s then advance and delete
+	// if the last consonant before suffix is non vowel and s do nothing
+	// if the last consonant before suffix is not s then only delete the suffix
+	// assumption: slice beginning is set correctly
+	define mark_suffix_with_optional_s_consonant as (
+		('s' (test vowel))
+		or
+		((not(test 's')) test(next vowel))
+	)
+
+	// if the last consonant before suffix is vowel and y then advance and delete
+	// if the last consonant before suffix is non vowel and y do nothing
+	// if the last consonant before suffix is not y then only delete the suffix
+	// assumption: slice beginning is set correctly
+	define mark_suffix_with_optional_y_consonant as (
+		('y' (test vowel))
+		or
+		((not(test 'y')) test(next vowel))
+	)
+
+	define mark_suffix_with_optional_U_vowel as (
+		(U (test non-vowel))
+		or
+		((not(test U)) test(next non-vowel))
+
+	)
+
+	define mark_possessives as (
+		among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
+		       'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
+		(mark_suffix_with_optional_U_vowel)
+	)
+
+	define mark_sU as (
+		check_vowel_harmony
+		U
+		(mark_suffix_with_optional_s_consonant)
+	)
+
+	define mark_lArI as (
+		among ('leri' 'lar{i'}')
+	)
+
+	define mark_yU as (
+		check_vowel_harmony
+		U
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_nU as (
+		check_vowel_harmony
+		among ('n{i'}' 'ni' 'nu' 'n{u"}')
+	)
+
+	define mark_nUn as (
+		check_vowel_harmony
+		among ('{i'}n' 'in' 'un' '{u"}n')
+		(mark_suffix_with_optional_n_consonant)
+	)
+
+	define mark_yA as (
+		check_vowel_harmony
+		among('a' 'e')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_nA as (
+		check_vowel_harmony
+		among('na' 'ne')
+	)
+
+	define mark_DA as (
+		check_vowel_harmony
+		among('da' 'de' 'ta' 'te')
+	)
+
+	define mark_ndA as (
+		check_vowel_harmony
+		among('nda' 'nde')
+	)
+
+	define mark_DAn as (
+		check_vowel_harmony
+		among('dan' 'den' 'tan' 'ten')
+	)
+
+	define mark_ndAn as (
+		check_vowel_harmony
+		among('ndan' 'nden')
+	)
+
+	define mark_ylA as (
+		check_vowel_harmony
+		among('la' 'le')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_ki as (
+		'ki'
+	)
+
+	define mark_ncA as (
+		check_vowel_harmony
+		among('ca' 'ce')
+		(mark_suffix_with_optional_n_consonant)
+	)
+
+	define mark_yUm as (
+		check_vowel_harmony
+		among ('{i'}m' 'im' 'um' '{u"}m')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_sUn as (
+		check_vowel_harmony
+		among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
+	)
+
+	define mark_yUz as (
+		check_vowel_harmony
+		among ('{i'}z' 'iz' 'uz' '{u"}z')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_sUnUz as (
+		among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
+	)
+
+	define mark_lAr as (
+		check_vowel_harmony
+		among ('ler' 'lar')
+	)
+
+	define mark_nUz as (
+		check_vowel_harmony
+		among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
+	)
+
+	define mark_DUr as (
+		check_vowel_harmony
+		among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
+	)
+
+	define mark_cAsInA as (
+		among ('cas{i'}na' 'cesine')
+	)
+
+	define mark_yDU as (
+		check_vowel_harmony
+		among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
+			't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
+			't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
+			't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	// does not fully obey vowel harmony
+	define mark_ysA as (
+		among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_ymUs_ as (
+		check_vowel_harmony
+		among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}')
+		(mark_suffix_with_optional_y_consonant)
+	)
+
+	define mark_yken as (
+		'ken' (mark_suffix_with_optional_y_consonant)
+	)
+
+	define stem_nominal_verb_suffixes as (
+		[
+			set continue_stemming_noun_suffixes
+			(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
+			or
+			(mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
+			or
+			(
+				mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
+				unset continue_stemming_noun_suffixes
+			)
+			or
+			(mark_nUz (mark_yDU or mark_ysA))
+			or
+			((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
+			or
+			(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
+		]delete
+	)
+
+	// stems noun suffix chains ending with -ki
+	define stem_suffix_chain_before_ki as (
+		[
+			mark_ki
+			(
+				(mark_DA] delete try([
+					(mark_lAr] delete try(stem_suffix_chain_before_ki))
+					or
+					(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+
+				))
+				or
+				(mark_nUn] delete try([
+					(mark_lArI] delete)
+					or
+					([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+					or
+					(stem_suffix_chain_before_ki)
+				))
+				or
+				(mark_ndA (
+					(mark_lArI] delete)
+					or
+					((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
+					or
+					(stem_suffix_chain_before_ki)
+				))
+			)
+	)
+
+	define stem_noun_suffixes as (
+		([mark_lAr] delete try(stem_suffix_chain_before_ki))
+		or
+		([mark_ncA] delete
+			try(
+				([mark_lArI] delete)
+				or
+				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+				or
+				([mark_lAr] delete stem_suffix_chain_before_ki)
+			)
+		)
+		or
+		([(mark_ndA or mark_nA)
+			(
+				(mark_lArI] delete)
+				or
+				(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+				or
+				(stem_suffix_chain_before_ki)
+			)
+		)
+		or
+		([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
+		or
+		( [mark_DAn] delete try ([
+			(
+				(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+				or
+				(mark_lAr] delete try(stem_suffix_chain_before_ki))
+				or
+				(stem_suffix_chain_before_ki)
+			))
+		)
+		or
+		([mark_nUn or mark_ylA] delete
+			try(
+				([mark_lAr] delete stem_suffix_chain_before_ki)
+				or
+				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+				or
+				stem_suffix_chain_before_ki
+			)
+		)
+		or
+		([mark_lArI] delete)
+		or
+		(stem_suffix_chain_before_ki)
+		or
+		([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
+		or
+		([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+	)
+
+	define post_process_last_consonants as (
+		[substring] among (
+			'b' (<- 'p')
+			'c' (<- '{cc}')
+			'd' (<- 't')
+			'{g~}' (<- 'k')
+		)
+	)
+
+	// after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
+	// like in 'kedim' -> 'ked'
+	// Turkish words don't usually end with 'd' or 'g'
+	// some very well known words are ignored (like 'ad' 'soyad'
+	// appends U to stems ending with d or g, decides which vowel to add
+	// based on the last vowel in the stem
+	define append_U_to_stems_ending_with_d_or_g as (
+		test('d' or 'g')
+		(test((goto vowel) 'a' or '{i'}') <+ '{i'}')
+		or
+		(test((goto vowel) 'e' or 'i') <+ 'i')
+		or
+		(test((goto vowel) 'o' or 'u') <+ 'u')
+		or
+		(test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
+	)
+
+	define is_reserved_word as (
+		'ad' try 'soy' atlimit
+	)
+)
+
+// Tests if there are more than one syllables
+// In Turkish each vowel indicates a distinct syllable
+define more_than_one_syllable_word as (
+	test (loop 2 gopast vowel)
+)
+
+define postlude as (
+	backwards (
+		not(is_reserved_word)
+		do append_U_to_stems_ending_with_d_or_g
+		do post_process_last_consonants
+
+	)
+)
+
+define stem as (
+	(more_than_one_syllable_word)
+	(
+		backwards (
+			do stem_nominal_verb_suffixes
+			continue_stemming_noun_suffixes
+			do stem_noun_suffixes
+		)
+
+	postlude
+	)
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/turkish/stemmer.tt b/algorithms/turkish/stemmer.tt new file mode 100644 index 0000000..3a8c2a4 --- /dev/null +++ b/algorithms/turkish/stemmer.tt @@ -0,0 +1,47 @@ +[% header('Turkish stemming algorithm') %] + +

Links to resources

+ + + +

+The Turkish stemming algorithm was provided by Evren Kapusuz Cilden. It stems +only noun and nominal verb suffixes because noun stems are more important for +information retrieval, and only handling these simplifies the algorithm +significantly. +

+ +

+In her paper (linked above) Evren explains +

+ +
+

+The stemmer can be enhanced to stem all kinds of verb suffixes. In Turkish, +there are over fifty suffixes that can be affixed to verbs [2]. The +morphological structure of verb suffixes is more complicated than noun +suffixes. Despite this, one can use the methodology presented in this paper to +enhance the stemmer to find stems of all kinds of Turkish words. +

+
+ +

where [2] is a reference to the following paper:

+ +
+

+Gulsen Eryigit and Esref Adali. +An Affix Stripping Morphological Analyzer for Turkish +Proceedings of the IAESTED International +Conference +ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, Innsbruck, Austria. +

+
+ +

The algorithm in Snowball

+ +[% highlight_file('turkish') %] + +[% footer %] diff --git a/algorithms/yiddish/stemmer.html b/algorithms/yiddish/stemmer.html new file mode 100644 index 0000000..22ecb96 --- /dev/null +++ b/algorithms/yiddish/stemmer.html @@ -0,0 +1,818 @@ + + + + + + + + + + Yiddish stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Yiddish stemming algorithm

+ + +

Links to resources

+ + + +

+Here is a sample of Yiddish vocabulary, with the stemmed forms that will be generated by this algorithm: +

+ +
+ + + + + + + + + + + + + + + + + + +
word stem          word stem
+אַװעקבלאָנדזשען
+אַװעקבלאָנדזשענדיק
+אַװעקבלאָנדזשענדיקן
+אַװעקבלאָנדזשענדיקס
+אַװעקבלאָנדזשענדיקע
+אַװעקבלאָנדזשענדיקער
+אַװעקגײן
+אַװעקגײנדיק
+אַװעקגײנדיקן
+אַװעקגײנדיקס
+אַװעקגײנדיקע
+אַװעקגײנדיקער
+אַװעקגנבֿענען
+אַװעקגנבֿענענדיק
+אַװעקגנבֿענענדיקן
+אַװעקגנבֿענענדיקס
+אַװעקגנבֿענענדיקע
+אַװעקגנבֿענענדיקער
+אַװעקגעבלאָנדזשעט
+אַװעקגעבלאָנדזשעטן
+אַװעקגעבלאָנדזשעטס
+אַװעקגעבלאָנדזשעטע
+אַװעקגעבלאָנדזשעטער
+אַװעקגעבן
+אַװעקגעבנדיק
+אַװעקגעבנדיקן
+אַװעקגעבנדיקס
+אַװעקגעבנדיקע
+אַװעקגעבנדיקער
+אַװעקגעגאַנגען
+אַװעקגעגאַנגענס
+אַװעקגעגאַנגענע
+אַװעקגעגאַנגענעם
+אַװעקגעגאַנגענער
+אַװעקגעגנבֿעט
+
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגנבענ
+אװעקגנבענ
+אװעקגנבענ
+אװעקגנבענ
+אװעקגנבענ
+אװעקגנבענ
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקבלאנדזש
+אװעקגעב
+אװעקגעב
+אװעקגעב
+אװעקגעב
+אװעקגעב
+אװעקגעב
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגײ
+אװעקגנב
+
+אַבֿידות
+אַבסטראַקטסטער
+אַדורכבײַסנדיקער
+אַדורכגעביסן
+אַדורכגעשמועסט
+אַדורכפֿירנדיק
+אַװעקגעגאַנגען
+אַװעקגעגאַנגענעם
+אַװעקגענומענער
+אמתדיק
+אמתדיקן
+אמתדיקע
+אמתדיקער
+באַהאַלטן
+ביכער
+געאַכלט
+געאײַלט
+געאײַלן
+געבאָדענעם
+געבאָטענעם
+געשדכנטע
+עראָפּלאַנען
+פֿאַרגאַנגענהײט
+פֿאָרױסגעגאַנגענע
+קינדהײט
+װילן
+װילסט
+
+אבידה
+אבסטראקט
+אדורכבײס
+אדורכבײס
+אדורכשמוע
+אדורכפיר
+אװעקגײ
+אװעקגײ
+אװעקנעמ
+אמת
+אמת
+אמת
+אמת
+באהאל
+ביכ
+אכל
+אײל
+אײל
+באד
+באט
+שדכנ
+עראפלאנ
+פארגאנגענ
+פארױסגײ
+קינד
+װיל
+װיל
+
+
+ +

The stemming algorithm

+ +

Groupings

+

We set up the following groupings:

+ +
+
Niked
+
All niked used in Yiddish and Hebrew
+
AlefBeys
+
+ All actual letters in the Hebrew alphabet, including: +
    +
  • The alphabet itself: א ב ג ד ה ו ז ח ט י כ ל מ נ ס ע פ צ ק ר ש ת
  • +
  • Final consonants: ך ם ן ף ץ
  • +
  • Ligatures: װ ױ ײ
  • +
+
+
Vowel
+
א ו י ע ױ ײ
+
Consonant
+
AlefBeys - Vowel
+
+ +

Pre-processing

+
    +
  • We replace two ו, where the second one is not וּ, with װ.
  • +
  • We replace ו י, where the י is not a יִ, with ױ.
  • +
  • We replace two י, where the second one is not a יִ, with ײ.
  • +
  • We replace final forms (e.g. ץ) with their normal form (e.g. צ).
  • +
  • We remove all niked.
  • +
+ +

Marking regions

+

+ Only a single marker is used: P1. + To begin with, this is set at the end of the word. +

+ +
    +
  • If the word begins with גע (except for געלט and געבן) it is replaced with "GE" and the cursor is advanced.
  • +
  • + Next, if the word begins with any verbal prefix, the cursor is advanced past this prefix. + Prefixes include (niked added for clarity, not included in algorithm): +
  • +
      +
    • Free stressed: אַדורכ, דורכ, אַהינ, אַהער, אַװעק, מיט, אַנטקעגנ, אַקעגנ, אַנידער, אַראָפּ, אַרױס, אַרױפ, אַרומ, אַרײנ, אַרונטער, אַריבער, נאָכ, פאַרבײַ, אַהײמ, אַפיר, פאַרױס, פונאַנדער, צוזאַמענ, צונױפ, צוריק.
    • +
    • Stressed: אױס, אױפ, אומ, אונטער, איבער, אײֲנ, אָנ, אָפּ, בײַ, פאָר, צו.
    • +
    • Unstressed: אַנט, באַ, דער, צע
    • +
    +
  • If the verbal prefix is followed by גע (except for געבן), it is replaced with "GE" and the cursor is advanced (e.g. אַװעקגעגאַנגען).
  • +
  • If the verbal prefix is followed by צו (except for צוגן, צוקט or צוקן with nothing afterwards), it is replaced with "TSU" and the cursor is advanced (e.g. אַרומצוגײן).
  • +
+ +

We are now at the start of the main portion of the word (past any verbal prefix and past participle marker).

+ +
    +
  • The following valid Yiddish three-consonant sequences are skipped: שפר, שטר, שטש, דזש.
  • +
  • If there is a sequence of three consonants, the cursor is advanced past them, and P1 is marked.
  • +
  • Otherwise, the cursor is advanced to the first vowel, and then up to the first non-vowel, minus 1, and P1 is marked.
  • +
  • If P1 is not at least 3 letters beyond the main portion, it is advanced past the 3rd letter.
  • +
+ +

Backwards mode

+ +

Unless otherwise stated, all deletes ensure we are beyond P1.

+

In each pass, at the first level of bullets, the longest matching suffix always wins.

+ +

First pass:

+
    +
  • Delete plural/adjective endings: ער, ערס, ן, ס, ען, נס, ענערס, ענס, עס.
  • +
      +
    • Exceptions: יע is not deleted (e.g. אגיטאַציע), יעס becomes יע
    • +
    +
  • Delete plural/adjective endings: ענעם, ענער, ענע, ענס
  • +
      +
    • If preceded by an irregular past participle ending in ן, replace it with the stem, e.g. געגאַנגען becomes גײ.
    • +
    +
  • Delete the verb/past participle ending: ט.
  • +
      +
    • Because of the above, also delete noun/adjectives ending in טן, טע, טער, טס so that they stem identically to the equivalent noun with the ט.
    • +
    • Similarly for past participles: -tns, -tene, -tenem, -tener טנס, טענע, טענעם, טענער
    • +
    • If the ט was before P1, we try to perform the same action while leaving the ט in place
    • +
    • At the same time, if preceded by an irregular past participle ending in ט, replace it with the stem, e.g. געבראַכט becomes ברענג.
    • +
    +
  • Delete the past participle endings: עט, עטן, עטס, עטע, עטער +
  • Anything ending with גײן is transformed to גײ.
  • +
  • Anything ending in an irregular past participle is corrected to the stem.
  • +
  • Delete noun endings: ונג, הײט, קײט, יקײט, שאַפֿט.
  • +
  • Delete noun endings: יזם, יזמס.
  • +
  • Delete Hebraic plural ending: ים
  • +
  • Replace the Hebraic plural ending ות with ה.
  • +
  • Delete the diminutive endings: עלעך, עלע, לעך, עלעס, עלען.
  • +
  • Delete the noun endings: יסט, יסטן.
  • +
      +
    • Exceptions to the above: words ending in גיסט or שיסט.
    • +
    +
  • Delete the verb ending סטו.
  • +
  • Delete the superlative endings: סטער, סטע, סטן.
  • +
  • Delete the verb ending: סט.
  • +
+ +

Second pass - after the first pass, do the following to the remaining stem:

+
    +
  • Delete noun endings: ונג, הײט, קײט, יקײט, שאַפֿט.
  • +
  • Delete the diminutive ending ל if it follows a consonant.
  • +
+ +

Third pass - after the second pass, do the following to the remaining stem:

+
    +
  • Delete the adjective endings יק, יג, ניק, דיק, יש, for words like אָפהענגיקײט.
  • +
      +
    • Exceptions to the above: words ending in גליק or בליק.
    • +
    +
  • Delete the present participle ending: נדיק
  • +
  • + Delete the present participle ending ענדיק if it follows נג, נק, נ, מ, a consonant and ל or a vowel. + Otherwise, delete just the נדיק portion. +
  • +
+ +Finally, all remaining GE and TSU are deleted. + +

The same algorithm in Snowball

+ +
/* *******************************************
+ * Stemmer for Yiddish language in YIVO script
+ *
+ * Author: Assaf Urieli
+ * Emails: assaf.urieli at gmail.com
+ * Version: 0.1 (15.05.2020)
+ *
+ ********************************************* */
+
+routines (
+       prelude
+       mark_regions
+       R1
+       R1plus3
+       standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( vowel niked alefBeys consonant )
+
+stringescapes {}
+
+// AlefBeys
+stringdef Alef         '{U+05D0}'
+stringdef Beys         '{U+05D1}'
+stringdef Giml         '{U+05D2}'
+stringdef Dalet        '{U+05D3}'
+stringdef Hey          '{U+05D4}'
+stringdef Vov          '{U+05D5}'
+stringdef Zayen        '{U+05D6}'
+stringdef Khes         '{U+05D7}'
+stringdef Tes          '{U+05D8}'
+stringdef Yud          '{U+05D9}'
+stringdef LangerKhof   '{U+05DA}'
+stringdef Khof         '{U+05DB}'
+stringdef Lamed        '{U+05DC}'
+stringdef ShlosMem     '{U+05DD}'
+stringdef Mem          '{U+05DE}'
+stringdef LangerNun    '{U+05DF}'
+stringdef Nun          '{U+05E0}'
+stringdef Samekh       '{U+05E1}'
+stringdef Ayen         '{U+05E2}'
+stringdef LangerFey    '{U+05E3}'
+stringdef Fey          '{U+05E4}'
+stringdef LangerTsadek '{U+05E5}'
+stringdef Tsadek       '{U+05E6}'
+stringdef Kuf          '{U+05E7}'
+stringdef Reysh        '{U+05E8}'
+stringdef Shin         '{U+05E9}'
+stringdef Sof          '{U+05EA}'
+stringdef TsveyVovn    '{U+05F0}'
+stringdef VovYud       '{U+05F1}'
+stringdef TsveyYudn    '{U+05F2}'
+
+// Niked
+stringdef Shvo          '{U+05B0}'
+stringdef Khirik        '{U+05B4}'
+stringdef Tseyre        '{U+05B5}'
+stringdef Segl          '{U+05B6}'
+stringdef ReducedSegl   '{U+05B1}'
+stringdef Pasekh        '{U+05B7}'
+stringdef ReducedPasekh '{U+05B2}'
+stringdef Komets        '{U+05B8}'
+stringdef ReducedKomets '{U+05B3}'
+stringdef Rafe          '{U+05BF}'
+stringdef SinDot        '{U+05C2}'
+stringdef ShinDot       '{U+05C1}'
+stringdef Khoylm        '{U+05B9}'
+stringdef Melupm        '{U+05BC}'
+stringdef Kubuts        '{U+05BB}'
+
+// Groupings
+define niked       '{Shvo}{Khirik}{Tseyre}{Segl}{ReducedSegl}{Pasekh}{ReducedPasekh}{Komets}{ReducedKomets}{SinDot}{ShinDot}{Khoylm}{Melupm}{Kubuts}{Rafe}'
+define alefBeys    '{Alef}{Beys}{Giml}{Dalet}{Hey}{Vov}{Zayen}{Khes}{Tes}{Yud}{LangerKhof}{Khof}{Lamed}{ShlosMem}{Mem}{LangerNun}{Nun}{Samekh}{Ayen}{LangerFey}{Fey}{LangerTsadek}{Tsadek}{Kuf}{Reysh}{Shin}{Sof}{TsveyVovn}{VovYud}{TsveyYudn}'
+define vowel           '{Alef}{Vov}{Yud}{Ayen}{VovYud}{TsveyYudn}'
+define consonant   alefBeys - vowel
+
+define prelude as (
+  do (
+    repeat goto (
+      [substring] among (
+        '{Vov}{Vov}' ( not '{Melupm}' <- '{TsveyVovn}' )
+        '{Vov}{Yud}' ( not '{Khirik}' <- '{VovYud}' )
+        '{Yud}{Yud}' ( not '{Khirik}' <- '{TsveyYudn}' )
+        '{LangerKhof}' ( <- '{Khof}')
+        '{ShlosMem}' ( <- '{Mem}' )
+        '{LangerNun}' ( <- '{Nun}' )
+        '{LangerFey}' ( <- '{Fey}' )
+        '{LangerTsadek}' ( <- '{Tsadek}' )
+      )
+    )
+  )
+
+  do (repeat goto ( [niked] delete ))
+)
+
+define mark_regions as (
+  $p1 = limit
+
+  (
+    try (
+      // Replace past participle ge- at start of word
+      // Unless word starts with gelt- or gebn-
+      ['{Giml}{Ayen}']
+      not ('{Lamed}{Tes}' or '{Beys}{Nun}') <- 'GE'
+    )
+
+    try (
+      // skip verbal prefix
+      among(
+        // Free stressed: Adurkh-, Durkh-, Ahin-, Aher-, Avek-, Mit-, Antkegn-, Akegn-, Anider-, Arop-, Aroys-, Aroyf-, Arum-, Arayn-, Arunter-, Ariber-, Nokh-, Farbay-, Aheym-, Afir-, Faroys-, Funander-, Tsuzamen-, Tsunoyf-, Tsurik-
+        '{Alef}{Dalet}{Vov}{Reysh}{Khof}' '{Dalet}{Vov}{Reysh}{Khof}' '{Alef}{Hey}{Yud}{Nun}' '{Alef}{Hey}{Ayen}{Reysh}' '{Alef}{TsveyVovn}{Ayen}{Kuf}' '{Mem}{Yud}{Tes}' '{Alef}{Nun}{Tes}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Nun}{Yud}{Dalet}{Ayen}{Reysh}' '{Alef}{Reysh}{Alef}{Fey}' '{Alef}{Reysh}{VovYud}{Samekh}' '{Alef}{Reysh}{VovYud}{Fey}' '{Alef}{Reysh}{Vov}{Mem}' '{Alef}{Reysh}{TsveyYudn}{Nun}' '{Alef}{Reysh}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Reysh}{Yud}{Beys}{Ayen}{Reysh}' '{Nun}{Alef}{Khof}' '{Fey}{Alef}{Reysh}{Beys}{TsveyYudn}' '{Alef}{Hey}{TsveyYudn}{Mem}' '{Alef}{Fey}{Yud}{Reysh}' '{Fey}{Alef}{Reysh}{VovYud}{Samekh}' '{Fey}{Vov}{Nun}{Alef}{Nun}{Dalet}{Ayen}{Reysh}' '{Tsadek}{Vov}{Zayen}{Alef}{Mem}{Ayen}{Nun}' '{Tsadek}{Vov}{Nun}{VovYud}{Fey}' '{Tsadek}{Vov}{Reysh}{Yud}{Kuf}'
+
+        // Stressed: Oys-, Oyf-, Um-, Unter-, Iber-, Ayn-, On-, Op-, Bay-, For-, Tsu-.
+        '{Alef}{VovYud}{Samekh}' '{Alef}{VovYud}{Fey}' '{Alef}{Vov}{Mem}' '{Alef}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Yud}{Beys}{Ayen}{Reysh}' '{Alef}{TsveyYudn}{Nun}' '{Alef}{Nun}' '{Alef}{Fey}' '{Beys}{TsveyYudn}' '{Fey}{Alef}{Reysh}' '{Tsadek}{Vov}'
+
+        // Unstressed:  Ant-, Ba-, Der-, Tse-. Far- already covered by For-. Ge- comes later.
+        '{Alef}{Nun}{Tes}' '{Beys}{Alef}' '{Dalet}{Ayen}{Reysh}' '{Tsadek}{Ayen}'
+
+        // If verbal prefix followed by Tsu- or Ge-, replace it
+        (
+          // Don't mark the TSU- prefix inside verbs like "oys-tsugn"
+          test (('{Tsadek}{Vov}{Giml}{Nun}' or '{Tsadek}{Vov}{Kuf}{Tes}' or '{Tsadek}{Vov}{Kuf}{Nun}') atlimit)
+          or
+          // Don't mark the GE- prefix inside verbs like "avek-gebn"
+          test ('{Giml}{Ayen}{Beys}{Nun}')
+          or
+          ( ['{Giml}{Ayen}'] <- 'GE')
+          or
+          (['{Tsadek}{Vov}'] <- 'TSU')
+        )
+      )
+    )
+
+    test(hop 3 setmark x)
+
+    // We want to allow three-consonant Hebrew roots.
+    // To this end, we skip three-consonant combinations that exist in non-Hebraic Yiddish.
+    try (
+      among(
+        '{Shin}{Fey}{Reysh}' '{Shin}{Tes}{Reysh}' '{Shin}{Tes}{Shin}' '{Dalet}{Zayen}{Shin}'
+          ( true )
+      )
+    )
+
+    // Either 3 consonants or the first non-vowel after a vowel
+    (
+      not (consonant consonant consonant setmark p1)
+      goto vowel repeat vowel setmark p1
+    )
+    try($p1 < x  $p1 = x)  // at least 3 past the prefix
+  )
+
+)
+
+backwardmode (
+  define R1 as $p1 <= cursor
+  // Like R1, but also allows the cursor to be outside R1 by the width of Giml Yud Samekh
+  define R1plus3 as $p1 <= cursor + sizeof '{Giml}{Yud}{Samekh}'
+
+  define standard_suffix as (
+    do (
+      [substring] among(
+        // Plural/adjective endings: -er, -ers, -e, -n, -s, -en, -ns, -eners, -ens, -es
+         '{Ayen}{Reysh}{Samekh}' '{Ayen}{Nun}' '{Nun}{Samekh}' '{Ayen}{Nun}{Ayen}{Reysh}{Samekh}' '{Ayen}{Samekh}' '{Ayen}' '{Nun}' '{Samekh}' '{Ayen}{Mem}' '{Ayen}{Reysh}'
+          ( R1 delete )
+
+        // Exception: don't delete noun endings -ie, like "agitatsie"
+        '{Yud}{Ayen}'
+          ( true )
+
+        // -ies => ie
+        '{Yud}{Ayen}{Samekh}'
+          ( R1  <- '{Yud}{Ayen}'  )
+
+        // Plural/adjective endings: -enem, -ener, -ene, -ens
+        '{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}'
+          (R1 delete 
+            [substring] among (
+              // -gegangen => -gey
+              '{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}')
+              // -genumen => -nem
+              '{Nun}{Vov}{Mem}' (<- '{Nun}{Ayen}{Mem}')
+              // -gemiten => -mayd
+              '{Mem}{Yud}{Tes}' (<- '{Mem}{TsveyYudn}{Dalet}')
+              // -gebiten => -bayt
+              '{Beys}{Yud}{Tes}' (<- '{Beys}{TsveyYudn}{Tes}')
+              // -gebisen => -bays
+              '{Beys}{Yud}{Samekh}' (<- '{Beys}{TsveyYudn}{Samekh}')
+              // -gevizen => -vayz
+              '{TsveyVovn}{Yud}{Zayen}' (<- '{TsveyVovn}{TsveyYudn}{Zayen}')
+              // -getriben => -trayb
+              '{Tes}{Reysh}{Yud}{Beys}' (<- '{Tes}{Reysh}{TsveyYudn}{Beys}')
+              // -geliten => -layt
+              '{Lamed}{Yud}{Tes}' (<- '{Lamed}{TsveyYudn}{Tes}')
+              // -gekliben => -klayb
+              '{Kuf}{Lamed}{Yud}{Beys}' (<- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
+              // -geriben => -rayb
+              '{Reysh}{Yud}{Beys}' (<- '{Reysh}{TsveyYudn}{Beys}')
+              // -gerisen => -rays
+              '{Reysh}{Yud}{Samekh}' (<- '{Reysh}{TsveyYudn}{Samekh}')
+              // -geshvigen => -shvayg
+              '{Shin}{TsveyVovn}{Yud}{Giml}' (<- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
+              // -geshmisen => -shmays
+              '{Shin}{Mem}{Yud}{Samekh}' (<- '{Shin}{Mem}{TsveyYudn}{Samekh}')
+              // -geshniten => -shnayd
+              '{Shin}{Nun}{Yud}{Tes}' (<- '{Shin}{Nun}{TsveyYudn}{Dalet}')
+              // -geshriben => -shrayb
+              '{Shin}{Reysh}{Yud}{Beys}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}')
+              // -gebunden => -bind
+              '{Beys}{Vov}{Nun}{Dalet}' (<- '{Beys}{Yud}{Nun}{Dalet}')
+              // -gevuntshn => -vintsh
+              '{TsveyVovn}{Vov}{Tes}{Shin}' (<- '{TsveyVovn}{Yud}{Tes}{Shin}')
+              // -gezungen => -zing
+              '{Zayen}{Vov}{Nun}{Giml}' (<- '{Zayen}{Yud}{Nun}{Giml}')
+              // -getrunken => -trink
+              '{Tes}{Reysh}{Vov}{Nun}{Kuf}' (<- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
+              // -getsvungen => -tsving
+              '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}' (<- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
+              // -geshlungen => -shling
+              '{Shin}{Lamed}{Vov}{Nun}{Giml}' (<- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
+              // -geboygen => -beyg
+              '{Beys}{VovYud}{Giml}' (<- '{Beys}{TsveyYudn}{Giml}')
+              // -gehoyben => -heyb
+              '{Hey}{VovYud}{Beys}' (<- '{Hey}{TsveyYudn}{Beys}')
+              // -farloyren => -farlir
+              '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}' (<- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
+              // -shtanen => -shtey
+              '{Shin}{Tes}{Alef}{Nun}' (<- '{Shin}{Tes}{TsveyYudn}')
+              // -geshvoyrn => -shver
+              '{Shin}{TsveyVovn}{VovYud}{Reysh}' (<- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
+            )
+         )
+
+        // Verb/past participle ending: -t
+        '{Tes}'
+          ( R1 delete ) 
+
+        // As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate
+        // Similarly for past participles: -tns, -tene, -tenem, -tener
+        // If the Tes was before R1, we try to perform the same action while leaving the Tes in place
+        '{Tes}{Nun}' '{Tes}{Ayen}' '{Tes}{Ayen}{Reysh}' '{Tes}{Samekh}'
+        '{Tes}{Nun}{Samekh}' '{Tes}{Ayen}{Nun}{Ayen}' '{Tes}{Ayen}{Nun}{Ayen}{Mem}' '{Tes}{Ayen}{Nun}{Ayen}{Reysh}'
+          ( ((R1 delete) or ( <- '{Tes}'))
+            // -(ge)brakht => -breng
+            ['{Beys}{Reysh}{Alef}{Khof}' try '{Giml}{Ayen}'] <- '{Beys}{Reysh}{Ayen}{Nun}{Giml}'
+          )
+
+        // Past participles: -et, -etn, -ets, -ete, -eter
+        '{Ayen}{Tes}' '{Ayen}{Tes}{Nun}' '{Ayen}{Tes}{Samekh}' '{Ayen}{Tes}{Ayen}' '{Ayen}{Tes}{Ayen}{Reysh}'
+          ( R1 delete )
+
+        // -geyn shorted to -gey
+        '{Giml}{TsveyYudn}{Nun}'
+                    ( <- '{Giml}{TsveyYudn}')
+
+        // ##################### Long list of irregular past participles
+        // -(ge)gangen (shortened to -gangen after prefixes) => -gey
+        '{Giml}{Alef}{Nun}{Giml}{Ayen}{Nun}'
+                    ( <- '{Giml}{TsveyYudn}' )
+
+        // -(ge)numen (shortened to -numen after prefixes) => -nem
+        '{Nun}{Vov}{Mem}{Ayen}{Nun}'
+          (<- '{Nun}{Ayen}{Mem}' )
+
+        // -(ge)shribn (shortened to -shribn after prefixes) => -shrayb
+        '{Shin}{Reysh}{Yud}{Beys}{Nun}'
+          (<- '{Shin}{Reysh}{TsveyYudn}{Beys}' )
+
+        // -gemiten => -mayd
+        'GE{Mem}{Yud}{Tes}{Nun}' 
+          (<- '{Mem}{TsveyYudn}{Dalet}')
+
+        // -gebiten => -bayt
+        'GE{Beys}{Yud}{Tes}{Nun}' 
+          (<- '{Beys}{TsveyYudn}{Tes}')
+
+        // -gebisen => -bays
+        'GE{Beys}{Yud}{Samekh}{Nun}' 
+          ( <- '{Beys}{TsveyYudn}{Samekh}')
+
+        // -gevizen => -vayz
+        '{TsveyVovn}{Yud}{Zayen}{Nun}' 
+           ( <- '{TsveyVovn}{TsveyYudn}{Zayen}')
+
+        // -getriben => -trayb
+        '{Tes}{Reysh}{Yud}{Beys}{Nun}' 
+           ( <- '{Tes}{Reysh}{TsveyYudn}{Beys}')
+
+        // -geliten => -layt
+        'GE{Lamed}{Yud}{Tes}{Nun}' 
+           ( <- '{Lamed}{TsveyYudn}{Tes}')
+
+        // -gekliben => -klayb
+        '{Kuf}{Lamed}{Yud}{Beys}{Nun}' 
+           ( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
+
+        // -geriben => -rayb
+        '{Reysh}{Yud}{Beys}{Nun}' 
+           ( <- '{Reysh}{TsveyYudn}{Beys}')
+
+        // -gerisen => -rays
+        'GE{Reysh}{Yud}{Samekh}{Nun}' 
+           ( <- '{Reysh}{TsveyYudn}{Samekh}')
+
+        // -geshvigen => -shvayg
+        '{Shin}{TsveyVovn}{Yud}{Giml}{Nun}' 
+           ( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
+
+        // -geshmisen => -shmays
+        '{Shin}{Mem}{Yud}{Samekh}{Nun}' 
+           ( <- '{Shin}{Mem}{TsveyYudn}{Samekh}')
+
+        // -geshniten => -shnayd
+        '{Shin}{Nun}{Yud}{Tes}{Nun}' 
+           ( <- '{Shin}{Nun}{TsveyYudn}{Dalet}')
+
+        // -gebunden => -bind
+        '{Beys}{Vov}{Nun}{Dalet}{Nun}' 
+         ( <- '{Beys}{Yud}{Nun}{Dalet}')
+
+        // -gevuntshn => -vintsh
+        '{TsveyVovn}{Vov}{Tes}{Shin}{Nun}' 
+         ( <- '{TsveyVovn}{Yud}{Tes}{Shin}')
+
+        // -gezungen => -zing
+        '{Zayen}{Vov}{Nun}{Giml}{Nun}' 
+         ( <- '{Zayen}{Yud}{Nun}{Giml}')
+
+        // -getrunken => -trink
+        '{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}' 
+         ( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
+
+        // -getsvungen => -tsving
+        '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}' 
+         ( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
+
+        // -geshlungen => -shling
+        '{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}' 
+         ( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
+
+        // -geboygen => -beyg
+        '{Beys}{VovYud}{Giml}{Nun}' 
+         ( <- '{Beys}{TsveyYudn}{Giml}')
+
+        // -gehoyben => -heyb
+        '{Hey}{VovYud}{Beys}{Nun}' 
+         ( <- '{Hey}{TsveyYudn}{Beys}')
+
+        // -farloyren => -farlir
+        '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}' 
+         ( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
+
+        // -shtanen => -shtey
+        '{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}' 
+         ( <- '{Shin}{Tes}{TsveyYudn}')
+
+         // -geshvoyrn => -shver
+        '{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}' 
+         ( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
+
+        // -(ge)brakht (shortened to -brakht after prefixes) => -breng
+        '{Beys}{Reysh}{Alef}{Khof}{Tes}'
+          (<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' )
+
+        // ###### End of irregular past participles
+
+        // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft
+        '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}'
+          ( R1 delete )
+
+        // Noun endings: -izm, izmen
+        '{Yud}{Zayen}{Mem}' '{Yud}{Zayen}{Mem}{Ayen}{Nun}'
+          ( R1 delete )
+
+        // Plural ending: -im
+        '{Yud}{Mem}'
+          ( R1 delete )
+          
+        // Plural ending: -os (Hebraic), replace with -h
+        '{Vov}{Sof}'
+          ( R1  <- '{Hey}' )
+
+        // Diminutive endings: -elekh, -ele, -lekh, -eles, -elen
+        '{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}'
+           ( R1 delete )
+        
+        // Noun ending: -ist
+        '{Yud}{Samekh}{Tes}'
+          (
+            // Exceptions: -gist, -shist
+            ( ('{Giml}' or '{Shin}') try (R1plus3 <- '{Yud}{Samekh}') )
+            or
+            ( R1 delete )
+          )
+
+        // Noun ending: -istn
+        '{Yud}{Samekh}{Tes}{Nun}'
+          ( R1 delete )
+        
+        // Verb ending: -stu
+        '{Samekh}{Tes}{Vov}'
+          ( R1 delete )
+
+        // Superlative ending: -ster, -ste, -stn 
+        '{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' '{Samekh}{Tes}{Nun}' 
+          ( R1 delete )
+          
+        // Ambiguous verb ending: -st
+        '{Samekh}{Tes}'
+          ( R1 delete ) 
+      )
+    )
+
+    do (
+      [substring] among(
+        // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft
+        '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}'
+          ( R1 delete )
+
+        // Diminutive endings: -l
+        '{Lamed}'
+          ( R1 consonant delete )
+      )
+    )
+
+    do (
+      [substring] among(
+        // Adjective endings: -ig, -ik, -ish, -nik, -dik
+        '{Yud}{Giml}' '{Yud}{Kuf}' '{Yud}{Shin}' '{Nun}{Yud}{Kuf}' '{Dalet}{Yud}{Kuf}'
+          ( R1 delete )
+
+        // Exceptions to above: -blik, -glik
+        '{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}'
+          ( true )
+          
+        // Present participle endings: -ndik
+        '{Nun}{Dalet}{Yud}{Kuf}'
+           ( R1 delete )
+
+        // Present participle ending -endik: delete if after a -ng, -nk, -n, -m, consonant+l, or vowel.
+        // Otherwise, delete just the -ndik part.
+        '{Ayen}{Nun}{Dalet}{Yud}{Kuf}'
+           ( R1 delete )
+      )
+    )
+
+    do (repeat goto ( ['GE' or 'TSU'] delete ))
+  )
+)
+
+define stem as (
+  do prelude
+  do mark_regions
+  backwards
+    do standard_suffix
+)
+
+ + +
+
+
+ +
+ +
+ + + + diff --git a/algorithms/yiddish/stemmer.tt b/algorithms/yiddish/stemmer.tt new file mode 100644 index 0000000..136e9b0 --- /dev/null +++ b/algorithms/yiddish/stemmer.tt @@ -0,0 +1,202 @@ +[% header('Yiddish stemming algorithm') %] + +

Links to resources

+ + + +[% algorithm_vocab([ +'אַװעקבלאָנדזשען', +'אַװעקבלאָנדזשענדיק', +'אַװעקבלאָנדזשענדיקן', +'אַװעקבלאָנדזשענדיקס', +'אַװעקבלאָנדזשענדיקע', +'אַװעקבלאָנדזשענדיקער', +'אַװעקגײן', +'אַװעקגײנדיק', +'אַװעקגײנדיקן', +'אַװעקגײנדיקס', +'אַװעקגײנדיקע', +'אַװעקגײנדיקער', +'אַװעקגנבֿענען', +'אַװעקגנבֿענענדיק', +'אַװעקגנבֿענענדיקן', +'אַװעקגנבֿענענדיקס', +'אַװעקגנבֿענענדיקע', +'אַװעקגנבֿענענדיקער', +'אַװעקגעבלאָנדזשעט', +'אַװעקגעבלאָנדזשעטן', +'אַװעקגעבלאָנדזשעטס', +'אַװעקגעבלאָנדזשעטע', +'אַװעקגעבלאָנדזשעטער', +'אַװעקגעבן', +'אַװעקגעבנדיק', +'אַװעקגעבנדיקן', +'אַװעקגעבנדיקס', +'אַװעקגעבנדיקע', +'אַװעקגעבנדיקער', +'אַװעקגעגאַנגען', +'אַװעקגעגאַנגענס', +'אַװעקגעגאַנגענע', +'אַװעקגעגאַנגענעם', +'אַװעקגעגאַנגענער', +'אַװעקגעגנבֿעט', +], [ +'אַבֿידות', +'אַבסטראַקטסטער', +'אַדורכבײַסנדיקער', +'אַדורכגעביסן', +'אַדורכגעשמועסט', +'אַדורכפֿירנדיק', +'אַװעקגעגאַנגען', +'אַװעקגעגאַנגענעם', +'אַװעקגענומענער', +'אמתדיק', +'אמתדיקן', +'אמתדיקע', +'אמתדיקער', +'באַהאַלטן', +'ביכער', +'געאַכלט', +'געאײַלט', +'געאײַלן', +'געבאָדענעם', +'געבאָטענעם', +'געשדכנטע', +'עראָפּלאַנען', +'פֿאַרגאַנגענהײט', +'פֿאָרױסגעגאַנגענע', +'קינדהײט', +'װילן', +'װילסט', +]) %] + +

The stemming algorithm

+ +

Groupings

+

We set up the following groupings:

+ +
+
Niked
+
All niked used in Yiddish and Hebrew
+
AlefBeys
+
+ All actual letters in the Hebrew alphabet, including: +
    +
  • The alphabet itself: א ב ג ד ה ו ז ח ט י כ ל מ נ ס ע פ צ ק ר ש ת
  • +
  • Final consonants: ך ם ן ף ץ
  • +
  • Ligatures: װ ױ ײ
  • +
+
+
Vowel
+
א ו י ע ױ ײ
+
Consonant
+
AlefBeys - Vowel
+
+ +

Pre-processing

+ + +

Marking regions

+

+ Only a single marker is used: P1. + To begin with, this is set at the end of the word. +

+ + + +

We are now at the start of the main portion of the word (past any verbal prefix and past participle marker).

+ + + +

Backwards mode

+ +

Unless otherwise stated, all deletes ensure we are beyond P1.

+

In each pass, at the first level of bullets, the longest matching suffix always wins.

+ +

First pass:

+ + +

Second pass - after the first pass, do the following to the remaining stem:

+ + +

Third pass - after the second pass, do the following to the remaining stem:

+ + +Finally, all remaining GE and TSU are deleted. + +

The same algorithm in Snowball

+ +[% highlight_file('yiddish') %] + +[% footer %] diff --git a/codesets/guide.html b/codesets/guide.html new file mode 100644 index 0000000..b61c0d9 --- /dev/null +++ b/codesets/guide.html @@ -0,0 +1,160 @@ + + + + + + + + + + Character codes - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Character codes

+ + +

+Snowball (since version 2.0) supports specifying non-ASCII characters using +the standard Unicode notation U+XXXX where XXXX is a string of +hex digits. However, this doesn't make for very readable source code, so the +Snowball scripts on this site define more mnemonic representations of the +non-ASCII characters which they use - for example, the German stemmer includes +the lines +

+ +
    /* special characters */
+
+    stringdef a"   '{U+00E4}'
+    stringdef o"   '{U+00F6}'
+    stringdef u"   '{U+00FC}'
+    stringdef ss   '{U+00DF}'
+
+ + +

+(In Unicode, hex values E4, F6, FC and DF are the numeric values +of characters ä, ö, ü and ß respectively.) +

+ +

+Then the code which follows uses '{a"}' + + + when it wants +ä, etc. +

+ +

+Using literal Unicode characters in strings in the source file may work in some
+cases, but isn't really supported - the snowball compiler doesn't (currently at
+least) have the concept of "source character set", so at best you'll limit
+which programming languages your stemmer can be used with.

+ +

+If you wish to describe other Latin-alphabet based codesets for use in stemmers +we recommend using the following conventions: +

+ +
+
accent ASCII form example +
acute single quote  e' for é +
grave grave  a` for à +
umlaut double quote  u" for ü +
circumflex circumflex  i^ for î +
cedilla letter c  cc for ç +
tilde tilde  n~ for ñ +
ring letter o  ao for å +
line through solidus  o/ for ø + +
breve plus  a+ for ă +
double acute letter q  oq for ő +
comma below ,  t, for ț +
+ +

+And, should they ever arise, use  r  for left and right +hook (as in Polish), and  v  for hacek (as in Czech). +

+ +

+The ‘line-through’ accent covers a number of miscellaneous cases: the
+Scandinavian  o/, Icelandic  d/  and Polish  l/.

+ +

+Use  ae  and  ss  for æ ligature and the German +ß, with +upper case forms  AE  and  SS. Use  th  for Icelandic thorn. +

+ +

+We used to recommend , for cedilla, but we need a way to +represent comma-below for Romanian, so we've repurposed , +for that and now recommend c for cedilla instead. +

+ +

+If you're writing a new stemmer, see below for a file of suitable +stringdef lines you can cut and paste into your code. +

+ +

Links

+ + + +
+
+
+ +
+ +
+ + + + diff --git a/codesets/guide.tt b/codesets/guide.tt new file mode 100644 index 0000000..eb02f9f --- /dev/null +++ b/codesets/guide.tt @@ -0,0 +1,93 @@ +[% header('Character codes') %] + +

+Snowball (since version 2.0) supports specifying non-ASCII characters using +the standard Unicode notation U+XXXX where XXXX is a string of +hex digits. However, this doesn't make for very readable source code, so the +Snowball scripts on this site define more mnemonic representations of the +non-ASCII characters which they use - for example, the German stemmer includes +the lines +

+ +[% highlight(' + /* special characters */ + + stringdef a" ' _ "'{U+00E4}'" _ ' + stringdef o" ' _ "'{U+00F6}'" _ ' + stringdef u" ' _ "'{U+00FC}' + stringdef ss '{U+00DF}' +") %] + +

+(In Unicode, hex values E4, F6, FC and DF are the numeric values +of characters ä, ö, ü and ß respectively.) +

+ +

+Then the code which follows uses [% highlight_inline("'{a" _ '"' _ "}'") %] when it wants +ä, etc. +

+ +

+Using literal Unicode characters in strings in the source file may work in some
+cases, but isn't really supported - the snowball compiler doesn't (currently at
+least) have the concept of "source character set", so at best you'll limit
+which programming languages your stemmer can be used with.

+ +

+If you wish to describe other Latin-alphabet based codesets for use in stemmers +we recommend using the following conventions: +

+ +
+
accent ASCII form example +
acute single quote  e' for é +
grave grave  a` for à +
umlaut double quote  u" for ü +
circumflex circumflex  i^ for î +
cedilla letter c  cc for ç +
tilde tilde  n~ for ñ +
ring letter o  ao for å +
line through solidus  o/ for ø + +
breve plus  a+ for ă +
double acute letter q  oq for ő +
comma below ,  t, for ț +
+ +

+And, should they ever arise, use  r  for left and right +hook (as in Polish), and  v  for hacek (as in Czech). +

+ +

+The ‘line-through’ accent covers a number of miscellaneous cases: the
+Scandinavian  o/, Icelandic  d/  and Polish  l/.

+ +

+Use  ae  and  ss  for æ ligature and the German +ß, with +upper case forms  AE  and  SS. Use  th  for Icelandic thorn. +

+ +

+We used to recommend , for cedilla, but we need a way to +represent comma-below for Romanian, so we've repurposed , +for that and now recommend c for cedilla instead. +

+ +

+If you're writing a new stemmer, see below for a file of suitable +stringdef lines you can cut and paste into your code. +

+ +

Links

+ + + +[% footer %] diff --git a/codesets/latin-stringdef-list.txt b/codesets/latin-stringdef-list.txt new file mode 100644 index 0000000..87982e0 --- /dev/null +++ b/codesets/latin-stringdef-list.txt @@ -0,0 +1,76 @@ +/* stringdef list for accented Latin alphabet characters */ + +stringdef A` '{U+00C0}' +stringdef A' '{U+00C1}' +stringdef A^ '{U+00C2}' +stringdef A~ '{U+00C3}' +stringdef A" '{U+00C4}' +stringdef Ao '{U+00C5}' +stringdef AE '{U+00C6}' +stringdef C, '{U+00C7}' +stringdef E` '{U+00C8}' +stringdef E' '{U+00C9}' +stringdef E^ '{U+00CA}' +stringdef E" '{U+00CB}' +stringdef I` '{U+00CC}' +stringdef I' '{U+00CD}' +stringdef I^ '{U+00CE}' +stringdef I" '{U+00CF}' +stringdef D/ '{U+00D0}' +stringdef N~ '{U+00D1}' +stringdef O` '{U+00D2}' +stringdef O' '{U+00D3}' +stringdef O^ '{U+00D4}' +stringdef O~ '{U+00D5}' +stringdef O" '{U+00D6}' +// U+00D7 is "MULTIPLICATION SIGN" +stringdef O/ '{U+00D8}' +stringdef U` '{U+00D9}' +stringdef U' '{U+00DA}' +stringdef U^ '{U+00DB}' +stringdef U" '{U+00DC}' +stringdef Y' '{U+00DD}' +stringdef TH '{U+00DE}' +stringdef ss '{U+00DF}' +stringdef a` '{U+00E0}' +stringdef a' '{U+00E1}' +stringdef a^ '{U+00E2}' +stringdef a~ '{U+00E3}' +stringdef a" '{U+00E4}' +stringdef ao '{U+00E5}' +stringdef ae '{U+00E6}' +stringdef cc '{U+00E7}' // c cedilla +stringdef e` '{U+00E8}' +stringdef e' '{U+00E9}' +stringdef e^ '{U+00EA}' +stringdef e" '{U+00EB}' +stringdef i` '{U+00EC}' +stringdef i' '{U+00ED}' +stringdef i^ '{U+00EE}' +stringdef i" '{U+00EF}' +stringdef d/ '{U+00F0}' +stringdef n~ '{U+00F1}' +stringdef o` '{U+00F2}' +stringdef o' '{U+00F3}' +stringdef o^ '{U+00F4}' +stringdef o~ '{U+00F5}' +stringdef o" '{U+00F6}' +// U+00F7 is "DIVISION SIGN" +stringdef o/ '{U+00F8}' +stringdef u` '{U+00F9}' +stringdef u' '{U+00FA}' +stringdef u^ '{U+00FB}' +stringdef u" '{U+00FC}' +stringdef y' '{U+00FD}' +stringdef th '{U+00FE}' +stringdef y" '{U+00FF}' + +stringdef a+ '{U+0103}' // a breve +stringdef g~ '{U+011F}' // LATIN SMALL LETTER G 
WITH BREVE +stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT +stringdef oq '{U+0151}' // o-double acute +stringdef sc '{U+015F}' // s cedilla +stringdef tc '{U+0163}' // t cedilla +stringdef uq '{U+0171}' // u-double acute +stringdef s, '{U+0219}' // s comma +stringdef t, '{U+021B}' // t comma diff --git a/compiler/snowman.html b/compiler/snowman.html new file mode 100644 index 0000000..9a7f2c5 --- /dev/null +++ b/compiler/snowman.html @@ -0,0 +1,1768 @@ + + + + + + + + + + Snowball Manual - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Snowball Manual

+ + +

Links to resources

+ + + +

Snowball definition

+ +

+Snowball is a small string-handling language, and its name was chosen as a +tribute to SNOBOL (Farber 1964, Griswold 1968 — +see the references at the end of the +introduction), +with which it shares the +concept of string patterns delivering signals that are used to control the +flow of the program. +

+ +

1 Data types

+ +

+The basic data types handled by Snowball are strings of characters, signed +integers, and boolean truth values, or more simply strings, integers +and booleans. Snowball supports Unicode characters, which may be represented +as UTF-8, 8-bit characters, or 16-bit wide characters (depending on the +programming language code is being generated for - for C, all these options are +supported). +

+ +

2 Names

+ +

+A name in Snowball starts with an ASCII letter, followed by zero or more ASCII +letters, digits and underscores. A name can be of type string, +integer, boolean, routine, external or +grouping. All names must be declared. A declaration has the form +

+ +
+    Ts ( ... )
+
+ +

+where symbol  T  is one of  string,  integer  etc, and the region in +brackets contains a list of names separated by whitespace. For example, +

+ +
    integers ( p1 p2 )
+    booleans ( Y_found )
+
+    routines (
+       shortv
+       R1 R2
+       Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
+    )
+
+    externals ( stem )
+
+    groupings ( v v_WXY v_LSZ )
+
+ + +

+p1  and  p2  are integers,  Y_found  is boolean, and so on. Snowball is quite +strict about the declarations, so all the names go in the same name space, +no name may be declared twice, all used names must be declared, no two +routine definitions can have the same name, etc. Names declared and +subsequently not used are merely reported in a warning message.

+ +

+A name may not be one of the reserved words of Snowball. Additionally, names +for externals must be valid function/method names in the language being +generated in most cases, which generally means they can't be reserved words +in that language (e.g. externals (null) + + + will generate +invalid Java code containing a method public boolean null().) +For internal symbols we add a prefix to avoid this issue, but an external +has to provide an external interface. When generating C code, the +-eprefix option provides a potential solution to this problem. +

+ +

+Names in Snowball are case-sensitive, but external names which differ only in +case will cause a problem for languages with case-insensitive identifiers (such +as Pascal). This issue is avoided for internal symbols in such languages by +encoding case difference via an added prefix. +

+ +

+So for portability a little care is needed when choosing names for externals. +The convention when using Snowball to implement stemming algorithms is to have +a single external named stem, which should be safe. +

+ +

3 Literals

+ +

3.1 Integer Literals

+ +

+A literal integer is an ASCII digit sequence, and is always interpreted as +decimal. +

+ +

3.2 String Literals

+ +

+A literal string is written between single quotes, for example, +

+ +
    'aeiouy'
+
+ + +

+Two special insert characters for use in literal strings are defined by +the directive stringescapes AB + + +, for example, +

+ +
    stringescapes {}
+
+ + +

+Conventionally { and } are used as the insert +characters, and we would recommend following this convention unless you want to +use these as literal characters in your strings a lot. However, + A  and  B  can be any printing +characters, except that  A  can't be a single quote. +(If A  and  B are the same then + A  itself can never be escaped.) +

+ +

+A subsequent occurrence of the stringescapes directive redefines +the insert characters (but any string macros already defined with +stringdef remain defined). +

+ +

+Within insert characters, the following sequences are understood: +

+ +
    +
  • +User-defined string macros which can be specified using +stringdef. Macro  m  is defined in the +form  stringdef m 'S', where  'S'  is a +string, and  m  a sequence of one or more printing +characters. Thereafter,  {m}  inside a string causes + S  to be substituted in place of  m. +

  • + +
  • +New in Snowball 2.0: Unicode codepoints can be specified using the syntax +U+ followed by one or more hex digits - for example, +'{U+FFFD}' + + +. These are automatically handled +appropriately in all cases except if you want to generate C code to handle a +single byte character set other than ISO-8859-1. Such cases are handled by +defining string macros for the U+ codes in the character set, +after which the same Snowball source can be used. You can't mix use of +U+ codes defined as string macros and with their default +meanings in the same compilation. When U+ codes are defined +as string macros, snowball will upper case the characters after the ++ if there's no macro defined with the case as given. +

  • + +
  • +By default  {'}  will substitute  '  and +{{}  will substitute  {, although macros  '  and  {  may subsequently be +redefined. +

  • + +
  • +A further feature is that  {W}  inside +a string, where  W  is a +sequence of whitespace characters including one or more newlines, is +ignored. This enables long strings to be written over a number of lines. +

  • +
+ +

+For example, +

+ +
    stringescapes {}
+
+    /* Spanish diacritics */
+
+    stringdef a'   '{U+00E1}'  // a-acute
+    stringdef e'   '{U+00E9}'  // e-acute
+    stringdef i'   '{U+00ED}'  // i-acute
+    stringdef o'   '{U+00F3}'  // o-acute
+    stringdef u'   '{U+00FA}'  // u-acute
+    stringdef u"   '{U+00FC}'  // u-diaeresis
+    stringdef n~   '{U+00F1}'  // n-tilde
+
+    /* All the characters in Spanish used to represent vowels */
+
+    define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
+
+ + +

4 Routines

+ +

+A routine definition has the form +

+ +
    define R as C
+
+ + +

+where  R  is the routine name and  C  is a command, or bracketed group of +commands. So a routine is defined as a sequence of zero or more commands. +Snowball routines do not (at present) take parameters. For example, +

+ +
    define Step_5b as (      // this defines Step_5b
+        ['l']                // three commands here: [, 'l' and ]
+        R2 'l'               // two commands, R2 and 'l'
+        delete               // delete is one command
+    )
+
+    define R1 as $p1 <= cursor
+        /* R1 is defined as the single command "$p1 <= cursor" */
+
+ + +

+A routine is called simply by using its name,  R, as a command. +

+ +

5 Commands and signals

+ +

+The flow of control in Snowball is arranged by the implicit use of +signals, rather than the explicit use of constructs like the  if, +else,  break  of C. The scheme is designed for handling strings, but is +perhaps easier to introduce using integers. Suppose  x,  y,  z  ... are +integers. The command +

+ +
    $x = 1
+
+ + +

+sets  x  to 1. The command +

+ +
    $x > 0
+
+ + +

+tests if  x  is greater than zero. Both commands give a signal t or f, +(true or false), but while the second command gives t if  x  is greater +than zero and f otherwise, the first command always gives t. In Snowball, +every command gives a t or f signal. A sequence of commands can be turned +into a single command by putting them in a list surrounded by round +brackets: +

+ +
+    ( C1 C2 C3 ... Ci Ci+1 ... )
+
+ +

+When this is obeyed,  Ci+1  will be obeyed if each of the preceding  C1  ... +Ci  give t, but as soon as a  Ci  gives f, the subsequent  Ci+1 Ci+2  ... +are ignored, and the whole sequence gives signal f. If all the  Ci  give t, +however, the bracketed command sequence also gives t. So, +

+ +
    $x > 0  $y = 1
+
+ + +

+sets  y  to 1 if  x  is greater than zero. If  x  is less than or equal to zero +the two commands give f. +

+ +

+If  C1  and  C2  are commands, we can build up the larger commands, +

+ +
+
C1 or C2 +
— Do  C1. If it gives t ignore  C2, otherwise do  C2. The resulting + signal is t if and only if  C1  or  C2  gave t. +
C1 and C2 +
— Do  C1. If it gives f ignore  C2, otherwise do  C2. The resulting + signal is t if and only if  C1  and  C2  gave t. +
not C +
— Do  C. The resulting signal is t if  C  gave f, otherwise f. +
try C +
— Do  C. The resulting signal is t whatever the signal of  C. +
fail C +
— Do  C. The resulting signal is f whatever the signal of  C. +
+ +

+So for example, +

+ +
+
($x > 0 $y = 1) or ($y = 0) + + +
— sets  y  to 1 if  x  is greater than zero, otherwise to zero. + +
try( ($x > 0) and ($z > 0) $y = 1) + + +
— sets  y  to 1 if both  x  and  z  are greater than 0, and gives t. +
+ +

+This last example is the same as +

+ +
    try($x > 0  $z > 0  $y = 1)
+
+ + +

+so that  and  seems unnecessary here. But we will see that  and  has a +particular significance in string commands. +

+ +

+When a ‘monadic’ construct like  not,  try  or  fail  is not followed by a +round bracket, the construct applies to the shortest following valid command. +So for example +

+ +
    try not $x < 1 $z > 0
+
+ + +

+would mean +

+ +
    try ( not ( $x < 1 ) ) $z > 0
+
+ + +

+because $x < 1 + + + is the shortest valid command following  not, and then +not $x < 1  is the shortest valid command following  try. +

+ +

+The ‘dyadic’ constructs like  and  and  or  must sit in a bracketed list +of commands anyway, for example, +

+ +
+    ( C1 C2 and C3 C4 or C5 )
+
+ +

+And then in this case  C2  and  C3  are connected by the  and;  C4  and  C5  are +connected by the  or. So +

+ +
    $x > 0  not $y > 0 or not $z > 0  $t > 0
+
+ + +

+means +

+ +
    $x > 0  ((not ($y > 0)) or (not ($z > 0)))  $t > 0
+
+ + +

+and  and  or  are equally binding, and bind from left to right, +so  C1 or C2 and C3  means  (C1 or C2) and C3  etc. +

+ +

6 Integer commands

+ +

+There are two sorts of integer commands - assignments and comparisons. Both +are built from Arithmetic Expressions (AEs). +

+ +

Arithmetic Expressions (AEs)

+ +

+An AE consists of integer names, literal numbers and a few other things +connected by dyadic  +,  -,  *  and  /, and monadic  -, with the same +binding powers and semantics as C. As well as integer names and literal +numbers, the following may be used in AEs: +

+ +
+ +
minint  — the minimum negative number +
maxint  — the maximum positive number +
cursor  — the current value of the string cursor +
limit  — the current value of the string limit +
size  — the size of the string, in "slots" +
sizeof s  — the number of "slots" in  s, where  s  is the name of a string or (since Snowball 2.1) a literal string +
New in Snowball 2.0:
len  — the length of the string, in Unicode characters +
lenof s  — the number of Unicode characters in  s, where  s  is the name of a string or (since Snowball 2.1) a literal string +
+ +

+size + + + and sizeof + + + count in +"slots" - see the "Character representation" section below for details. +

+ +

+The cursor and limit concepts are explained below. +

+ +

Integer assignments

+ +

+An integer assignment has the form +

+ +
+    $X assign_op AE
+
+ +

+where  X  is an integer name and assign_op is one of the five assignments + =,  +=,  -=,  *=, or  /=. +The meanings are the same as in C. +

+ +

+For example, +

+ +
    $p1 = limit    // set p1 to the string limit
+
+ + +

+Integer assignments always give the signal t. +

+ +

Integer comparisons

+ +

+An integer comparison has the form +

+ +
+    $X rel_op AE
+
+ +

+or (since Snowball 2.0): +

+ +
+    $(AE1 rel_op AE2)
+
+ +

+where  X  is an integer name and rel_op is one of the six tests + ==,  !=,  >=, + >, <=, or  <. +Again, the meanings are the same as in C. +

+ +

+Examples of integer comparisons are, +

+ +
    $p1 <= cursor  // signal is f if the cursor is before position p1
+    $(len >= 3)    // signal is f unless the string is at least 3 characters long
+
+ + +

+The second form is more general since an integer name is a valid AE, but it +also allows comparisons which don't involve integer variables. Before support +for this was added the second example could only be achieved by assigning +len to a variable and then testing that variable instead. +

+ +

7 String commands

+ +

+If  s  is a string name, a string command has the form +

+ +
    $s C
+
+ + +

+where  C  is a command that operates on the string. Strings can be processed +left-to-right or right-to-left, but we will describe only the +left-to-right case for now. The string has a cursor, which we will +denote by c, and a limit point, or limit, which we will denote by l. c +advances towards l in the course of a string command, but the various +constructs  and,  or,  not  etc have side-effects which keep moving it +backwards. Initially c is at the start and l the end of the string. For +example, +

+ +
+        'a|n|i|m|a|d|v|e|r|s|i|o|n'
+        |                         |
+        c                         l
+
+ +

+c, and l, mark the boundaries between characters, and not +characters themselves. The characters between c and l will be denoted by +c:l. +

+ +

+If  C  gives t, the cursor c will have a new, well-defined value. But if  C +gives f, c is undefined. Its later value will in fact be determined by the +outer context of commands in which  C  came to be obeyed, not by  C  itself. +

+ +

+Here is a list of the commands that can be used to operate on strings. +

+ +

a) Setting a value

+ +
+
= S +
where  S  is the name of a string or a literal string. c:l is set equal + to  S, and l is adjusted to point to the end of the copied string. The + signal is t. For example, + +
        $x  = 'animadversion'    /* literal string */
+        $y = x                  /* string name */
+
+ + +
+ +

b) Basic tests

+ +
+
S +
here and below,  S  is the name of a string or a literal string. If c:l + begins with the substring  S, c is repositioned to the end of this + substring, and the signal is t. Otherwise the signal is f. For example, + +
        $x 'anim'   /* gives t, assuming the string is 'animadversion' */
+        $x ('anim' 'ad' 'vers')
+                    /* ditto */
+
+        $t = 'anim'
+        $x t        /* ditto */
+
+ + +
true,  false +
true  is a dummy command that generates signal t.  false  generates + signal f. They are sometimes useful for emphasis, + +
        define start_off as true       // nothing to do
+        define exception_list as false // put in among(...) list later
+
+ + +  true  is equivalent to  () +
C1 or C2 +
This is like the case for integers described above, but the extra + touch is that if  C1  gives f, c is set back to its old position after +  C1  has given f and before  C2  is tried, so that the test takes place on + the same point in the string. So we have + +
        $x ('anim'  /* signal t */
+            'ation' /* signal f */
+           ) or
+           ( 'an'   /* signal t - from the beginning */
+           )
+
+ + +
C1 and C2 +
And similarly c is set back to its old position after  C1  has given t + and before  C2  is tried. So, + +
        $x 'anim' and 'an'   /* signal t */
+        $x ('anim'  'an')    /* signal f, since 'an' and 'ad' mis-match */
+
+ + +
not C +
try C +
These are like the integer tests, with the added feature that c is set + back to its old position after an f signal is turned into t. So, + +
        $x (not 'animation' not 'immersion')
+            /* both tests are done at the start of the string */
+
+        $x (try 'animus' try 'an'
+            'imad')
+            /* - gives t */
+
+ + +
+
 try C  is equivalent to  C or true +
+
test C +
This does command  C  but without advancing c. Its signal is the same as + the signal of  C, but following signal t, c is set back to its old + value. +
+
 test C  is equivalent to  not not C +
 test C1 C2  is equivalent to  C1 and C2 +
+
fail C +
This does  C  and gives signal f. It is equivalent to  C false. Like +  false  it is useful, but only rarely. + +
do C +
This does  C, puts c back to its old value and gives signal t. It is + very useful as a way of suppressing the side effect of f signals and + cursor movement. +
+
 do C  is equivalent to  try test C +
or  test try C +
+
goto C +
c is moved right until obeying  C  gives t. But if c cannot be moved + right because it is at l the signal is f. c is set back to the position + it had before the last obeying of  C, so the effect is to leave c before + the pattern which matched against  C. + +
        $x goto 'ad'         /* positions c after 'anim' */
+        $x goto 'ax'         /* signal f */
+
+ + +
gopast C +
Like goto, but c is not set back, so the effect is to leave c after + the pattern which matched against  C. + +
        $x gopast 'ad'       /* positions c after 'animad' */
+
+ + +
repeat C +
C  is repeated until it gives f. When this happens c is set back to the + position it had before the last repetition of  C, and  repeat C  gives + signal t. For example, + +
        $x repeat gopast 'a' /* position c after the last 'a' */
+
+ + +
loop AE C +
This is like  C C ... C  written out AE times, where AE is an arithmetic + expression. For example, + +
        $x loop 2 gopast ('a' or 'e' or 'i' or 'o' or 'u')
+            /* position c after the second vowel */
+
+ + + The equivalent expression in C has the shape, + +
        {    int i;
+             int limit = AE;
+             for (i = 0; i < limit; i++) C;
+        }
+
+ +
atleast AE C +
This is equivalent to  loop AE C repeat C. + +
hop AE +
moves c AE character positions towards l, but if AE is negative, or if + there are fewer than AE characters between c and l the signal is f. + For example, + +
        test hop 3
+
+ + + tests that c:l contains more than 2 characters. + +
next +
is equivalent to  hop 1. +
+ +

c) Moving text about

+ +

+We have seen in (a) that  $x = y, when  x  and  y  are strings, sets c:l of  x +to the value of  y. Conversely +

+ +
        $x => y
+
+ + +

+sets the value of  y  to the c:l region of  x. +

+ +

+A more delicate mechanism for pushing text around is to define a substring, +or slice of the string being tested. Then +

+ +
+
[ + + +
sets the left-end of the slice to c, +
] + + +
sets the right-end of the slice to c, +
-> s + + +
moves the slice to variable  s, +
<- S + + +
replaces the slice with variable (or literal)  S. +
+ +

+For example +

+ +
        /* assume x holds 'animadversion' */
+        $x ( [          // '[animadversion' - [ set as indicated
+             loop 2 gopast 'a'
+                       // '[anima|dversion' - c is marked by '|'
+             ]         // '[anima]dversion' - ] set as indicated
+             -> y      // y is 'anima'
+           )
+
+ + +

+For any string, the slice ends should be assumed to be unset until they are +set with the two commands  [,  ]. Thereafter the slice ends will retain +the same values until altered. +

+ +
+
delete + + +
is equivalent to <- '' + + +
+ +

+This next example deletes all vowels in x, +

+ +
        define vowel ('a' or 'e' or 'i' or 'o' or 'u')
+        /* ... */
+        $ x repeat ( gopast([vowel]) delete )
+
+ + +

+As this example shows, the slice markers  [  and  ]  often appear as +pairs in a bracketed style, which makes for easy reading of the Snowball +scripts. But it must be remembered that, unusually in a computer +programming language, they are not true brackets. +

+ +

+More simply, text can be inserted at c. +

+ +
+
insert S + + +
insert variable or literal  S  before c, moving c to the right of the + insert.  <+  is a synonym for  insert. + +
attach S + + +
the same, but leave c at the left of the insert. +
+ +

d) Marks

+ +

+The cursor, c, (and the limit, l) can be thought of as having a numeric +value, from zero upwards: +

+ +
+         | a | n | i | m | a | d | v | e | r | s | i | o | n |
+         0   1   2   3   4   5   6   7   8   9  10  11  12  13
+
+ +

+It is these numeric values of c and l which are accessible through +cursor  and  limit  in arithmetic expressions. +

+ +
+
setmark X + + +
sets  X  to the current value of c, where  X  is an integer variable. + It's equivalent to: = cursor + + + +
tomark AE + + +
moves c forward to the position given by AE, + +
atmark AE + + +
tests if c is at position AE (t or f signal). + It's equivalent to: (cursor == AE) + + +
+ +

+In the case of tomark AE + + +, a similar fail condition occurs as with hop AE + + +. +If c is already beyond AE, or if position l is before position AE, the +signal is f. +

+ +

+In the stemming algorithms, certain regions of the word are defined by +setting marks, and later the failure condition of tomark + + + is used to see if +c is inside a particular region. +

+ +

+Two other commands put c at l, and test if c is at l, +

+ +
+
tolimit + + +
moves c forward to l (signal t always), + +
atlimit + + +
tests if c is at l (t or f signal). +
+ +

e) Changing l

+ +

+In this account of string commands we see c moving right towards l, while +l stays fixed at the end. In fact l can be reset to a new position between +c and its old position, to act as a shorter barrier for the movement of c. +

+ +
+
setlimit C1 for C2 +
C1  is obeyed, and if it gives f the signal from  setlimit + is f with no further action. +

+ +

+ Otherwise, the final value of c becomes the new + position of l. c is then set back to its old value before  C1  was + obeyed, and  C2  is obeyed. Finally l is set back to its old position, + and the signal of  C2  becomes the signal of  setlimit. +

+ +

+ So the signal is f if either  C1  or  C2  gives f, otherwise t. + For example, +

+ +
    $x ( setlimit goto 's'  // 'animadver}sion' new l as marked '}'
+         for                // below, '|' marks c after each goto
+         ( goto 'a' and     // '|animadver}sion'
+           goto 'e' and     // 'animadv|er}sion'
+           goto 'i' and     // 'an|imadver}sion'
+         )
+       )
+
+ + +

+ This checks that x has characters ‘a’, ‘e’ and ‘i’ before the first + ‘s’. +

+ +
+ +

f) Backward processing

+ +

+String commands have been described with c to the left of l and moving +right. But the process can be reversed. +

+ +
+
backwards C + + +
c and l are swapped over, and c moves left towards l.  C  is obeyed, the + signal given by  C  becomes the signal of  backwards C, and c and l are + swapped back to their old values (except that l may have been adjusted + because of deletions and insertions).  C  cannot contain another + backwards + + + command. + +
reverse C + + +
A similar idea, but here c simply moves left instead of moving right, + with the beginning of the string as the limit, l.  C  can contain other + reverse + + + commands, but it cannot contain commands to do deletions or + insertions — it must be used for testing only. (Without this + restriction Snowball's semantics would become very untidy.) +
+ +

+Forward and backward processing are entirely symmetric, except that forward +processing is the default direction, and literal strings are always +written out forwards, even when they are being tested backwards. So the +following are equivalent, +

+ +
    $x (
+        'ani' 'mad' 'version' atlimit
+    )
+
+    $x backwards (
+        'version' 'mad' 'ani' atlimit
+    )
+
+ + +

+If a routine is defined for backwards mode processing, it must be included +inside a  backwardmode(...)  declaration. +

+ +

g) substring and among

+ +

+The use of substring + + + and among + + + is central to the implementation of the +stemming algorithms. It is like a case switch on strings. In its simpler +form, +

+ +
+        substring among('S1' 'S2' 'S3' ...)
+
+ +

+searches for the longest matching substring  'S1'  or  'S2'  or  'S3'  ... from +position c. (The  'Si'  must all be different.) So this has the same +semantics as +

+ +
+        ('S1' or 'S2' or 'S3' ...)
+
+ +

+— so long as the  'Si'  are written out in decreasing order of length. +

+ +

+substring  may be omitted, in which case it is attached to its following +among, so +

+ +
    among(/*...*/)
+
+ + +

+without a preceding substring + + + is equivalent to +

+ +
    (substring among(/*...*/))
+
+ + +

+substring + + + may also be detached from its among + + +, although it must +precede it textually in the same routine in which the among + + + appears. +The more general form of substring /* ... */ among + + + is, +

+ +
+    substring
+    C
+    among( 'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+Obeying  substring  searches for a longest match among the  'Sij'. The +signal from  substring  is t if a match is found, otherwise f. +Any commands C between the substring and among will be run after this +search and only if the search finds a match (it would be equivalent to remove C and replace each +Ci with C Ci). When the +among  comes to be obeyed, the  Ci  corresponding to the matched  'Sij'  is +obeyed, and its signal becomes the signal of the  among  command. +

+ +

+substring/among  pairs must match up textually inside each routine +definition. But there is no problem with an  among  containing other +substring/among  pairs, and  substring  is optional before  among  anyway. +The essential constraint is that two  substrings must be separated by an +among, and each  substring  must be followed by an  among. +

+ +

+The effect of obeying  among  when the preceding  substring  is not obeyed +is undefined. This would happen for example here, +

+ +
    try($x != 617 substring)
+    among(...) // 'substring' is bypassed in the exceptional case where x == 617
+
+ + +

+The significance of separating the  substring  from the  among  is to allow +them to work in different contexts. For example, +

+ +
+    setlimit tomark L for substring
+
+    among( 'S11' 'S12' ... (C1)
+           ...
+
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+Here the test for the longest  'Sij'  is constrained to the region between c +and the mark point given by integer  L. But the commands  Ci  operate outside +this limit. Another example is +

+ +
+    reverse substring
+
+    among( 'S11' 'S12' ... (C1)
+           ...
+
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+The substring test is in the opposite direction in the string to the +direction of the commands  Ci. +

+ +

+The last  (Cn)  may be omitted, in which case  (true)  is assumed. +

+ +

+Each string  'Sij'  may be optionally followed by a +routine name, +

+ +
+    among(
+           'S11' R11 'S12' R12 ... (C1)
+           'S21' R21 'S22' R22 ... (C2)
+           ...
+           'Sn1' Rn1 'Sn2' Rn1 ... (Cn)
+         )
+
+ +

+If a routine name is not specified, it is equivalent +to a routine which simply returns signal t, +

+ +
    define null as true
+
+ + +

+— so we can imagine each  'Sij'  having its associated routine +Rij. Then obeying the  among  causes a search for the longest +'Sij'  whose corresponding routine +Rij  gives t. +

+ +

+The routines Rij  should be written without any +side-effects, other than the inevitable cursor movement. (c is in +any case set back to its old value following a call of +Rij.) +

+ +

8 Booleans

+ +

+set B + + + and unset B + + + set  B  to true and false respectively, where  B  is a +boolean name. B + + + as a command gives a signal t if it is set true, f +otherwise. For example, +

+ +
    booleans ( Y_found )   // declare the boolean
+
+    /* ... */
+
+    unset Y_found          // unset it
+    do ( ['y'] <-'Y' set Y_found )
+       /* if c:l begins 'y' replace it by 'Y' and set Y_found */
+
+    do repeat(goto (v ['y']) <-'Y' set Y_found)
+       /* repeatedly move down the string looking for v 'y' and
+          replacing 'y' with 'Y'. Whenever the replacement takes
+          place set Y_found. v is a test for a vowel, defined as
+          a grouping (see below). */
+
+
+    /* Y_found means there are some letters Y in the string.
+       Later we can use this to trigger a conversion back to
+       lower case y. */
+
+    /* ... */
+
+    do (Y_found repeat(goto (['Y']) <- 'y')
+
+ + +

9 Groupings

+ +

+A grouping brings characters together and enables them to be looked for +with a single test. +

+ +

+If  G  is declared as a grouping, it can be defined by +

+ +
+    define G G1 op G2 op G3 ...
+
+ +

+where op is  +  or  -, and  G1,  G2,  G3  are literal strings, or groupings that +have already been defined. (There can be zero or more of these additional +op components). For example, +

+ +
    define capital_letter  'ABDEFGHIJKLMNOPQRSTUVWXYZ'
+    define small_letter    'abdefghijklmnopqrstuvwxyz'
+    define letter          capital_letter + small_letter
+    define vowel           'aeiou' + 'AEIOU'
+    define consonant       letter - vowel
+    define digit           '0123456789'
+    define alphanumeric    letter + digit
+
+ + +

+Once  G  is defined, it can be used as a command, and is equivalent to a test +

+ +
+    'ch1' or 'ch2' or ...
+
+ +

+where  ch1,  ch2  ... list all the characters in the grouping. +

+ +

+non G + + + is the converse test, and matches any character except the +characters of  G. Note that non G + + + is not the same as not G + + +, in fact +

+ +

+non G + + + is equivalent to (not G next) + + +

+ +

+non + + + may be optionally followed by hyphen, for example: +

+ +
    non-vowel
+    non-digit
+
+ + +

+Bear in mind that non-vowel + + + doesn't only match a +consonant - it'll match any character which isn't in the vowel +grouping. Failing to consider this has led to bugs in stemming algorithms - +for example, here we intended to undouble a consonant: +

+ +
    [non-vowel] -> ch
+    ch
+    delete
+
+ + +

+The problem with this code is it will also mangle numbers with repeated digits, +for example 1900 would become 190. A good rule of +thumb here seems to be to use an inclusive grouping check if the code goes on +to delete the character matched: +

+ +
    [consonant] -> ch
+    ch
+    delete
+
+ + +

10 A Snowball program

+ +

+A complete program consists of a sequence of declarations followed by a +sequence of definitions of groupings and routines. Routines which are +implicitly defined as operating on c:l from right to left must be included +in a  backwardmode(...)  declaration. +

+ +

+A Snowball program is called up via a simple +API +through its defined externals. For example, +

+ +
    externals ( stem1 stem2 )
+    /* ... */
+    define stem1 as ( /* stem1 commands */ )
+    define stem2 as ( /* stem2 commands */ )
+
+ + +

+The API also allows a current string to be defined, and this becomes the +c:l string for the external routine to work on. Its final value is the +result handed back through the API. +

+ +

+The strings, integers and booleans are accessible from any point in the +program, and exist throughout the running of the Snowball program. They are +therefore like static declarations in C. +

+ +

11 Comments, and other whitespace fillers

+ +

+At a deeper level, a program is a sequence of tokens, interspersed with +whitespace. Names, reserved words, literal numbers and strings are all +tokens. Various symbols, made up of non-alphanumerics, are also tokens. +

+ +

+A name, reserved word or number is terminated by the first character that +cannot form part of it. A symbol is recognised as the longest sequence of +characters that forms a valid symbol. So  +=-  is two symbols,  +=  and +-, because  +=  is a valid symbol in the language while  +=-  is not. +Whitespace separates tokens but is otherwise ignored. This of course is +like C. +

+ +

+Occasionally a newer version of Snowball may add a new token. So as not to +break existing programs, any such tokens declared as a name (via +integers + + +, routines + + +, etc) +will lose their token status for the rest of the program. This applies +to the tokens +len + + +and +lenof + + +. +

+ +

+Anywhere that whitespace can occur, there may also occur: +

+ +

+(a) Comments, in the usual multi-line /* .... */ + + + or single line +// ... + + + format. +

+ +

+(b) Get directives. These are like  #include  commands in C, and have the form +get 'S' + + +, where  'S'  is a literal string. For example, +

+ +
    get '/home/martin/snowball/main-hdr' // include the file contents
+
+ + +

+(c) stringescapes XY + + + where  X  and  Y  are any two printing characters. +

+ +

+(d) stringdef m 'S' + + + where  m  is a sequence of characters not including +whitespace and terminated with whitespace, and  'S'  is a literal string. +

+ +

12 Character representation

+ +

+In this description of Snowball, it is assumed that strings are composed of +characters, and that characters can be defined numerically, but the numeric range +of these characters is not defined. As implemented, three different schemes +are supported. Characters can either be (a) bytes in the range 0 to 255, +as in traditional C strings, or (b) byte pairs in the range 0 to 65535, +as in Java strings, or (c) UTF-8 encoded byte sequences in the range 0 +to 65535, so that a character may occupy 1, 2 or 3 bytes. +

+ +

+For case (c), we need to make a slight separation of the concept of +characters into symbols, the units of text being represented, and +slots, the units of space into which they map. (So in case (a), all +slots are one byte; in case (b) all slots are two bytes.) +c and l have numeric values that can be used in AEs (arithmetic +expressions). These values count the number of slots. Similarly +setmark,  tomark  and  atmark  are remembering and then using slot +counts.  size  and  sizeof  measure string size +in slots, not symbols. However,  hop N  moves c over  N  symbols, +not  N  slots, and  next  is equivalent to  hop 1. +

+ +

+Snowball 2.0 adds len and lenof, which measure string length in symbols +(so they're the same as size and sizeof in cases (a) and (b), but +different in case (c)). +

+ +

+So long as these simple distinctions are recognised, the same Snowball +script can be compiled to work with any of the three encoding schemes. +

+ +

13 Legacy Features

+ +

13.1 hex and decimal

+ +

+This section documents features of Snowball for which there's a strongly +preferred alternative. They're still supported for compatibility with +existing code which uses them, but you shouldn't use them in new code. +We document them here so that their meaning in existing code can be +understood, and especially to aid updating to the preferred alternatives. +

+ +

+In a  stringdef , the string may be preceded by the word  hex, +or the word  decimal. This was how non-ASCII characters +were specified before support for specifying Unicode codepoints using the +U+ notation was added. +

+ +

+hex and decimal mean that the contents of the string +are interpreted as characters values written out in hexadecimal, or decimal, +notation. The characters should be separated by spaces. For example, +

+ +
    hex 'DA'        /* is character hex DA */
+    hex 'D A'       /* is the two characters, hex D and A (carriage
+                       return, and line feed) */
+    decimal '10'    /* character 10 (line feed) */
+    decimal '13 10' /* characters 13 and 10 (carriage return, and
+                       line feed) */
+
+ + +

+The following forms are equivalent, +

+ +
    hex 'd a'      /* lower case also allowed */
+    hex '0D 000A'  /* leading zeroes ignored */
+    hex ' D  A  '  /* extra spacing is harmless */
+
+ + +

+The interpretation of the values is as Unicode codepoints if command +line option -utf8 or -widechars is specified, and as +character values in an unspecified single byte character set otherwise. For +ASCII and ISO-8859-1 the character values match Unicode codepoints, but to +handle other single byte character sets (e.g. ISO-8859-2 or KOI8-R) you needed +a special version of a Snowball source with different character values +specified via stringdef. The U+ notation allows +you to use a single Snowball source in this situation. +

+ +

13.2 among starter command

+ +

+The among command supports a "starter" command, C +in this example: +

+ +
+    among( (C)
+           'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+This is equivalent to adding C at the start of each +Ci: +

+ +
+    among( 'S11' 'S12' ... (C C1)
+           'S21' 'S22' ... (C C2)
+           ...
+           'Sn1' 'Sn2' ... (C Cn)
+         )
+
+ +

+However, both are equivalent to: +

+ +
+    substring C
+    among( 'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+This requires an explicit substring but seems clearer so +we recommend using this in new code and have designated the use of a starter as +a legacy feature. +

+ +

+A starter is also allowed with an explicit substring, for example: +

+ +
+    substring
+    Cs
+    among( (Ca)
+           'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+is equivalent to: +

+ +
+    substring
+    Cs
+    Ca
+    among( 'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

Snowball syntax

+ +
+

+In the grammar which follows, ||  is used for alternatives, + [X]  means that X is +optional, and  [X]*  means that X is repeated zero or more +times. meta-symbols are defined on the left.  <char>  means any +character. +

+ +

+The definition of  literal string  does not allow for the escaping +conventions established by the  stringescapes  directive. The command +?  is a debugging aid. +

+ +
+<letter>        ::= a || b || ... || z || A || B || ... || Z
+<digit>         ::= 0 || 1 || ... || 9
+<name>          ::= <letter> [ <letter> || <digit> || _ ]*
+<s_name>        ::= <name>
+<i_name>        ::= <name>
+<b_name>        ::= <name>
+<r_name>        ::= <name>
+<g_name>        ::= <name>
+<literal string>::= '[<char>]*'
+<number>        ::= <digit> [ <digit> ]*
+
+S               ::= <s_name> || <literal string>
+G               ::= <g_name> || <literal string>
+
+<declaration>   ::= strings ( [<s_name>]* ) ||
+                    integers ( [<i_name>]* ) ||
+                    booleans ( [<b_name>]* ) ||
+                    routines ( [<r_name>]* ) ||
+                    externals ( [<r_name>]* ) ||
+                    groupings ( [<g_name>]* )
+
+<r_definition>  ::= define <r_name> as C
+<plus_or_minus> ::= + || -
+<g_definition>  ::= define <g_name> G [ <plus_or_minus> G ]*
+
+AE              ::= (AE) ||
+                    AE + AE || AE - AE || AE * AE || AE / AE || - AE ||
+                    maxint || minint || cursor || limit ||
+                    size || sizeof S ||
+                    len || lenof S ||
+                    <i_name> || <number>
+
+<i_assign>      ::= $ <i_name> = AE ||
+                    $ <i_name> += AE || $ <i_name> -= AE ||
+                    $ <i_name> *= AE || $ <i_name> /= AE
+
+<i_test_op>     ::= == || != || > || >= || < || <=
+
+<i_test>        ::= $ ( AE <i_test_op> AE ) ||
+                    $ <i_name> <i_test_op> AE
+
+<s_command>     ::= $ <s_name> C
+
+C               ::= ( [C]* ) ||
+                    <i_assign> || <i_test> || <s_command> || C or C || C and C ||
+                    not C || test C || try C || do C || fail C ||
+                    goto C || gopast C || repeat C || loop AE C ||
+                    atleast AE C || S || = S || insert S || attach S ||
+                    <- S || delete ||  hop AE || next ||
+                    => <s_name> || [ || ] || -> <s_name> ||
+                    setmark <i_name> || tomark AE || atmark AE ||
+                    tolimit || atlimit || setlimit C for C ||
+                    backwards C || reverse C || substring ||
+                    among ( [<literal string> [<r_name>] || (C)]* ) ||
+                    set <b_name> || unset <b_name> || <b_name> ||
+                    <r_name> || <g_name> || non [-] <g_name> ||
+                    true || false || ?
+
+P              ::=  [P]* || <declaration> ||
+                    <r_definition> || <g_definition> ||
+                    backwardmode ( P )
+
+<program>      ::=  P
+
+
+
+synonyms:      <+ for insert
+
+ +
+
+
+ +
+ +
+ + + + diff --git a/compiler/snowman.tt b/compiler/snowman.tt new file mode 100644 index 0000000..442af83 --- /dev/null +++ b/compiler/snowman.tt @@ -0,0 +1,1558 @@ +[% header('Snowball Manual') %] + +

Links to resources

+ + + +

Snowball definition

+ +

+Snowball is a small string-handling language, and its name was chosen as a +tribute to SNOBOL (Farber 1964, Griswold 1968 — +see the references at the end of the +introduction), +with which it shares the +concept of string patterns delivering signals that are used to control the +flow of the program. +

+ +

1 Data types

+ +

+The basic data types handled by Snowball are strings of characters, signed +integers, and boolean truth values, or more simply strings, integers +and booleans. Snowball supports Unicode characters, which may be represented +as UTF-8, 8-bit characters, or 16-bit wide characters (depending on the +programming language code is being generated for - for C, all these options are +supported). +

+ +

2 Names

+ +

+A name in Snowball starts with an ASCII letter, followed by zero or more ASCII +letters, digits and underscores. A name can be of type string, +integer, boolean, routine, external or +grouping. All names must be declared. A declaration has the form +

+ +
+    Ts ( ... )
+
+ +

+where symbol  T  is one of  string,  integer  etc, and the region in +brackets contains a list of names separated by whitespace. For example, +

+ +[% highlight(" + integers ( p1 p2 ) + booleans ( Y_found ) + + routines ( + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b + ) + + externals ( stem ) + + groupings ( v v_WXY v_LSZ ) +") %] + +

+p1  and  p2  are integers,  Y_found  is boolean, and so on. Snowball is quite +strict about the declarations, so all the names go in the same name space, +no name may be declared twice, all used names must be declared, no two +routine definitions can have the same name, etc. Names declared and +subsequently not used are merely reported in a warning message.

+ +

+A name may not be one of the reserved words of Snowball. Additionally, names +for externals must be valid function/method names in the language being +generated in most cases, which generally means they can't be reserved words +in that language (e.g. [% highlight_inline("externals (null)") %] will generate +invalid Java code containing a method public boolean null().) +For internal symbols we add a prefix to avoid this issue, but an external +has to provide an external interface. When generating C code, the +-eprefix option provides a potential solution to this problem. +

+ +

+Names in Snowball are case-sensitive, but external names which differ only in +case will cause a problem for languages with case-insensitive identifiers (such +as Pascal). This issue is avoided for internal symbols in such languages by +encoding case difference via an added prefix. +

+ +

+So for portability a little care is needed when choosing names for externals. +The convention when using Snowball to implement stemming algorithms is to have +a single external named stem, which should be safe. +

+ +

3 Literals

+ +

3.1 Integer Literals

+ +

+A literal integer is an ASCII digit sequence, and is always interpreted as +decimal. +

+ +

3.2 String Literals

+ +

+A literal string is written between single quotes, for example, +

+ +[% highlight(" + 'aeiouy' +") %] + +

+Two special insert characters for use in literal strings are defined by +the directive [% highlight_inline("stringescapes AB") %], for example, +

+ +[% highlight(" + stringescapes {} +") %] + +

+Conventionally { and } are used as the insert +characters, and we would recommend following this convention unless you want to +use these as literal characters in your strings a lot. However, + A  and  B  can be any printing +characters, except that  A  can't be a single quote. +(If A  and  B are the same then + A  itself can never be escaped.) +

+ +

+A subsequent occurrence of the stringescapes directive redefines +the insert characters (but any string macros already defined with +stringdef remain defined). +

+ +

+Within insert characters, the following sequences are understood: +

+ + + +

+For example, +

+ +[% highlight(" + stringescapes {} + + /* Spanish diacritics */ + + stringdef a' '{U+00E1}' // a-acute + stringdef e' '{U+00E9}' // e-acute + stringdef i' '{U+00ED}' // i-acute + stringdef o' '{U+00F3}' // o-acute + stringdef u' '{U+00FA}' // u-acute + stringdef u\" '{U+00FC}' // u-diaeresis + stringdef n~ '{U+00F1}' // n-tilde + + /* All the characters in Spanish used to represent vowels */ + + define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u\"}' +") %] + +

4 Routines

+ +

+A routine definition has the form +

+ +[% highlight(" + define R as C +") %] + +

+where  R  is the routine name and  C  is a command, or bracketed group of +commands. So a routine is defined as a sequence of zero or more commands. +Snowball routines do not (at present) take parameters. For example, +

+ +[% highlight(" + define Step_5b as ( // this defines Step_5b + ['l'] // three commands here: [, 'l' and ] + R2 'l' // two commands, R2 and 'l' + delete // delete is one command + ) +" _ ' + define R1 as $p1 <= cursor + /* R1 is defined as the single command "$p1 <= cursor" */ +') %] + +

+A routine is called simply by using its name,  R, as a command. +

+ +

5 Commands and signals

+ +

+The flow of control in Snowball is arranged by the implicit use of +signals, rather than the explicit use of constructs like the  if, +else,  break  of C. The scheme is designed for handling strings, but is +perhaps easier to introduce using integers. Suppose  x,  y,  z  ... are +integers. The command +

+ +[% highlight(' + $x = 1 +') %] + +

+sets  x  to 1. The command +

+ +[% highlight(' + $x > 0 +') %] + +

+tests if  x  is greater than zero. Both commands give a signal t or f, +(true or false), but while the second command gives t if  x  is greater +than zero and f otherwise, the first command always gives t. In Snowball, +every command gives a t or f signal. A sequence of commands can be turned +into a single command by putting them in a list surrounded by round +brackets: +

+ +
+    ( C1 C2 C3 ... Ci Ci+1 ... )
+
+ +

+When this is obeyed,  Ci+1  will be obeyed if each of the preceding  C1  ... +Ci  give t, but as soon as a  Ci  gives f, the subsequent  Ci+1 Ci+2  ... +are ignored, and the whole sequence gives signal f. If all the  Ci  give t, +however, the bracketed command sequence also gives t. So, +

+ +[% highlight(' + $x > 0 $y = 1 +') %] + +

+sets  y  to 1 if  x  is greater than zero. If  x  is less than or equal to zero +the two commands give f. +

+ +

+If  C1  and  C2  are commands, we can build up the larger commands, +

+ +
+
C1 or C2 +
— Do  C1. If it gives t ignore  C2, otherwise do  C2. The resulting + signal is t if and only if  C1  or  C2  gave t. +
C1 and C2 +
— Do  C1. If it gives f ignore  C2, otherwise do  C2. The resulting + signal is t if and only if  C1  and  C2  gave t. +
not C +
— Do  C. The resulting signal is t if  C  gave f, otherwise f. +
try C +
— Do  C. The resulting signal is t whatever the signal of  C. +
fail C +
— Do  C. The resulting signal is f whatever the signal of  C. +
+ +

+So for example, +

+ +
+
[% highlight_inline('($x > 0 $y = 1) or ($y = 0)') %] +
— sets  y  to 1 if  x  is greater than zero, otherwise to zero. + +
[% highlight_inline('try( ($x > 0) and ($z > 0) $y = 1)') %] +
— sets  y  to 1 if both  x  and  z  are greater than 0, and gives t. +
+ +

+This last example is the same as +

+ +[% highlight(' + try($x > 0 $z > 0 $y = 1) +') %] + +

+so that  and  seems unnecessary here. But we will see that  and  has a +particular significance in string commands. +

+ +

+When a ‘monadic’ construct like  not,  try  or  fail  is not followed by a +round bracket, the construct applies to the shortest following valid command. +So for example +

+ +[% highlight(' + try not $x < 1 $z > 0 +') %] + +

+would mean +

+ +[% highlight(' + try ( not ( $x < 1 ) ) $z > 0 +') %] + +

+because [% highlight_inline('$x < 1') %] is the shortest valid command following  not, and then +not $x < 1  is the shortest valid command following  try. +

+ +

+The ‘dyadic’ constructs like  and  and  or  must sit in a bracketed list +of commands anyway, for example, +

+ +
+    ( C1 C2 and C3 C4 or C5 )
+
+ +

+And then in this case  C2  and  C3  are connected by the  and;  C4  and  C5  are +connected by the  or. So +

+ +[% highlight(' + $x > 0 not $y > 0 or not $z > 0 $t > 0 +') %] + +

+means +

+ +[% highlight(' + $x > 0 ((not ($y > 0)) or (not ($z > 0))) $t > 0 +') %] + +

+and  and  or  are equally binding, and bind from left to right, +so  C1 or C2 and C3  means  (C1 or C2) and C3  etc. +

+ +

6 Integer commands

+ +

+There are two sorts of integer commands - assignments and comparisons. Both +are built from Arithmetic Expressions (AEs). +

+ +

Arithmetic Expressions (AEs)

+ +

+An AE consists of integer names, literal numbers and a few other things +connected by dyadic  +,  -,  *  and  /, and monadic  -, with the same +binding powers and semantics as C. As well as integer names and literal +numbers, the following may be used in AEs: +

+ +
+ +
minint  — the minimum negative number +
maxint  — the maximum positive number +
cursor  — the current value of the string cursor +
limit  — the current value of the string limit +
size  — the size of the string, in "slots" +
sizeof s  — the number of "slots" in  s, where  s  is the name of a string or (since Snowball 2.1) a literal string +
New in Snowball 2.0:
len  — the length of the string, in Unicode characters +
lenof s  — the number of Unicode characters in  s, where  s  is the name of a string or (since Snowball 2.1) a literal string +
+ +

+[% highlight_inline('size') %] and [% highlight_inline('sizeof') %] count in +"slots" - see the "Character representation" section below for details. +

+ +

+The cursor and limit concepts are explained below. +

+ +

Integer assignments

+ +

+An integer assignment has the form +

+ +
+    $X assign_op AE
+
+ +

+where  X  is an integer name and assign_op is one of the five assignments + =,  +=,  -=,  *=, or  /=. +The meanings are the same as in C. +

+ +

+For example, +

+ +[% highlight(' + $p1 = limit // set p1 to the string limit +') %] + +

+Integer assignments always give the signal t. +

+ +

Integer comparisons

+ +

+An integer comparison has the form +

+ +
+    $X rel_op AE
+
+ +

+or (since Snowball 2.0): +

+ +
+    $(AE1 rel_op AE2)
+
+ +

+where  X  is an integer name and rel_op is one of the six tests + ==,  !=,  >=, + >, <=, or  <. +Again, the meanings are the same as in C. +

+ +

+Examples of integer comparisons are, +

+ +[% highlight(' + $p1 <= cursor // signal is f if the cursor is before position p1 + $(len >= 3) // signal is f unless the string is at least 3 characters long +') %] + +

+The second form is more general since an integer name is a valid AE, but it +also allows comparisons which don't involve integer variables. Before support +for this was added the second example could only be achieved by assigning +len to a variable and then testing that variable instead. +

+ +

7 String commands

+ +

+If  s  is a string name, a string command has the form +

+ +[% highlight(' + $s C +') %] + +

+where  C  is a command that operates on the string. Strings can be processed +left-to-right or right-to-left, but we will describe only the +left-to-right case for now. The string has a cursor, which we will +denote by c, and a limit point, or limit, which we will denote by l. c +advances towards l in the course of a string command, but the various +constructs  and,  or,  not  etc have side-effects which keep moving it +backwards. Initially c is at the start and l the end of the string. For +example, +

+ +
+        'a|n|i|m|a|d|v|e|r|s|i|o|n'
+        |                         |
+        c                         l
+
+ +

+c, and l, mark the boundaries between characters, and not +characters themselves. The characters between c and l will be denoted by +c:l. +

+ +

+If  C  gives t, the cursor c will have a new, well-defined value. But if  C +gives f, c is undefined. Its later value will in fact be determined by the +outer context of commands in which  C  came to be obeyed, not by  C  itself. +

+ +

+Here is a list of the commands that can be used to operate on strings. +

+ +

a) Setting a value

+ +
+
= S +
where  S  is the name of a string or a literal string. c:l is set equal + to  S, and l is adjusted to point to the end of the copied string. The + signal is t. For example, + +[% highlight(' + $x ' _ " = 'animadversion' /* literal string */" _ ' + $y = x /* string name */ +') %] + +
+ +

b) Basic tests

+ +
+
S +
here and below,  S  is the name of a string or a literal string. If c:l + begins with the substring  S, c is repositioned to the end of this + substring, and the signal is t. Otherwise the signal is f. For example, + +[% highlight(' + $x ' _ "'anim' /* gives t, assuming the string is 'animadversion' */" _ ' + $x ' _ "('anim' 'ad' 'vers')" _ ' + /* ditto */ + + $t ' _ "= 'anim'" _ ' + $x t /* ditto */ +') %] + +
true,  false +
true  is a dummy command that generates signal t.  false  generates + signal f. They are sometimes useful for emphasis, + +[% highlight(" + define start_off as true // nothing to do + define exception_list as false // put in among(...) list later +") %] + +  true  is equivalent to  () +
C1 or C2 +
This is like the case for integers described above, but the extra + touch is that if  C1  gives f, c is set back to its old position after +  C1  has given f and before  C2  is tried, so that the test takes place on + the same point in the string. So we have + +[% highlight(' + $x ' _ "('anim' /* signal t */ + 'ation' /* signal f */ + ) or + ( 'an' /* signal t - from the beginning */ + ) +") %] + +
C1 and C2 +
And similarly c is set back to its old position after  C1  has given t + and before  C2  is tried. So, + +[% highlight(' + $x ' _ "'anim' and 'an' /* signal t */" _ ' + $x ' _ "('anim' 'an') /* signal f, since 'an' and 'ad' mis-match */ +") %] + +
not C +
try C +
These are like the integer tests, with the added feature that c is set + back to its old position after an f signal is turned into t. So, + +[% highlight(' + $x ' _ "(not 'animation' not 'immersion') + /* both tests are done at the start of the string */ +" _ ' + $x ' _ "(try 'animus' try 'an' + 'imad') + /* - gives t */ +") %] + +
+
 try C  is equivalent to  C or true +
+
test C +
This does command  C  but without advancing c. Its signal is the same as + the signal of  C, but following signal t, c is set back to its old + value. +
+
 test C  is equivalent to  not not C +
 test C1 C2  is equivalent to  C1 and C2 +
+
fail C +
This does  C  and gives signal f. It is equivalent to  C false. Like +  false  it is useful, but only rarely. + +
do C +
This does  C, puts c back to its old value and gives signal t. It is + very useful as a way of suppressing the side effect of f signals and + cursor movement. +
+
 do C  is equivalent to  try test C +
or  test try C +
+
goto C +
c is moved right until obeying  C  gives t. But if c cannot be moved + right because it is at l the signal is f. c is set back to the position + it had before the last obeying of  C, so the effect is to leave c before + the pattern which matched against  C. + +[% highlight(' + $x goto' _ " 'ad' /* positions c after 'anim' */" _ ' + $x goto' _ " 'ax' /* signal f */ +") %] + +
gopast C +
Like goto, but c is not set back, so the effect is to leave c after + the pattern which matched against  C. + +[% highlight(' + $x gopast' _ " 'ad' /* positions c after 'animad' */ +") %] + +
repeat C +
C  is repeated until it gives f. When this happens c is set back to the + position it had before the last repetition of  C, and  repeat C  gives + signal t. For example, + +[% highlight(' + $x repeat gopast' _ " 'a' /* position c after the last 'a' */ +") %] + +
loop AE C +
This is like  C C ... C  written out AE times, where AE is an arithmetic + expression. For example, + +[% highlight(' + $x loop 2 gopast' _ " ('a' or 'e' or 'i' or 'o' or 'u') + /* position c after the second vowel */ +") %] + + The equivalent expression in C has the shape, + +[% highlight(" + { int i; + int limit = AE; + for (i = 0; i < limit; i++) C; + } +", "c") %] + +
atleast AE C +
This is equivalent to  loop AE C repeat C. + +
hop AE +
moves c AE character positions towards l, but if AE is negative, or if + there are less than AE characters between c and l the signal is f. + For example, + +[% highlight(" + test hop 3 +") %] + + tests that c:l contains more than 2 characters. + +
next +
is equivalent to  hop 1. +
+ +

c) Moving text about

+ +

+We have seen in (a) that  $x = y, when  x  and  y  are strings, sets c:l of  x +to the value of  y. Conversely +

+ +[% highlight(' + $x => y +') %] + +

+sets the value of  y  to the c:l region of  x. +

+ +

+A more delicate mechanism for pushing text around is to define a substring, +or slice of the string being tested. Then +

+ +
+
[% highlight_inline('[') %] +
sets the left-end of the slice to c, +
[% highlight_inline(']') %] +
sets the right-end of the slice to c, +
[% highlight_inline("-> s") %] +
moves the slice to variable  s, +
[% highlight_inline("<- S") %] +
replaces the slice with variable (or literal)  S. +
+ +

+For example +

+ +[% highlight(" + /* assume x holds 'animadversion' */" _ ' + $x ( [ ' _ " // '[animadversion' - [ set as indicated + loop 2 gopast 'a' + // '[anima|dversion' - c is marked by '|' + ] // '[anima]dversion' - ] set as indicated + -> y // y is 'anima' + ) +") %] + +

+For any string, the slice ends should be assumed to be unset until they are +set with the two commands  [,  ]. Thereafter the slice ends will retain +the same values until altered. +

+ +
+
[% highlight_inline("delete") %] +
is equivalent to [% highlight_inline("<- ''") %] +
+ +

+This next example deletes all vowels in x, +

+ +[% highlight(" + define vowel ('a' or 'e' or 'i' or 'o' or 'u') + /* ... */" _ ' + $ x repeat ( gopast([vowel]) delete ) +') %] + +

+As this example shows, the slice markers  [  and  ]  often appear as +pairs in a bracketed style, which makes for easy reading of the Snowball +scripts. But it must be remembered that, unusually in a computer +programming language, they are not true brackets. +

+ +

+More simply, text can be inserted at c. +

+ +
+
[% highlight_inline("insert S") %] +
insert variable or literal  S  before c, moving c to the right of the + insert.  <+  is a synonym for  insert. + +
[% highlight_inline("attach S") %] +
the same, but leave c at the left of the insert. +
+ +

d) Marks

+ +

+The cursor, c, (and the limit, l) can be thought of as having a numeric +value, from zero upwards: +

+ +
+         | a | n | i | m | a | d | v | e | r | s | i | o | n |
+         0   1   2   3   4   5   6   7   8   9  10  11  12  13
+
+ +

+It is these numeric values of c and l which are accessible through +cursor  and  limit  in arithmetic expressions. +

+ +
+
[% highlight_inline("setmark X") %] +
sets  X  to the current value of c, where  X  is an integer variable. + It's equivalent to: [% highlight_inline("$X = cursor") %] + +
[% highlight_inline("tomark AE") %] +
moves c forward to the position given by AE, + +
[% highlight_inline("atmark AE") %] +
tests if c is at position AE (t or f signal). + It's equivalent to: [% highlight_inline("$(cursor == AE)") %] +
+ +

+In the case of [% highlight_inline("tomark AE") %], a similar fail condition occurs as with [% highlight_inline("hop AE") %]. +If c is already beyond AE, or if position l is before position AE, the +signal is f. +

+ +

+In the stemming algorithms, certain regions of the word are defined by +setting marks, and later the failure condition of [% highlight_inline("tomark") %] is used to see if +c is inside a particular region. +

+ +

+Two other commands put c at l, and test if c is at l, +

+ +
+
[% highlight_inline("tolimit") %] +
moves c forward to l (signal t always), + +
[% highlight_inline("atlimit") %] +
tests if c is at l (t or f signal). +
+ +

e) Changing l

+ +

+In this account of string commands we see c moving right towards l, while +l stays fixed at the end. In fact l can be reset to a new position between +c and its old position, to act as a shorter barrier for the movement of c. +

+ +
+
setlimit C1 for C2 +
C1  is obeyed, and if it gives f the signal from  setlimit + is f with no further action. +

+ +

+ Otherwise, the final value of c becomes the new + position of l. c is then set back to its old value before  C1  was + obeyed, and  C2  is obeyed. Finally l is set back to its old position, + and the signal of  C2  becomes the signal of  setlimit. +

+ +

+ So the signal is f if either  C1  or  C2  gives f, otherwise t. + For example, +

+ +[% highlight(' + $x ( setlimit goto' _ " 's' // 'animadver}sion' new l as marked '}' + for // below, '|' marks c after each goto + ( goto 'a' and // '|animadver}sion' + goto 'e' and // 'animadv|er}sion' + goto 'i' and // 'an|imadver}sion' + ) + ) +") %] + +

+ This checks that x has characters ‘a’, ‘e’ and ‘i’ before the first + ‘s’. +

+ +
+ +

f) Backward processing

+ +

+String commands have been described with c to the left of l and moving +right. But the process can be reversed. +

+ +
+
[% highlight_inline("backwards C") %] +
c and l are swapped over, and c moves left towards l.  C  is obeyed, the + signal given by  C  becomes the signal of  backwards C, and c and l are + swapped back to their old values (except that l may have been adjusted + because of deletions and insertions).  C  cannot contain another + [% highlight_inline("backwards") %] command. + +
[% highlight_inline("reverse C") %] +
A similar idea, but here c simply moves left instead of moving right, + with the beginning of the string as the limit, l.  C  can contain other + [% highlight_inline("reverse") %] commands, but it cannot contain commands to do deletions or + insertions — it must be used for testing only. (Without this + restriction Snowball's semantics would become very untidy.) +
+ +

+Forward and backward processing are entirely symmetric, except that forward +processing is the default direction, and literal strings are always +written out forwards, even when they are being tested backwards. So the +following are equivalent, +

+ +[% highlight(' + $x (' _ " + 'ani' 'mad' 'version' atlimit + ) +" _ ' + $x backwards (' _ " + 'version' 'mad' 'ani' atlimit + ) +") %] + +

+If a routine is defined for backwards mode processing, it must be included +inside a  backwardmode(...)  declaration. +

+ +

g) substring and among

+ +

+The use of [% highlight_inline("substring") %] and [% highlight_inline("among") %] is central to the implementation of the +stemming algorithms. It is like a case switch on strings. In its simpler +form, +

+ +
+        substring among('S1' 'S2' 'S3' ...)
+
+ +

+searches for the longest matching substring  'S1'  or  'S2'  or  'S3'  ... from +position c. (The  'Si'  must all be different.) So this has the same +semantics as +

+ +
+        ('S1' or 'S2' or 'S3' ...)
+
+ +

+— so long as the  'Si'  are written out in decreasing order of length. +

+ +

+substring  may be omitted, in which case it is attached to its following +among, so +

+ +[% highlight(" + among(/*...*/) +") %] + +

+without a preceding [% highlight_inline("substring") %] is equivalent to +

+ +[% highlight(" + (substring among(/*...*/)) +") %] + +

+[% highlight_inline("substring") %] may also be detached from its [% highlight_inline("among") %], although it must +precede it textually in the same routine in which the [% highlight_inline("among") %] appears. +The more general form of [% highlight_inline("substring /* ... */ among") %] is, +

+ +
+    substring
+    C
+    among( 'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+Obeying  substring  searches for a longest match among the  'Sij'. The +signal from  substring  is t if a match is found, otherwise f. +Any commands C between the substring and among will be run after this +search and only if the search finds a match (it would be equivalent to removing C and replacing each +Ci with C Ci). When the +among  comes to be obeyed, the  Ci  corresponding to the matched  'Sij'  is +obeyed, and its signal becomes the signal of the  among  command. +

+ +

+substring/among  pairs must match up textually inside each routine +definition. But there is no problem with an  among  containing other +substring/among  pairs, and  substring  is optional before  among  anyway. +The essential constraint is that two  substrings must be separated by an +among, and each  substring  must be followed by an  among. +

+ +

+The effect of obeying  among  when the preceding  substring  is not obeyed +is undefined. This would happen for example here, +

+ +[% highlight(' + try($x != 617 substring)' _ " + among(...) // 'substring' is bypassed in the exceptional case where x == 617 +") %] + +

+The significance of separating the  substring  from the  among  is to allow +them to work in different contexts. For example, +

+ +
+    setlimit tomark L for substring
+
+    among( 'S11' 'S12' ... (C1)
+           ...
+
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+Here the test for the longest  'Sij'  is constrained to the region between c +and the mark point given by integer  L. But the commands  Ci  operate outside +this limit. Another example is +

+ +
+    reverse substring
+
+    among( 'S11' 'S12' ... (C1)
+           ...
+
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+The substring test is in the opposite direction in the string to the +direction of the commands  Ci. +

+ +

+The last  (Cn)  may be omitted, in which case  (true)  is assumed. +

+ +

+Each string  'Sij'  may be optionally followed by a +routine name, +

+ +
+    among(
+           'S11' R11 'S12' R12 ... (C1)
+           'S21' R21 'S22' R22 ... (C2)
+           ...
+           'Sn1' Rn1 'Sn2' Rn1 ... (Cn)
+         )
+
+ +

+If a routine name is not specified, it is equivalent +to a routine which simply returns signal t, +

+ +[% highlight(" + define null as true +") %] + +

+— so we can imagine each  'Sij'  having its associated routine +Rij. Then obeying the  among  causes a search for the longest +'Sij'  whose corresponding routine +Rij  gives t. +

+ +

+The routines Rij  should be written without any +side-effects, other than the inevitable cursor movement. (c is in +any case set back to its old value following a call of +Rij.) +

+ +

8 Booleans

+ +

+[% highlight_inline("set B") %] and [% highlight_inline("unset B") %] set  B  to true and false respectively, where  B  is a +boolean name. [% highlight_inline("B") %] as a command gives a signal t if it is set true, f +otherwise. For example, +

+ +[% highlight(" + booleans ( Y_found ) // declare the boolean + + /* ... */ + + unset Y_found // unset it + do ( ['y'] <-'Y' set Y_found ) + /* if c:l begins 'y' replace it by 'Y' and set Y_found */ + + do repeat(goto (v ['y']) <-'Y' set Y_found) + /* repeatedly move down the string looking for v 'y' and + replacing 'y' with 'Y'. Whenever the replacement takes + place set Y_found. v is a test for a vowel, defined as + a grouping (see below). */ + + + /* Y_found means there are some letters Y in the string. + Later we can use this to trigger a conversion back to + lower case y. */ + + /* ... */ + + do (Y_found repeat(goto (['Y']) <- 'y') +") %] + +

9 Groupings

+ +

+A grouping brings characters together and enables them to be looked for +with a single test. +

+ +

+If  G  is declared as a grouping, it can be defined by +

+ +
+    define G G1 op G2 op G3 ...
+
+ +

+where op is  +  or  -, and  G1,  G2,  G3  are literal strings, or groupings that +have already been defined. (There can be zero or more of these additional +op components). For example, +

+ +[% highlight(" + define capital_letter 'ABDEFGHIJKLMNOPQRSTUVWXYZ' + define small_letter 'abdefghijklmnopqrstuvwxyz' + define letter capital_letter + small_letter + define vowel 'aeiou' + 'AEIOU' + define consonant letter - vowel + define digit '0123456789' + define alphanumeric letter + digit +") %] + +

+Once  G  is defined, it can be used as a command, and is equivalent to a test +

+ +
+    'ch1' or 'ch2' or ...
+
+ +

+where  ch1,  ch2  ... list all the characters in the grouping. +

+ +

+[% highlight_inline("non G") %] is the converse test, and matches any character except the +characters of  G. Note that [% highlight_inline("non G") %] is not the same as [% highlight_inline("not G") %], in fact +

+ +

+[% highlight_inline("non G") %] is equivalent to [% highlight_inline("(not G next)") %] +

+ +

+[% highlight_inline("non") %] may be optionally followed by hyphen, for example: +

+ +[% highlight(" + non-vowel + non-digit +") %] + +

+Bear in mind that [% highlight_inline("non-vowel") %] doesn't only match a
+consonant - it'll match any character which isn't in the vowel
+grouping. Failing to consider this has led to bugs in stemming algorithms -
+for example, here we intended to undouble a consonant:
+

+ +[% highlight(" + [non-vowel] -> ch + ch + delete +") %] + +

+The problem with this code is it will also mangle numbers with repeated digits, +for example 1900 would become 190. A good rule of +thumb here seems to be to use an inclusive grouping check if the code goes on +to delete the character matched: +

+ +[% highlight(" + [consonant] -> ch + ch + delete +") %] + +

10 A Snowball program

+ +

+A complete program consists of a sequence of declarations followed by a +sequence of definitions of groupings and routines. Routines which are +implicitly defined as operating on c:l from right to left must be included +in a  backwardmode(...)  declaration. +

+ +

+A Snowball program is called up via a simple +API +through its defined externals. For example, +

+ +[% highlight(" + externals ( stem1 stem2 ) + /* ... */ + define stem1 as ( /* stem1 commands */ ) + define stem2 as ( /* stem2 commands */ ) +") %] + +

+The API also allows a current string to be defined, and this becomes the +c:l string for the external routine to work on. Its final value is the +result handed back through the API. +

+ +

+The strings, integers and booleans are accessible from any point in the +program, and exist throughout the running of the Snowball program. They are +therefore like static declarations in C. +

+ +

11 Comments, and other whitespace fillers

+ +

+At a deeper level, a program is a sequence of tokens, interspersed with +whitespace. Names, reserved words, literal numbers and strings are all +tokens. Various symbols, made up of non-alphanumerics, are also tokens. +

+ +

+A name, reserved word or number is terminated by the first character that +cannot form part of it. A symbol is recognised as the longest sequence of +characters that forms a valid symbol. So  +=-  is two symbols,  +=  and +-, because  +=  is a valid symbol in the language while  +=-  is not. +Whitespace separates tokens but is otherwise ignored. This of course is +like C. +

+ +

+Occasionally a newer version of Snowball may add a new token. So as not to +break existing programs, any such tokens declared as a name (via +[% highlight_inline('integers') %], [% highlight_inline('routines') %], etc) +will lose their token status for the rest of the program. This applies +to the tokens +[% highlight_inline('len') %] +and +[% highlight_inline('lenof') %]. +

+ +

+Anywhere that whitespace can occur, there may also occur: +

+ +

+(a) Comments, in the usual multi-line [% highlight_inline('/* .... */') %] or single line +[% highlight_inline('// ...') %] format. +

+ +

+(b) Get directives. These are like  #include  commands in C, and have the form +[% highlight_inline("get 'S'") %], where  'S'  is a literal string. For example, +

+ +[% highlight(" + get '/home/martin/snowball/main-hdr' // include the file contents +") %] + +

+(c) [% highlight_inline("stringescapes XY") %] where  X  and  Y  are any two printing characters. +

+ +

+(d) [% highlight_inline("stringdef m 'S'") %] where  m  is a sequence of characters not including
+whitespace and terminated with whitespace, and  'S'  is a literal string.
+

+ +

12 Character representation

+ +

+In this description of Snowball, it is assumed that strings are composed of
+characters, and that characters can be defined numerically, but the numeric range
+of these characters is not defined. As implemented, three different schemes
+are supported. Characters can either be (a) bytes in the range 0 to 255,
+as in traditional C strings, or (b) byte pairs in the range 0 to 65535,
+as in Java strings, or (c) UTF-8 encoded byte sequences in the range 0
+to 65535, so that a character may occupy 1, 2 or 3 bytes.
+

+ +

+For case (c), we need to make a slight separation of the concept of +characters into symbols, the units of text being represented, and +slots, the units of space into which they map. (So in case (a), all +slots are one byte; in case (b) all slots are two bytes.) +c and l have numeric values that can be used in AEs (arithmetic +expressions). These values count the number of slots. Similarly +setmark,  tomark  and  atmark  are remembering and then using slot +counts.  size  and  sizeof  measure string size +in slots, not symbols. However,  hop N  moves c over  N  symbols, +not  N  slots, and  next  is equivalent to  hop 1. +

+ +

+Snowball 2.0 adds len and lenof, which measure string length in symbols +(so they're the same as size and sizeof in cases (a) and (b), but +different in case (c)). +

+ +

+So long as these simple distinctions are recognised, the same Snowball +script can be compiled to work with any of the three encoding schemes. +

+ +

13 Legacy Features

+ +

13.1 hex and decimal

+ +

+This section documents features of Snowball for which there's a strongly
+preferred alternative. They're still supported for compatibility with
+existing code which uses them, but you shouldn't use them in new code.
+We document them here so that their meaning in existing code can be
+understood, and especially to aid updating to the preferred alternatives.
+

+ +

+In a  stringdef , the string may be preceded by the word  hex,
+or the word  decimal. This was how non-ASCII characters
+were specified before support for specifying Unicode codepoints using the
+U+ notation was added.
+

+ +

+hex and decimal mean that the contents of the string
+are interpreted as character values written out in hexadecimal, or decimal,
+notation. The characters should be separated by spaces. For example,
+

+ +[% highlight(" + hex 'DA' /* is character hex DA */ + hex 'D A' /* is the two characters, hex D and A (carriage + return, and line feed) */ + decimal '10' /* character 10 (line feed) */ + decimal '13 10' /* characters 13 and 10 (carriage return, and + line feed) */ +") %] + +

+The following forms are equivalent, +

+ +[% highlight(" + hex 'd a' /* lower case also allowed */ + hex '0D 000A' /* leading zeroes ignored */ + hex ' D A ' /* extra spacing is harmless */ +") %] + +

+The interpretation of the values is as Unicode codepoints if command +line option -utf8 or -widechars is specified, and as +character values in an unspecified single byte character set otherwise. For +ASCII and ISO-8859-1 the character values match Unicode codepoints, but to +handle other single byte character sets (e.g. ISO-8859-2 or KOI8-R) you needed +a special version of a Snowball source with different character values +specified via stringdef. The U+ notation allows +you to use a single Snowball source in this situation. +

+ +

13.2 among starter command

+ +

+The among command supports a "starter" command, C +in this example: +

+ +
+    among( (C)
+           'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+This is equivalent to adding C at the start of each +Ci: +

+ +
+    among( 'S11' 'S12' ... (C C1)
+           'S21' 'S22' ... (C C2)
+           ...
+           'Sn1' 'Sn2' ... (C Cn)
+         )
+
+ +

+However, both are equivalent to: +

+ +
+    substring C
+    among( 'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+This requires an explicit substring but seems clearer so +we recommend using this in new code and have designated the use of a starter as +a legacy feature. +

+ +

+A starter is also allowed with an explicit substring, for example: +

+ +
+    substring
+    Cs
+    among( (Ca)
+           'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

+is equivalent to: +

+ +
+    substring
+    Cs
+    Ca
+    among( 'S11' 'S12' ... (C1)
+           'S21' 'S22' ... (C2)
+           ...
+           'Sn1' 'Sn2' ... (Cn)
+         )
+
+ +

Snowball syntax

+ +
+

+In the grammar which follows, ||  is used for alternatives, + [X]  means that X is +optional, and  [X]*  means that X is repeated zero or more +times. meta-symbols are defined on the left.  <char>  means any +character. +

+ +

+The definition of  literal string  does not allow for the escaping +conventions established by the  stringescapes  directive. The command +?  is a debugging aid. +

+ +
+<letter>        ::= a || b || ... || z || A || B || ... || Z
+<digit>         ::= 0 || 1 || ... || 9
+<name>          ::= <letter> [ <letter> || <digit> || _ ]*
+<s_name>        ::= <name>
+<i_name>        ::= <name>
+<b_name>        ::= <name>
+<r_name>        ::= <name>
+<g_name>        ::= <name>
+<literal string>::= '[<char>]*'
+<number>        ::= <digit> [ <digit> ]*
+
+S               ::= <s_name> || <literal string>
+G               ::= <g_name> || <literal string>
+
+<declaration>   ::= strings ( [<s_name>]* ) ||
+                    integers ( [<i_name>]* ) ||
+                    booleans ( [<b_name>]* ) ||
+                    routines ( [<r_name>]* ) ||
+                    externals ( [<r_name>]* ) ||
+                    groupings ( [<g_name>]* )
+
+<r_definition>  ::= define <r_name> as C
+<plus_or_minus> ::= + || -
+<g_definition>  ::= define <g_name> G [ <plus_or_minus> G ]*
+
+AE              ::= (AE) ||
+                    AE + AE || AE - AE || AE * AE || AE / AE || - AE ||
+                    maxint || minint || cursor || limit ||
+                    size || sizeof S ||
+                    len || lenof S ||
+                    <i_name> || <number>
+
+<i_assign>      ::= $ <i_name> = AE ||
+                    $ <i_name> += AE || $ <i_name> -= AE ||
+                    $ <i_name> *= AE || $ <i_name> /= AE
+
+<i_test_op>     ::= == || != || > || >= || < || <=
+
+<i_test>        ::= $ ( AE <i_test_op> AE ) ||
+                    $ <i_name> <i_test_op> AE
+
+<s_command>     ::= $ <s_name> C
+
+C               ::= ( [C]* ) ||
+                    <i_assign> || <i_test> || <s_command> || C or C || C and C ||
+                    not C || test C || try C || do C || fail C ||
+                    goto C || gopast C || repeat C || loop AE C ||
+                    atleast AE C || S || = S || insert S || attach S ||
+                    <- S || delete ||  hop AE || next ||
+                    => <s_name> || [ || ] || -> <s_name> ||
+                    setmark <i_name> || tomark AE || atmark AE ||
+                    tolimit || atlimit || setlimit C for C ||
+                    backwards C || reverse C || substring ||
+                    among ( [<literal string> [<r_name>] || (C)]* ) ||
+                    set <b_name> || unset <b_name> || <b_name> ||
+                    <r_name> || <g_name> || non [-] <g_name> ||
+                    true || false || ?
+
+P              ::=  [P]* || <declaration> ||
+                    <r_definition> || <g_definition> ||
+                    backwardmode ( P )
+
+<program>      ::=  P
+
+
+
+synonyms:      <+ for insert
+
+ +[% footer %] diff --git a/credits.html b/credits.html new file mode 100644 index 0000000..2c64239 --- /dev/null +++ b/credits.html @@ -0,0 +1,141 @@ + + + + + + + + + + Credits - Snowball + + + + + + + + + + +
+
+ +
+

Credits

+ + +

+Snowball, and most of the current stemming algorithms were written by +Dr Martin Porter, who also prepared the material for the Website. +The Snowball to Java codegenerator, and supporting Java libraries, were +contributed by Richard Boulton. +Dr Andrew MacFarlane, of City University, London, gave much +initial encouragement and proofreading +assistance. +

+ +

+Richard Boulton established the original Snowball website, from which this +website has evolved. +

+ +

+Linguistic assistance for Russian, German and Dutch has been provided by + +Patrick Miles +(of the +Patrick Miles Translation Agency, Cambridge, UK). Pat is a distinguished +translator, whose English versions of Chekhov have appeared on the London +stage. +

+ +

+Various emailers have helped improve the stemmers with their many suggestions +and comments. We must especially mention Andrei Aksyonoff and +Oleg Bartunov (Russian), Steve Tolkin and Wendy Reetz (English), and Fred Brault (French). +Blake Madden found a number of elusive errors in the stemmer descriptions. +

+ +

+Anna Tordai has provided the Hungarian stemming algorithm. +

+ +

+Evren (Kapusuz) Cilden has provided the Turkish stemming algorithm. +

+ +

+Olly Betts has made a significant performance improvement to the C +codegenerator. +

+ +

+The Snowball mailing lists are hosted for us free by James Aylett, who +owns and runs the machine that hosts the +tartarus website. +

+ +

+We received two Romanian stemming algorithms in 2006, from Erwin +Glockner, Doina Gliga and Marina Stegarescu, working at Heidelberg, +and from Irina Tirdea in Bucharest. After some experimentation, +the Snowball Romanian stemmer has been rewritten from scratch, but the +basic list of verb endings with their separation into two groups with +different removal criteria is taken from Irina Tirdea's stemmer. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/demo.html b/demo.html new file mode 100644 index 0000000..d433c10 --- /dev/null +++ b/demo.html @@ -0,0 +1,174 @@ + + + + + + + + + + Demo - Snowball + + + + + + + + + + +
+
+ +
+

Demo

+ + + + +

+Try the +stemming algorithm: +

+ + +
+ +
+ +

+This demo performs stemming entirely within your browser, using Javascript code generated by the Snowball compiler. +

+ + + + +
+
+
+ +
+ +
+ + + + diff --git a/download.html b/download.html new file mode 100644 index 0000000..c0bff22 --- /dev/null +++ b/download.html @@ -0,0 +1,144 @@ + + + + + + + + + + Download - Snowball + + + + + + + + + + +
+
+ +
+

Download

+ + +

Tarballs

+ +

Several tarballs of the Snowball sources are available:

+ +
    +
  • +The C version of the libstemmer library.
    +This contains all you need to include the snowball stemming algorithms into a +C project of your own. If you download this, you don't need to use the snowball +compiler, or worry about the internals of the stemmers in any way. +
  • +
  • +The C# version of the libstemmer library.
    +This contains all you need to include the snowball stemming algorithms into a +C# project of your own. If you download this, you don't need to use the snowball +compiler, or worry about the internals of the stemmers in any way. +
  • +
  • +The Java version of the libstemmer library.
    +This contains all you need to include the snowball stemming algorithms into a +Java project of your own. If you download this, you don't need to use the snowball +compiler, or worry about the internals of the stemmers in any way. +
  • +
  • +Snowball, algorithms, and libstemmer library.
    +This contains all the source code for snowball (but not the generated source files). +This is useful mainly if you are wanting to work on the algorithms (tweaking them, +or producing new algorithms). +
  • +
+ +

+We do not make binary (ie, compiled) distributions of snowball available - +there are simply too many different platforms, architectures and languages to +support. If you are willing to make such binaries available for others, and +can provide at least some measure of support for ensuring that they work, feel +free to contact us and we will add a link to your work from this site. +

+ +

Python

+ +

+We provide and support python wrappers for Snowball. The latest code can +be downloaded from the PyStemmer repo. +

+ +

Git

+ +

+Developers may wish to access the latest source using the command: +

+ +
+git clone git@github.com:snowballstem/snowball.git
+
+ +

Web interface

+ +

+The git repository can also be browsed online. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..5a12b6e Binary files /dev/null and b/favicon.ico differ diff --git a/index.html b/index.html new file mode 100644 index 0000000..d353bee --- /dev/null +++ b/index.html @@ -0,0 +1,326 @@ + + + + + + + + + + Snowball + + + + + + + + + + +
+
+ +
+

Snowball

+ + +
+
+ +

+Snowball is a small string processing language for creating +stemming algorithms for use in Information Retrieval, plus a collection of +stemming algorithms implemented using it. +

+ +

+It was originally designed +and built by Martin +Porter. Martin retired from development in 2014 and Snowball is now +maintained as a community project. Martin originally chose the name Snowball as +a tribute to SNOBOL, the +excellent string handling language from the 1960s. It now also serves as a +metaphor for how the project grows by gathering contributions over time. +

+ +

+The Snowball compiler translates a Snowball program into source code in another +language - currently Ada, ISO C, C#, Go, Java, Javascript, Object Pascal, +Python and Rust are supported. +

+ +

What is Stemming?

+ +

+Stemming maps different forms of the same word to a common "stem" - for
+example, the English stemmer maps connection, connections,
+connective, connected, and connecting to connect.
+So a search for connected would also find documents which only
+have the other forms.
+

+ +

+This stem form is often a word itself, but this is not always the case as +this is not a requirement for text search systems, which are the intended +field of use. We also aim to conflate words with the same meaning, rather +than all words with a common linguistic root (so awe and awful +don't have the same stem), and over-stemming is more problematic than +under-stemming so we tend not to stem in cases that are hard to resolve. If +you want to always reduce words to a root form and/or get a root form which is +itself a word then Snowball's stemming algorithms likely aren't the right +answer. +

+ + + +

+Please address all Snowball-related mail to the snowball-discuss mailing list. +

+ +

+Any such mail sent directly to individual developers may be answered less +speedily, and in any case they reserve the right to post their answers on snowball-discuss. +

+ +

Major events

+ + +
    +
  • + Sep 2023 - Estonian stemming algorithm contributed by Linda Freienthal. +
  • +
  • + Nov 2021 - Snowball 2.2.0 released! +
  • +
  • + Jan 2021 - Snowball 2.1.0 released. +
  • +
  • + Jan 2021 - Armenian stemmer from Astghik Mkrtchyan merged into the distribution. +
  • +
  • + Jan 2021 - Ada backend contributed by Stephane Carrez. +
  • +
  • + Nov 2020 - Yiddish stemming algorithm contributed by Assaf Urieli. +
  • +
  • + Oct 2019 - Serbian stemming algorithm contributed by Stefan Petkovic and Dragan Ivanovic. +
  • +
  • + Oct 2019 - Snowball 2.0.0 released. +
  • +
  • + Aug 2019 - Hindi stemming algorithm contributed by Olly Betts. +
  • +
  • + Aug 2019 - Basque and Catalan merged into the distribution. +
  • +
  • + Oct 2018 - Greek stemming algorithm contributed by Oleg Smirnov. +
  • +
  • + Jun 2018 - Object pascal backend from Wout van Wezel merged. +
  • +
  • + May 2018 - Lithuanian stemming algorithm contributed by Dainius Jocas. +
  • +
  • + May 2018 - Indonesian stemming algorithm contributed by Olly Betts. +
  • +
  • + Mar 2018 - C# backend contributed by Cesar Souza. +
  • +
  • + Mar 2018 - Javascript backend merged. +
  • +
  • + Jun 2017 - Go backend contributed by Marty Schoch. +
  • +
  • + Mar 2017 - Rust backend contributed by Jakob Demler. +
  • +
  • + Jan 2016 - Arabic stemming algorithm contributed by Assem Chelli. +
  • +
  • + Oct 2015 - Tamil stemming algorithm contributed by Damodharan Rajalingam. +
  • +
  • + Sep 2015 - New home for snowball on snowballstem.org. +
  • +
  • + Sep 2014 - Martin Porter retires from snowball development. +
  • +
  • + May 2012 - Contributed stemmers for Irish and Czech. +
  • +
  • + Jul 2010 - Contributed stemmers for Armenian, Basque, Catalan. +
  • +
  • + Mar 2007 - Romanian stemmer. +
  • +
  • + Jan 2007 - Turkish stemmer. Contributed by Evren (Kapusuz) Cilden. +
  • +
  • + Sep 2006 - Hungarian stemmer. Contributed by Anna Tordai. +
  • +
  • + Jun 2006 - Supported and updated Python bindings. +
  • +
  • + May 2005 - UTF-8 Unicode support. +
  • +
  • + Sep 2002 - Finnish stemmer. +
  • +
  • + Jul 2002 - ISO Latin I as default + The use of MS DOS Latin I is now history, but the old versions of the + Snowball stemmers are still accessible on the site. +
  • +
  • + May 2002 - Unicode support +
  • +
  • + Feb 2002 - Java support + Richard has modified the snowball code generator to produce Java output as + well as ANSI C output. This means that pure Java systems can now use the + snowball stemmers. +
  • +
+ +
+
+ +

Links to resources

+ + + +

Snowball compiler

+ + + +
+
+ +
+
+
+ +
+ +
+ + + + diff --git a/js/arabic-stemmer.js b/js/arabic-stemmer.js new file mode 100644 index 0000000..7de24c0 --- /dev/null +++ b/js/arabic-stemmer.js @@ -0,0 +1,1613 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var ArabicStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0640", -1, 1], + ["\u064B", -1, 1], + ["\u064C", -1, 1], + ["\u064D", -1, 1], + ["\u064E", -1, 1], + ["\u064F", -1, 1], + ["\u0650", -1, 1], + ["\u0651", -1, 1], + ["\u0652", -1, 1], + ["\u0660", -1, 2], + ["\u0661", -1, 3], + ["\u0662", -1, 4], + ["\u0663", -1, 5], + ["\u0664", -1, 6], + ["\u0665", -1, 7], + ["\u0666", -1, 8], + ["\u0667", -1, 9], + ["\u0668", -1, 10], + ["\u0669", -1, 11], + ["\uFE80", -1, 12], + ["\uFE81", -1, 16], + ["\uFE82", -1, 16], + ["\uFE83", -1, 13], + ["\uFE84", -1, 13], + ["\uFE85", -1, 17], + ["\uFE86", -1, 17], + ["\uFE87", -1, 14], + ["\uFE88", -1, 14], + ["\uFE89", -1, 15], + ["\uFE8A", -1, 15], + ["\uFE8B", -1, 15], + ["\uFE8C", -1, 15], + ["\uFE8D", -1, 18], + ["\uFE8E", -1, 18], + ["\uFE8F", -1, 19], + ["\uFE90", -1, 19], + ["\uFE91", -1, 19], + ["\uFE92", -1, 19], + ["\uFE93", -1, 20], + ["\uFE94", -1, 20], + ["\uFE95", -1, 21], + ["\uFE96", -1, 21], + ["\uFE97", -1, 21], + ["\uFE98", -1, 21], + ["\uFE99", -1, 22], + ["\uFE9A", -1, 22], + ["\uFE9B", -1, 22], + ["\uFE9C", -1, 22], + ["\uFE9D", -1, 23], + ["\uFE9E", -1, 23], + ["\uFE9F", -1, 23], + ["\uFEA0", -1, 23], + ["\uFEA1", -1, 24], + ["\uFEA2", -1, 24], + ["\uFEA3", -1, 24], + ["\uFEA4", -1, 24], + ["\uFEA5", -1, 25], + ["\uFEA6", -1, 25], + ["\uFEA7", -1, 25], + ["\uFEA8", -1, 25], + ["\uFEA9", -1, 26], + ["\uFEAA", -1, 26], + ["\uFEAB", -1, 27], + ["\uFEAC", -1, 27], + ["\uFEAD", -1, 28], + ["\uFEAE", -1, 28], + ["\uFEAF", -1, 29], + ["\uFEB0", -1, 29], + ["\uFEB1", -1, 30], + ["\uFEB2", -1, 30], + ["\uFEB3", -1, 30], + ["\uFEB4", -1, 30], + ["\uFEB5", -1, 31], + ["\uFEB6", -1, 31], + ["\uFEB7", -1, 31], + ["\uFEB8", -1, 31], 
+ ["\uFEB9", -1, 32], + ["\uFEBA", -1, 32], + ["\uFEBB", -1, 32], + ["\uFEBC", -1, 32], + ["\uFEBD", -1, 33], + ["\uFEBE", -1, 33], + ["\uFEBF", -1, 33], + ["\uFEC0", -1, 33], + ["\uFEC1", -1, 34], + ["\uFEC2", -1, 34], + ["\uFEC3", -1, 34], + ["\uFEC4", -1, 34], + ["\uFEC5", -1, 35], + ["\uFEC6", -1, 35], + ["\uFEC7", -1, 35], + ["\uFEC8", -1, 35], + ["\uFEC9", -1, 36], + ["\uFECA", -1, 36], + ["\uFECB", -1, 36], + ["\uFECC", -1, 36], + ["\uFECD", -1, 37], + ["\uFECE", -1, 37], + ["\uFECF", -1, 37], + ["\uFED0", -1, 37], + ["\uFED1", -1, 38], + ["\uFED2", -1, 38], + ["\uFED3", -1, 38], + ["\uFED4", -1, 38], + ["\uFED5", -1, 39], + ["\uFED6", -1, 39], + ["\uFED7", -1, 39], + ["\uFED8", -1, 39], + ["\uFED9", -1, 40], + ["\uFEDA", -1, 40], + ["\uFEDB", -1, 40], + ["\uFEDC", -1, 40], + ["\uFEDD", -1, 41], + ["\uFEDE", -1, 41], + ["\uFEDF", -1, 41], + ["\uFEE0", -1, 41], + ["\uFEE1", -1, 42], + ["\uFEE2", -1, 42], + ["\uFEE3", -1, 42], + ["\uFEE4", -1, 42], + ["\uFEE5", -1, 43], + ["\uFEE6", -1, 43], + ["\uFEE7", -1, 43], + ["\uFEE8", -1, 43], + ["\uFEE9", -1, 44], + ["\uFEEA", -1, 44], + ["\uFEEB", -1, 44], + ["\uFEEC", -1, 44], + ["\uFEED", -1, 45], + ["\uFEEE", -1, 45], + ["\uFEEF", -1, 46], + ["\uFEF0", -1, 46], + ["\uFEF1", -1, 47], + ["\uFEF2", -1, 47], + ["\uFEF3", -1, 47], + ["\uFEF4", -1, 47], + ["\uFEF5", -1, 51], + ["\uFEF6", -1, 51], + ["\uFEF7", -1, 49], + ["\uFEF8", -1, 49], + ["\uFEF9", -1, 50], + ["\uFEFA", -1, 50], + ["\uFEFB", -1, 48], + ["\uFEFC", -1, 48] + ]; + + /** @const */ var a_1 = [ + ["\u0622", -1, 1], + ["\u0623", -1, 1], + ["\u0624", -1, 1], + ["\u0625", -1, 1], + ["\u0626", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["\u0622", -1, 1], + ["\u0623", -1, 1], + ["\u0624", -1, 2], + ["\u0625", -1, 1], + ["\u0626", -1, 3] + ]; + + /** @const */ var a_3 = [ + ["\u0627\u0644", -1, 2], + ["\u0628\u0627\u0644", -1, 1], + ["\u0643\u0627\u0644", -1, 1], + ["\u0644\u0644", -1, 2] + ]; + + /** @const */ var a_4 = [ + ["\u0623\u0622", -1, 2], + 
["\u0623\u0623", -1, 1], + ["\u0623\u0624", -1, 1], + ["\u0623\u0625", -1, 4], + ["\u0623\u0627", -1, 3] + ]; + + /** @const */ var a_5 = [ + ["\u0641", -1, 1], + ["\u0648", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["\u0627\u0644", -1, 2], + ["\u0628\u0627\u0644", -1, 1], + ["\u0643\u0627\u0644", -1, 1], + ["\u0644\u0644", -1, 2] + ]; + + /** @const */ var a_7 = [ + ["\u0628", -1, 1], + ["\u0628\u0627", 0, -1], + ["\u0628\u0628", 0, 2], + ["\u0643\u0643", -1, 3] + ]; + + /** @const */ var a_8 = [ + ["\u0633\u0623", -1, 4], + ["\u0633\u062A", -1, 2], + ["\u0633\u0646", -1, 3], + ["\u0633\u064A", -1, 1] + ]; + + /** @const */ var a_9 = [ + ["\u062A\u0633\u062A", -1, 1], + ["\u0646\u0633\u062A", -1, 1], + ["\u064A\u0633\u062A", -1, 1] + ]; + + /** @const */ var a_10 = [ + ["\u0643\u0645\u0627", -1, 3], + ["\u0647\u0645\u0627", -1, 3], + ["\u0646\u0627", -1, 2], + ["\u0647\u0627", -1, 2], + ["\u0643", -1, 1], + ["\u0643\u0645", -1, 2], + ["\u0647\u0645", -1, 2], + ["\u0647\u0646", -1, 2], + ["\u0647", -1, 1], + ["\u064A", -1, 1] + ]; + + /** @const */ var a_11 = [ + ["\u0646", -1, 1] + ]; + + /** @const */ var a_12 = [ + ["\u0627", -1, 1], + ["\u0648", -1, 1], + ["\u064A", -1, 1] + ]; + + /** @const */ var a_13 = [ + ["\u0627\u062A", -1, 1] + ]; + + /** @const */ var a_14 = [ + ["\u062A", -1, 1] + ]; + + /** @const */ var a_15 = [ + ["\u0629", -1, 1] + ]; + + /** @const */ var a_16 = [ + ["\u064A", -1, 1] + ]; + + /** @const */ var a_17 = [ + ["\u0643\u0645\u0627", -1, 3], + ["\u0647\u0645\u0627", -1, 3], + ["\u0646\u0627", -1, 2], + ["\u0647\u0627", -1, 2], + ["\u0643", -1, 1], + ["\u0643\u0645", -1, 2], + ["\u0647\u0645", -1, 2], + ["\u0643\u0646", -1, 2], + ["\u0647\u0646", -1, 2], + ["\u0647", -1, 1], + ["\u0643\u0645\u0648", -1, 3], + ["\u0646\u064A", -1, 2] + ]; + + /** @const */ var a_18 = [ + ["\u0627", -1, 1], + ["\u062A\u0627", 0, 2], + ["\u062A\u0645\u0627", 0, 4], + ["\u0646\u0627", 0, 2], + ["\u062A", -1, 1], + ["\u0646", -1, 1], + ["\u0627\u0646", 5, 
3], + ["\u062A\u0646", 5, 2], + ["\u0648\u0646", 5, 3], + ["\u064A\u0646", 5, 3], + ["\u064A", -1, 1] + ]; + + /** @const */ var a_19 = [ + ["\u0648\u0627", -1, 1], + ["\u062A\u0645", -1, 1] + ]; + + /** @const */ var a_20 = [ + ["\u0648", -1, 1], + ["\u062A\u0645\u0648", 0, 2] + ]; + + /** @const */ var a_21 = [ + ["\u0649", -1, 1] + ]; + + var /** boolean */ B_is_defined = false; + var /** boolean */ B_is_verb = false; + var /** boolean */ B_is_noun = false; + + + /** @return {boolean} */ + function r_Normalize_pre() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + lab2: { + var /** number */ v_3 = base.cursor; + lab3: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + if (among_var == 0) + { + break lab3; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("0")) + { + return false; + } + break; + case 3: + if (!base.slice_from("1")) + { + return false; + } + break; + case 4: + if (!base.slice_from("2")) + { + return false; + } + break; + case 5: + if (!base.slice_from("3")) + { + return false; + } + break; + case 6: + if (!base.slice_from("4")) + { + return false; + } + break; + case 7: + if (!base.slice_from("5")) + { + return false; + } + break; + case 8: + if (!base.slice_from("6")) + { + return false; + } + break; + case 9: + if (!base.slice_from("7")) + { + return false; + } + break; + case 10: + if (!base.slice_from("8")) + { + return false; + } + break; + case 11: + if (!base.slice_from("9")) + { + return false; + } + break; + case 12: + if (!base.slice_from("\u0621")) + { + return false; + } + break; + case 13: + if (!base.slice_from("\u0623")) + { + return false; + } + break; + case 14: + if (!base.slice_from("\u0625")) + { + return false; + } + break; + case 15: + if (!base.slice_from("\u0626")) + { + return false; + } + break; + case 
16: + if (!base.slice_from("\u0622")) + { + return false; + } + break; + case 17: + if (!base.slice_from("\u0624")) + { + return false; + } + break; + case 18: + if (!base.slice_from("\u0627")) + { + return false; + } + break; + case 19: + if (!base.slice_from("\u0628")) + { + return false; + } + break; + case 20: + if (!base.slice_from("\u0629")) + { + return false; + } + break; + case 21: + if (!base.slice_from("\u062A")) + { + return false; + } + break; + case 22: + if (!base.slice_from("\u062B")) + { + return false; + } + break; + case 23: + if (!base.slice_from("\u062C")) + { + return false; + } + break; + case 24: + if (!base.slice_from("\u062D")) + { + return false; + } + break; + case 25: + if (!base.slice_from("\u062E")) + { + return false; + } + break; + case 26: + if (!base.slice_from("\u062F")) + { + return false; + } + break; + case 27: + if (!base.slice_from("\u0630")) + { + return false; + } + break; + case 28: + if (!base.slice_from("\u0631")) + { + return false; + } + break; + case 29: + if (!base.slice_from("\u0632")) + { + return false; + } + break; + case 30: + if (!base.slice_from("\u0633")) + { + return false; + } + break; + case 31: + if (!base.slice_from("\u0634")) + { + return false; + } + break; + case 32: + if (!base.slice_from("\u0635")) + { + return false; + } + break; + case 33: + if (!base.slice_from("\u0636")) + { + return false; + } + break; + case 34: + if (!base.slice_from("\u0637")) + { + return false; + } + break; + case 35: + if (!base.slice_from("\u0638")) + { + return false; + } + break; + case 36: + if (!base.slice_from("\u0639")) + { + return false; + } + break; + case 37: + if (!base.slice_from("\u063A")) + { + return false; + } + break; + case 38: + if (!base.slice_from("\u0641")) + { + return false; + } + break; + case 39: + if (!base.slice_from("\u0642")) + { + return false; + } + break; + case 40: + if (!base.slice_from("\u0643")) + { + return false; + } + break; + case 41: + if (!base.slice_from("\u0644")) + { + 
return false; + } + break; + case 42: + if (!base.slice_from("\u0645")) + { + return false; + } + break; + case 43: + if (!base.slice_from("\u0646")) + { + return false; + } + break; + case 44: + if (!base.slice_from("\u0647")) + { + return false; + } + break; + case 45: + if (!base.slice_from("\u0648")) + { + return false; + } + break; + case 46: + if (!base.slice_from("\u0649")) + { + return false; + } + break; + case 47: + if (!base.slice_from("\u064A")) + { + return false; + } + break; + case 48: + if (!base.slice_from("\u0644\u0627")) + { + return false; + } + break; + case 49: + if (!base.slice_from("\u0644\u0623")) + { + return false; + } + break; + case 50: + if (!base.slice_from("\u0644\u0625")) + { + return false; + } + break; + case 51: + if (!base.slice_from("\u0644\u0622")) + { + return false; + } + break; + } + break lab2; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_Normalize_post() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + lab0: { + base.limit_backward = base.cursor; base.cursor = base.limit; + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_from("\u0621")) + { + return false; + } + base.cursor = base.limit_backward; + } + base.cursor = v_1; + var /** number */ v_2 = base.cursor; + lab1: { + while(true) + { + var /** number */ v_3 = base.cursor; + lab2: { + lab3: { + var /** number */ v_4 = base.cursor; + lab4: { + base.bra = base.cursor; + among_var = base.find_among(a_2); + if (among_var == 0) + { + break lab4; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u0627")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u0648")) + { + return false; + } + break; + case 3: + if 
(!base.slice_from("\u064A")) + { + return false; + } + break; + } + break lab3; + } + base.cursor = v_4; + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + continue; + } + base.cursor = v_3; + break; + } + } + base.cursor = v_2; + return true; + }; + + /** @return {boolean} */ + function r_Checks1() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_3); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (base.current.length <= 4) + { + return false; + } + B_is_noun = true; + B_is_verb = false; + B_is_defined = true; + break; + case 2: + if (base.current.length <= 3) + { + return false; + } + B_is_noun = true; + B_is_verb = false; + B_is_defined = true; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Prefix_Step1() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_4); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_from("\u0623")) + { + return false; + } + break; + case 2: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_from("\u0622")) + { + return false; + } + break; + case 3: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_from("\u0627")) + { + return false; + } + break; + case 4: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_from("\u0625")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Prefix_Step2() { + base.bra = base.cursor; + if (base.find_among(a_5) == 0) + { + return false; + } + base.ket = base.cursor; + if (base.current.length <= 3) + { + return false; + } + { + var /** number */ v_1 = base.cursor; + lab0: { + if (!(base.eq_s("\u0627"))) + { + break lab0; + } + return false; + } + base.cursor = v_1; + } + 
if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Prefix_Step3a_Noun() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_6); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (base.current.length <= 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (base.current.length <= 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Prefix_Step3b_Noun() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_7); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_from("\u0628")) + { + return false; + } + break; + case 3: + if (base.current.length <= 3) + { + return false; + } + if (!base.slice_from("\u0643")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Prefix_Step3_Verb() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_8); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (base.current.length <= 4) + { + return false; + } + if (!base.slice_from("\u064A")) + { + return false; + } + break; + case 2: + if (base.current.length <= 4) + { + return false; + } + if (!base.slice_from("\u062A")) + { + return false; + } + break; + case 3: + if (base.current.length <= 4) + { + return false; + } + if (!base.slice_from("\u0646")) + { + return false; + } + break; + case 4: + if (base.current.length <= 4) + { + return false; + } + if 
(!base.slice_from("\u0623")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Prefix_Step4_Verb() { + base.bra = base.cursor; + if (base.find_among(a_9) == 0) + { + return false; + } + base.ket = base.cursor; + if (base.current.length <= 4) + { + return false; + } + B_is_verb = true; + B_is_noun = false; + if (!base.slice_from("\u0627\u0633\u062A")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Noun_Step1a() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_10); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (base.current.length < 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (base.current.length < 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (base.current.length < 6) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Noun_Step1b() { + base.ket = base.cursor; + if (base.find_among_b(a_11) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length <= 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Noun_Step2a() { + base.ket = base.cursor; + if (base.find_among_b(a_12) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length <= 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Noun_Step2b() { + base.ket = base.cursor; + if (base.find_among_b(a_13) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length < 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; 
+ + /** @return {boolean} */ + function r_Suffix_Noun_Step2c1() { + base.ket = base.cursor; + if (base.find_among_b(a_14) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length < 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Noun_Step2c2() { + base.ket = base.cursor; + if (base.find_among_b(a_15) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length < 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Noun_Step3() { + base.ket = base.cursor; + if (base.find_among_b(a_16) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length < 3) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Verb_Step1() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_17); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (base.current.length < 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (base.current.length < 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (base.current.length < 6) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Verb_Step2a() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_18); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (base.current.length < 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (base.current.length < 5) + { + return false; + } + 
if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (base.current.length <= 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 4: + if (base.current.length < 6) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Verb_Step2b() { + base.ket = base.cursor; + if (base.find_among_b(a_19) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.current.length < 5) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_Verb_Step2c() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_20); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (base.current.length < 4) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (base.current.length < 6) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Suffix_All_alef_maqsura() { + base.ket = base.cursor; + if (base.find_among_b(a_21) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_from("\u064A")) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + B_is_noun = true; + B_is_verb = true; + B_is_defined = false; + var /** number */ v_1 = base.cursor; + r_Checks1(); + base.cursor = v_1; + r_Normalize_pre(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + if (!B_is_verb) + { + break lab2; + } + lab3: { + var /** number */ v_5 = base.limit - base.cursor; + lab4: { + { + var v_6 = 1; + while(true) + { + var /** number */ v_7 = 
base.limit - base.cursor; + lab5: { + if (!r_Suffix_Verb_Step1()) + { + break lab5; + } + v_6--; + continue; + } + base.cursor = base.limit - v_7; + break; + } + if (v_6 > 0) + { + break lab4; + } + } + lab6: { + var /** number */ v_8 = base.limit - base.cursor; + lab7: { + if (!r_Suffix_Verb_Step2a()) + { + break lab7; + } + break lab6; + } + base.cursor = base.limit - v_8; + lab8: { + if (!r_Suffix_Verb_Step2c()) + { + break lab8; + } + break lab6; + } + base.cursor = base.limit - v_8; + if (base.cursor <= base.limit_backward) + { + break lab4; + } + base.cursor--; + } + break lab3; + } + base.cursor = base.limit - v_5; + lab9: { + if (!r_Suffix_Verb_Step2b()) + { + break lab9; + } + break lab3; + } + base.cursor = base.limit - v_5; + if (!r_Suffix_Verb_Step2a()) + { + break lab2; + } + } + break lab1; + } + base.cursor = base.limit - v_4; + lab10: { + if (!B_is_noun) + { + break lab10; + } + var /** number */ v_9 = base.limit - base.cursor; + lab11: { + lab12: { + var /** number */ v_10 = base.limit - base.cursor; + lab13: { + if (!r_Suffix_Noun_Step2c2()) + { + break lab13; + } + break lab12; + } + base.cursor = base.limit - v_10; + lab14: { + lab15: { + if (!B_is_defined) + { + break lab15; + } + break lab14; + } + if (!r_Suffix_Noun_Step1a()) + { + break lab14; + } + lab16: { + var /** number */ v_12 = base.limit - base.cursor; + lab17: { + if (!r_Suffix_Noun_Step2a()) + { + break lab17; + } + break lab16; + } + base.cursor = base.limit - v_12; + lab18: { + if (!r_Suffix_Noun_Step2b()) + { + break lab18; + } + break lab16; + } + base.cursor = base.limit - v_12; + lab19: { + if (!r_Suffix_Noun_Step2c1()) + { + break lab19; + } + break lab16; + } + base.cursor = base.limit - v_12; + if (base.cursor <= base.limit_backward) + { + break lab14; + } + base.cursor--; + } + break lab12; + } + base.cursor = base.limit - v_10; + lab20: { + if (!r_Suffix_Noun_Step1b()) + { + break lab20; + } + lab21: { + var /** number */ v_13 = base.limit - base.cursor; + lab22: { + if 
(!r_Suffix_Noun_Step2a()) + { + break lab22; + } + break lab21; + } + base.cursor = base.limit - v_13; + lab23: { + if (!r_Suffix_Noun_Step2b()) + { + break lab23; + } + break lab21; + } + base.cursor = base.limit - v_13; + if (!r_Suffix_Noun_Step2c1()) + { + break lab20; + } + } + break lab12; + } + base.cursor = base.limit - v_10; + lab24: { + lab25: { + if (!B_is_defined) + { + break lab25; + } + break lab24; + } + if (!r_Suffix_Noun_Step2a()) + { + break lab24; + } + break lab12; + } + base.cursor = base.limit - v_10; + if (!r_Suffix_Noun_Step2b()) + { + base.cursor = base.limit - v_9; + break lab11; + } + } + } + if (!r_Suffix_Noun_Step3()) + { + break lab10; + } + break lab1; + } + base.cursor = base.limit - v_4; + if (!r_Suffix_All_alef_maqsura()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_3; + base.cursor = base.limit_backward; + var /** number */ v_15 = base.cursor; + lab26: { + var /** number */ v_16 = base.cursor; + lab27: { + if (!r_Prefix_Step1()) + { + base.cursor = v_16; + break lab27; + } + } + var /** number */ v_17 = base.cursor; + lab28: { + if (!r_Prefix_Step2()) + { + base.cursor = v_17; + break lab28; + } + } + lab29: { + var /** number */ v_18 = base.cursor; + lab30: { + if (!r_Prefix_Step3a_Noun()) + { + break lab30; + } + break lab29; + } + base.cursor = v_18; + lab31: { + if (!B_is_noun) + { + break lab31; + } + if (!r_Prefix_Step3b_Noun()) + { + break lab31; + } + break lab29; + } + base.cursor = v_18; + if (!B_is_verb) + { + break lab26; + } + var /** number */ v_19 = base.cursor; + lab32: { + if (!r_Prefix_Step3_Verb()) + { + base.cursor = v_19; + break lab32; + } + } + if (!r_Prefix_Step4_Verb()) + { + break lab26; + } + } + } + base.cursor = v_15; + r_Normalize_post(); + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['ArabicStemmer'] = ArabicStemmer; diff --git a/js/armenian-stemmer.js 
b/js/armenian-stemmer.js new file mode 100644 index 0000000..669e63c --- /dev/null +++ b/js/armenian-stemmer.js @@ -0,0 +1,391 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var ArmenianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0580\u0578\u0580\u0564", -1, 1], + ["\u0565\u0580\u0578\u0580\u0564", 0, 1], + ["\u0561\u056C\u056B", -1, 1], + ["\u0561\u056F\u056B", -1, 1], + ["\u0578\u0580\u0561\u056F", -1, 1], + ["\u0565\u0572", -1, 1], + ["\u0561\u056F\u0561\u0576", -1, 1], + ["\u0561\u0580\u0561\u0576", -1, 1], + ["\u0565\u0576", -1, 1], + ["\u0565\u056F\u0565\u0576", 8, 1], + ["\u0565\u0580\u0565\u0576", 8, 1], + ["\u0578\u0580\u0567\u0576", -1, 1], + ["\u056B\u0576", -1, 1], + ["\u0563\u056B\u0576", 12, 1], + ["\u0578\u057E\u056B\u0576", 12, 1], + ["\u056C\u0561\u0575\u0576", -1, 1], + ["\u057E\u0578\u0582\u0576", -1, 1], + ["\u057A\u0565\u057D", -1, 1], + ["\u056B\u057E", -1, 1], + ["\u0561\u057F", -1, 1], + ["\u0561\u057E\u0565\u057F", -1, 1], + ["\u056F\u0578\u057F", -1, 1], + ["\u0562\u0561\u0580", -1, 1] + ]; + + /** @const */ var a_1 = [ + ["\u0561", -1, 1], + ["\u0561\u0581\u0561", 0, 1], + ["\u0565\u0581\u0561", 0, 1], + ["\u057E\u0565", -1, 1], + ["\u0561\u0581\u0580\u056B", -1, 1], + ["\u0561\u0581\u056B", -1, 1], + ["\u0565\u0581\u056B", -1, 1], + ["\u057E\u0565\u0581\u056B", 6, 1], + ["\u0561\u056C", -1, 1], + ["\u0568\u0561\u056C", 8, 1], + ["\u0561\u0576\u0561\u056C", 8, 1], + ["\u0565\u0576\u0561\u056C", 8, 1], + ["\u0561\u0581\u0576\u0561\u056C", 8, 1], + ["\u0565\u056C", -1, 1], + ["\u0568\u0565\u056C", 13, 1], + ["\u0576\u0565\u056C", 13, 1], + ["\u0581\u0576\u0565\u056C", 15, 1], + ["\u0565\u0581\u0576\u0565\u056C", 16, 1], + ["\u0579\u0565\u056C", 13, 1], + ["\u057E\u0565\u056C", 13, 1], + ["\u0561\u0581\u057E\u0565\u056C", 19, 1], + ["\u0565\u0581\u057E\u0565\u056C", 19, 1], + ["\u057F\u0565\u056C", 13, 1], + ["\u0561\u057F\u0565\u056C", 22, 1], + 
["\u0578\u057F\u0565\u056C", 22, 1], + ["\u056F\u0578\u057F\u0565\u056C", 24, 1], + ["\u057E\u0561\u056E", -1, 1], + ["\u0578\u0582\u0574", -1, 1], + ["\u057E\u0578\u0582\u0574", 27, 1], + ["\u0561\u0576", -1, 1], + ["\u0581\u0561\u0576", 29, 1], + ["\u0561\u0581\u0561\u0576", 30, 1], + ["\u0561\u0581\u0580\u056B\u0576", -1, 1], + ["\u0561\u0581\u056B\u0576", -1, 1], + ["\u0565\u0581\u056B\u0576", -1, 1], + ["\u057E\u0565\u0581\u056B\u0576", 34, 1], + ["\u0561\u056C\u056B\u057D", -1, 1], + ["\u0565\u056C\u056B\u057D", -1, 1], + ["\u0561\u057E", -1, 1], + ["\u0561\u0581\u0561\u057E", 38, 1], + ["\u0565\u0581\u0561\u057E", 38, 1], + ["\u0561\u056C\u0578\u057E", -1, 1], + ["\u0565\u056C\u0578\u057E", -1, 1], + ["\u0561\u0580", -1, 1], + ["\u0561\u0581\u0561\u0580", 43, 1], + ["\u0565\u0581\u0561\u0580", 43, 1], + ["\u0561\u0581\u0580\u056B\u0580", -1, 1], + ["\u0561\u0581\u056B\u0580", -1, 1], + ["\u0565\u0581\u056B\u0580", -1, 1], + ["\u057E\u0565\u0581\u056B\u0580", 48, 1], + ["\u0561\u0581", -1, 1], + ["\u0565\u0581", -1, 1], + ["\u0561\u0581\u0580\u0565\u0581", 51, 1], + ["\u0561\u056C\u0578\u0582\u0581", -1, 1], + ["\u0565\u056C\u0578\u0582\u0581", -1, 1], + ["\u0561\u056C\u0578\u0582", -1, 1], + ["\u0565\u056C\u0578\u0582", -1, 1], + ["\u0561\u0584", -1, 1], + ["\u0581\u0561\u0584", 57, 1], + ["\u0561\u0581\u0561\u0584", 58, 1], + ["\u0561\u0581\u0580\u056B\u0584", -1, 1], + ["\u0561\u0581\u056B\u0584", -1, 1], + ["\u0565\u0581\u056B\u0584", -1, 1], + ["\u057E\u0565\u0581\u056B\u0584", 62, 1], + ["\u0561\u0576\u0584", -1, 1], + ["\u0581\u0561\u0576\u0584", 64, 1], + ["\u0561\u0581\u0561\u0576\u0584", 65, 1], + ["\u0561\u0581\u0580\u056B\u0576\u0584", -1, 1], + ["\u0561\u0581\u056B\u0576\u0584", -1, 1], + ["\u0565\u0581\u056B\u0576\u0584", -1, 1], + ["\u057E\u0565\u0581\u056B\u0576\u0584", 69, 1] + ]; + + /** @const */ var a_2 = [ + ["\u0578\u0580\u0564", -1, 1], + ["\u0578\u0582\u0575\u0569", -1, 1], + ["\u0578\u0582\u0570\u056B", -1, 1], + ["\u0581\u056B", -1, 
1], + ["\u056B\u056C", -1, 1], + ["\u0561\u056F", -1, 1], + ["\u0575\u0561\u056F", 5, 1], + ["\u0561\u0576\u0561\u056F", 5, 1], + ["\u056B\u056F", -1, 1], + ["\u0578\u0582\u056F", -1, 1], + ["\u0561\u0576", -1, 1], + ["\u057A\u0561\u0576", 10, 1], + ["\u057D\u057F\u0561\u0576", 10, 1], + ["\u0561\u0580\u0561\u0576", 10, 1], + ["\u0565\u0572\u0567\u0576", -1, 1], + ["\u0575\u0578\u0582\u0576", -1, 1], + ["\u0578\u0582\u0569\u0575\u0578\u0582\u0576", 15, 1], + ["\u0561\u056E\u0578", -1, 1], + ["\u056B\u0579", -1, 1], + ["\u0578\u0582\u057D", -1, 1], + ["\u0578\u0582\u057D\u057F", -1, 1], + ["\u0563\u0561\u0580", -1, 1], + ["\u057E\u0578\u0580", -1, 1], + ["\u0561\u057E\u0578\u0580", 22, 1], + ["\u0578\u0581", -1, 1], + ["\u0561\u0576\u0585\u0581", -1, 1], + ["\u0578\u0582", -1, 1], + ["\u0584", -1, 1], + ["\u0579\u0565\u0584", 27, 1], + ["\u056B\u0584", 27, 1], + ["\u0561\u056C\u056B\u0584", 29, 1], + ["\u0561\u0576\u056B\u0584", 29, 1], + ["\u057E\u0561\u056E\u0584", 27, 1], + ["\u0578\u0582\u0575\u0584", 27, 1], + ["\u0565\u0576\u0584", 27, 1], + ["\u0578\u0576\u0584", 27, 1], + ["\u0578\u0582\u0576\u0584", 27, 1], + ["\u0574\u0578\u0582\u0576\u0584", 36, 1], + ["\u056B\u0579\u0584", 27, 1], + ["\u0561\u0580\u0584", 27, 1] + ]; + + /** @const */ var a_3 = [ + ["\u057D\u0561", -1, 1], + ["\u057E\u0561", -1, 1], + ["\u0561\u0574\u0562", -1, 1], + ["\u0564", -1, 1], + ["\u0561\u0576\u0564", 3, 1], + ["\u0578\u0582\u0569\u0575\u0561\u0576\u0564", 4, 1], + ["\u057E\u0561\u0576\u0564", 4, 1], + ["\u0578\u057B\u0564", 3, 1], + ["\u0565\u0580\u0564", 3, 1], + ["\u0576\u0565\u0580\u0564", 8, 1], + ["\u0578\u0582\u0564", 3, 1], + ["\u0568", -1, 1], + ["\u0561\u0576\u0568", 11, 1], + ["\u0578\u0582\u0569\u0575\u0561\u0576\u0568", 12, 1], + ["\u057E\u0561\u0576\u0568", 12, 1], + ["\u0578\u057B\u0568", 11, 1], + ["\u0565\u0580\u0568", 11, 1], + ["\u0576\u0565\u0580\u0568", 16, 1], + ["\u056B", -1, 1], + ["\u057E\u056B", 18, 1], + ["\u0565\u0580\u056B", 18, 1], + 
["\u0576\u0565\u0580\u056B", 20, 1], + ["\u0561\u0576\u0578\u0582\u0574", -1, 1], + ["\u0565\u0580\u0578\u0582\u0574", -1, 1], + ["\u0576\u0565\u0580\u0578\u0582\u0574", 23, 1], + ["\u0576", -1, 1], + ["\u0561\u0576", 25, 1], + ["\u0578\u0582\u0569\u0575\u0561\u0576", 26, 1], + ["\u057E\u0561\u0576", 26, 1], + ["\u056B\u0576", 25, 1], + ["\u0565\u0580\u056B\u0576", 29, 1], + ["\u0576\u0565\u0580\u056B\u0576", 30, 1], + ["\u0578\u0582\u0569\u0575\u0561\u0576\u0576", 25, 1], + ["\u0565\u0580\u0576", 25, 1], + ["\u0576\u0565\u0580\u0576", 33, 1], + ["\u0578\u0582\u0576", 25, 1], + ["\u0578\u057B", -1, 1], + ["\u0578\u0582\u0569\u0575\u0561\u0576\u057D", -1, 1], + ["\u057E\u0561\u0576\u057D", -1, 1], + ["\u0578\u057B\u057D", -1, 1], + ["\u0578\u057E", -1, 1], + ["\u0561\u0576\u0578\u057E", 40, 1], + ["\u057E\u0578\u057E", 40, 1], + ["\u0565\u0580\u0578\u057E", 40, 1], + ["\u0576\u0565\u0580\u0578\u057E", 43, 1], + ["\u0565\u0580", -1, 1], + ["\u0576\u0565\u0580", 45, 1], + ["\u0581", -1, 1], + ["\u056B\u0581", 47, 1], + ["\u057E\u0561\u0576\u056B\u0581", 48, 1], + ["\u0578\u057B\u056B\u0581", 48, 1], + ["\u057E\u056B\u0581", 48, 1], + ["\u0565\u0580\u056B\u0581", 48, 1], + ["\u0576\u0565\u0580\u056B\u0581", 52, 1], + ["\u0581\u056B\u0581", 48, 1], + ["\u0578\u0581", 47, 1], + ["\u0578\u0582\u0581", 47, 1] + ]; + + /** @const */ var /** Array */ g_v = [209, 4, 128, 0, 18]; + + var /** number */ I_p2 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + lab2: { + if (!(base.in_grouping(g_v, 1377, 1413))) + { + break lab2; + } + break golab1; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_pV = base.cursor; + golab3: while(true) + { + lab4: { + if (!(base.out_grouping(g_v, 1377, 1413))) + { + break lab4; + } + break golab3; + } + if (base.cursor >= base.limit) + { + break lab0; 
+ } + base.cursor++; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 1377, 1413))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 1377, 1413))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_adjective() { + base.ket = base.cursor; + if (base.find_among_b(a_0) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_verb() { + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_noun() { + base.ket = base.cursor; + if (base.find_among_b(a_2) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_ending() { + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_3 = base.limit_backward; + base.limit_backward = I_pV; + var /** number */ v_4 = base.limit - base.cursor; + r_ending(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_verb(); + base.cursor = base.limit - 
v_5; + var /** number */ v_6 = base.limit - base.cursor; + r_adjective(); + base.cursor = base.limit - v_6; + var /** number */ v_7 = base.limit - base.cursor; + r_noun(); + base.cursor = base.limit - v_7; + base.limit_backward = v_3; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['ArmenianStemmer'] = ArmenianStemmer; diff --git a/js/base-stemmer.js b/js/base-stemmer.js new file mode 100644 index 0000000..930078b --- /dev/null +++ b/js/base-stemmer.js @@ -0,0 +1,296 @@ +/**@constructor*/ +const BaseStemmer = function() { + this.setCurrent = function(value) { + this.current = value; + this.cursor = 0; + this.limit = this.current.length; + this.limit_backward = 0; + this.bra = this.cursor; + this.ket = this.limit; + }; + + this.getCurrent = function() { + return this.current; + }; + + this.copy_from = function(other) { + this.current = other.current; + this.cursor = other.cursor; + this.limit = other.limit; + this.limit_backward = other.limit_backward; + this.bra = other.bra; + this.ket = other.ket; + }; + + this.in_grouping = function(s, min, max) { + if (this.cursor >= this.limit) return false; + var ch = this.current.charCodeAt(this.cursor); + if (ch > max || ch < min) return false; + ch -= min; + if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; + this.cursor++; + return true; + }; + + this.in_grouping_b = function(s, min, max) { + if (this.cursor <= this.limit_backward) return false; + var ch = this.current.charCodeAt(this.cursor - 1); + if (ch > max || ch < min) return false; + ch -= min; + if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; + this.cursor--; + return true; + }; + + this.out_grouping = function(s, min, max) { + if (this.cursor >= this.limit) return false; + var ch = this.current.charCodeAt(this.cursor); + if (ch > max || ch < min) { + this.cursor++; + return 
true; + } + ch -= min; + if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) { + this.cursor++; + return true; + } + return false; + }; + + this.out_grouping_b = function(s, min, max) { + if (this.cursor <= this.limit_backward) return false; + var ch = this.current.charCodeAt(this.cursor - 1); + if (ch > max || ch < min) { + this.cursor--; + return true; + } + ch -= min; + if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) { + this.cursor--; + return true; + } + return false; + }; + + this.eq_s = function(s) + { + if (this.limit - this.cursor < s.length) return false; + if (this.current.slice(this.cursor, this.cursor + s.length) != s) + { + return false; + } + this.cursor += s.length; + return true; + }; + + this.eq_s_b = function(s) + { + if (this.cursor - this.limit_backward < s.length) return false; + if (this.current.slice(this.cursor - s.length, this.cursor) != s) + { + return false; + } + this.cursor -= s.length; + return true; + }; + + /** @return {number} */ this.find_among = function(v) + { + var i = 0; + var j = v.length; + + var c = this.cursor; + var l = this.limit; + + var common_i = 0; + var common_j = 0; + + var first_key_inspected = false; + + while (true) + { + var k = i + ((j - i) >>> 1); + var diff = 0; + var common = common_i < common_j ? common_i : common_j; // smaller + // w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional) + var w = v[k]; + var i2; + for (i2 = common; i2 < w[0].length; i2++) + { + if (c + common == l) + { + diff = -1; + break; + } + diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2); + if (diff != 0) break; + common++; + } + if (diff < 0) + { + j = k; + common_j = common; + } + else + { + i = k; + common_i = common; + } + if (j - i <= 1) + { + if (i > 0) break; // v->s has been inspected + if (j == i) break; // only one item in v + + // - but now we need to go round once more to get + // v->s inspected. This looks messy, but is actually + // the optimal approach. 
+ + if (first_key_inspected) break; + first_key_inspected = true; + } + } + do { + var w = v[i]; + if (common_i >= w[0].length) + { + this.cursor = c + w[0].length; + if (w.length < 4) return w[2]; + var res = w[3](this); + this.cursor = c + w[0].length; + if (res) return w[2]; + } + i = w[1]; + } while (i >= 0); + return 0; + }; + + // find_among_b is for backwards processing. Same comments apply + this.find_among_b = function(v) + { + var i = 0; + var j = v.length + + var c = this.cursor; + var lb = this.limit_backward; + + var common_i = 0; + var common_j = 0; + + var first_key_inspected = false; + + while (true) + { + var k = i + ((j - i) >> 1); + var diff = 0; + var common = common_i < common_j ? common_i : common_j; + var w = v[k]; + var i2; + for (i2 = w[0].length - 1 - common; i2 >= 0; i2--) + { + if (c - common == lb) + { + diff = -1; + break; + } + diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2); + if (diff != 0) break; + common++; + } + if (diff < 0) + { + j = k; + common_j = common; + } + else + { + i = k; + common_i = common; + } + if (j - i <= 1) + { + if (i > 0) break; + if (j == i) break; + if (first_key_inspected) break; + first_key_inspected = true; + } + } + do { + var w = v[i]; + if (common_i >= w[0].length) + { + this.cursor = c - w[0].length; + if (w.length < 4) return w[2]; + var res = w[3](this); + this.cursor = c - w[0].length; + if (res) return w[2]; + } + i = w[1]; + } while (i >= 0); + return 0; + }; + + /* to replace chars between c_bra and c_ket in this.current by the + * chars in s. 
+ */ + this.replace_s = function(c_bra, c_ket, s) + { + var adjustment = s.length - (c_ket - c_bra); + this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket); + this.limit += adjustment; + if (this.cursor >= c_ket) this.cursor += adjustment; + else if (this.cursor > c_bra) this.cursor = c_bra; + return adjustment; + }; + + this.slice_check = function() + { + if (this.bra < 0 || + this.bra > this.ket || + this.ket > this.limit || + this.limit > this.current.length) + { + return false; + } + return true; + }; + + this.slice_from = function(s) + { + var result = false; + if (this.slice_check()) + { + this.replace_s(this.bra, this.ket, s); + result = true; + } + return result; + }; + + this.slice_del = function() + { + return this.slice_from(""); + }; + + this.insert = function(c_bra, c_ket, s) + { + var adjustment = this.replace_s(c_bra, c_ket, s); + if (c_bra <= this.bra) this.bra += adjustment; + if (c_bra <= this.ket) this.ket += adjustment; + }; + + this.slice_to = function() + { + var result = ''; + if (this.slice_check()) + { + result = this.current.slice(this.bra, this.ket); + } + return result; + }; + + this.assign_to = function() + { + return this.current.slice(0, this.limit); + }; +}; + +window['BaseStemmer'] = BaseStemmer; diff --git a/js/basque-stemmer.js b/js/basque-stemmer.js new file mode 100644 index 0000000..dbd9e76 --- /dev/null +++ b/js/basque-stemmer.js @@ -0,0 +1,849 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var BasqueStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["idea", -1, 1], + ["bidea", 0, 1], + ["kidea", 0, 1], + ["pidea", 0, 1], + ["kundea", -1, 1], + ["galea", -1, 1], + ["tailea", -1, 1], + ["tzailea", -1, 1], + ["gunea", -1, 1], + ["kunea", -1, 1], + ["tzaga", -1, 1], + ["gaia", -1, 1], + ["aldia", -1, 1], + ["taldia", 12, 1], + ["karia", -1, 1], + ["garria", -1, 2], + ["karria", -1, 1], + ["ka", -1, 1], + ["tzaka", 17, 1], + ["la", -1, 
1], + ["mena", -1, 1], + ["pena", -1, 1], + ["kina", -1, 1], + ["ezina", -1, 1], + ["tezina", 23, 1], + ["kuna", -1, 1], + ["tuna", -1, 1], + ["kizuna", -1, 1], + ["era", -1, 1], + ["bera", 28, 1], + ["arabera", 29, 4], + ["kera", 28, 1], + ["pera", 28, 1], + ["orra", -1, 1], + ["korra", 33, 1], + ["dura", -1, 1], + ["gura", -1, 1], + ["kura", -1, 1], + ["tura", -1, 1], + ["eta", -1, 1], + ["keta", 39, 1], + ["gailua", -1, 1], + ["eza", -1, 1], + ["erreza", 42, 1], + ["tza", -1, 2], + ["gaitza", 44, 1], + ["kaitza", 44, 1], + ["kuntza", 44, 1], + ["ide", -1, 1], + ["bide", 48, 1], + ["kide", 48, 1], + ["pide", 48, 1], + ["kunde", -1, 1], + ["tzake", -1, 1], + ["tzeke", -1, 1], + ["le", -1, 1], + ["gale", 55, 1], + ["taile", 55, 1], + ["tzaile", 55, 1], + ["gune", -1, 1], + ["kune", -1, 1], + ["tze", -1, 1], + ["atze", 61, 1], + ["gai", -1, 1], + ["aldi", -1, 1], + ["taldi", 64, 1], + ["ki", -1, 1], + ["ari", -1, 1], + ["kari", 67, 1], + ["lari", 67, 1], + ["tari", 67, 1], + ["etari", 70, 1], + ["garri", -1, 2], + ["karri", -1, 1], + ["arazi", -1, 1], + ["tarazi", 74, 1], + ["an", -1, 1], + ["ean", 76, 1], + ["rean", 77, 1], + ["kan", 76, 1], + ["etan", 76, 1], + ["atseden", -1, 3], + ["men", -1, 1], + ["pen", -1, 1], + ["kin", -1, 1], + ["rekin", 84, 1], + ["ezin", -1, 1], + ["tezin", 86, 1], + ["tun", -1, 1], + ["kizun", -1, 1], + ["go", -1, 1], + ["ago", 90, 1], + ["tio", -1, 1], + ["dako", -1, 1], + ["or", -1, 1], + ["kor", 94, 1], + ["tzat", -1, 1], + ["du", -1, 1], + ["gailu", -1, 1], + ["tu", -1, 1], + ["atu", 99, 1], + ["aldatu", 100, 1], + ["tatu", 100, 1], + ["baditu", 99, 5], + ["ez", -1, 1], + ["errez", 104, 1], + ["tzez", 104, 1], + ["gaitz", -1, 1], + ["kaitz", -1, 1] + ]; + + /** @const */ var a_1 = [ + ["ada", -1, 1], + ["kada", 0, 1], + ["anda", -1, 1], + ["denda", -1, 1], + ["gabea", -1, 1], + ["kabea", -1, 1], + ["aldea", -1, 1], + ["kaldea", 6, 1], + ["taldea", 6, 1], + ["ordea", -1, 1], + ["zalea", -1, 1], + ["tzalea", 10, 1], + ["gilea", -1, 
1], + ["emea", -1, 1], + ["kumea", -1, 1], + ["nea", -1, 1], + ["enea", 15, 1], + ["zionea", 15, 1], + ["unea", 15, 1], + ["gunea", 18, 1], + ["pea", -1, 1], + ["aurrea", -1, 1], + ["tea", -1, 1], + ["kotea", 22, 1], + ["artea", 22, 1], + ["ostea", 22, 1], + ["etxea", -1, 1], + ["ga", -1, 1], + ["anga", 27, 1], + ["gaia", -1, 1], + ["aldia", -1, 1], + ["taldia", 30, 1], + ["handia", -1, 1], + ["mendia", -1, 1], + ["geia", -1, 1], + ["egia", -1, 1], + ["degia", 35, 1], + ["tegia", 35, 1], + ["nahia", -1, 1], + ["ohia", -1, 1], + ["kia", -1, 1], + ["tokia", 40, 1], + ["oia", -1, 1], + ["koia", 42, 1], + ["aria", -1, 1], + ["karia", 44, 1], + ["laria", 44, 1], + ["taria", 44, 1], + ["eria", -1, 1], + ["keria", 48, 1], + ["teria", 48, 1], + ["garria", -1, 2], + ["larria", -1, 1], + ["kirria", -1, 1], + ["duria", -1, 1], + ["asia", -1, 1], + ["tia", -1, 1], + ["ezia", -1, 1], + ["bizia", -1, 1], + ["ontzia", -1, 1], + ["ka", -1, 1], + ["joka", 60, 3], + ["aurka", 60, 10], + ["ska", 60, 1], + ["xka", 60, 1], + ["zka", 60, 1], + ["gibela", -1, 1], + ["gela", -1, 1], + ["kaila", -1, 1], + ["skila", -1, 1], + ["tila", -1, 1], + ["ola", -1, 1], + ["na", -1, 1], + ["kana", 72, 1], + ["ena", 72, 1], + ["garrena", 74, 1], + ["gerrena", 74, 1], + ["urrena", 74, 1], + ["zaina", 72, 1], + ["tzaina", 78, 1], + ["kina", 72, 1], + ["mina", 72, 1], + ["garna", 72, 1], + ["una", 72, 1], + ["duna", 83, 1], + ["asuna", 83, 1], + ["tasuna", 85, 1], + ["ondoa", -1, 1], + ["kondoa", 87, 1], + ["ngoa", -1, 1], + ["zioa", -1, 1], + ["koa", -1, 1], + ["takoa", 91, 1], + ["zkoa", 91, 1], + ["noa", -1, 1], + ["zinoa", 94, 1], + ["aroa", -1, 1], + ["taroa", 96, 1], + ["zaroa", 96, 1], + ["eroa", -1, 1], + ["oroa", -1, 1], + ["osoa", -1, 1], + ["toa", -1, 1], + ["ttoa", 102, 1], + ["ztoa", 102, 1], + ["txoa", -1, 1], + ["tzoa", -1, 1], + ["\u00F1oa", -1, 1], + ["ra", -1, 1], + ["ara", 108, 1], + ["dara", 109, 1], + ["liara", 109, 1], + ["tiara", 109, 1], + ["tara", 109, 1], + ["etara", 113, 1], + 
["tzara", 109, 1], + ["bera", 108, 1], + ["kera", 108, 1], + ["pera", 108, 1], + ["ora", 108, 2], + ["tzarra", 108, 1], + ["korra", 108, 1], + ["tra", 108, 1], + ["sa", -1, 1], + ["osa", 123, 1], + ["ta", -1, 1], + ["eta", 125, 1], + ["keta", 126, 1], + ["sta", 125, 1], + ["dua", -1, 1], + ["mendua", 129, 1], + ["ordua", 129, 1], + ["lekua", -1, 1], + ["burua", -1, 1], + ["durua", -1, 1], + ["tsua", -1, 1], + ["tua", -1, 1], + ["mentua", 136, 1], + ["estua", 136, 1], + ["txua", -1, 1], + ["zua", -1, 1], + ["tzua", 140, 1], + ["za", -1, 1], + ["eza", 142, 1], + ["eroza", 142, 1], + ["tza", 142, 2], + ["koitza", 145, 1], + ["antza", 145, 1], + ["gintza", 145, 1], + ["kintza", 145, 1], + ["kuntza", 145, 1], + ["gabe", -1, 1], + ["kabe", -1, 1], + ["kide", -1, 1], + ["alde", -1, 1], + ["kalde", 154, 1], + ["talde", 154, 1], + ["orde", -1, 1], + ["ge", -1, 1], + ["zale", -1, 1], + ["tzale", 159, 1], + ["gile", -1, 1], + ["eme", -1, 1], + ["kume", -1, 1], + ["ne", -1, 1], + ["zione", 164, 1], + ["une", 164, 1], + ["gune", 166, 1], + ["pe", -1, 1], + ["aurre", -1, 1], + ["te", -1, 1], + ["kote", 170, 1], + ["arte", 170, 1], + ["oste", 170, 1], + ["etxe", -1, 1], + ["gai", -1, 1], + ["di", -1, 1], + ["aldi", 176, 1], + ["taldi", 177, 1], + ["geldi", 176, 8], + ["handi", 176, 1], + ["mendi", 176, 1], + ["gei", -1, 1], + ["egi", -1, 1], + ["degi", 183, 1], + ["tegi", 183, 1], + ["nahi", -1, 1], + ["ohi", -1, 1], + ["ki", -1, 1], + ["toki", 188, 1], + ["oi", -1, 1], + ["goi", 190, 1], + ["koi", 190, 1], + ["ari", -1, 1], + ["kari", 193, 1], + ["lari", 193, 1], + ["tari", 193, 1], + ["garri", -1, 2], + ["larri", -1, 1], + ["kirri", -1, 1], + ["duri", -1, 1], + ["asi", -1, 1], + ["ti", -1, 1], + ["ontzi", -1, 1], + ["\u00F1i", -1, 1], + ["ak", -1, 1], + ["ek", -1, 1], + ["tarik", -1, 1], + ["gibel", -1, 1], + ["ail", -1, 1], + ["kail", 209, 1], + ["kan", -1, 1], + ["tan", -1, 1], + ["etan", 212, 1], + ["en", -1, 4], + ["ren", 214, 2], + ["garren", 215, 1], + ["gerren", 215, 1], 
+ ["urren", 215, 1], + ["ten", 214, 4], + ["tzen", 214, 4], + ["zain", -1, 1], + ["tzain", 221, 1], + ["kin", -1, 1], + ["min", -1, 1], + ["dun", -1, 1], + ["asun", -1, 1], + ["tasun", 226, 1], + ["aizun", -1, 1], + ["ondo", -1, 1], + ["kondo", 229, 1], + ["go", -1, 1], + ["ngo", 231, 1], + ["zio", -1, 1], + ["ko", -1, 1], + ["trako", 234, 5], + ["tako", 234, 1], + ["etako", 236, 1], + ["eko", 234, 1], + ["tariko", 234, 1], + ["sko", 234, 1], + ["tuko", 234, 1], + ["minutuko", 241, 6], + ["zko", 234, 1], + ["no", -1, 1], + ["zino", 244, 1], + ["ro", -1, 1], + ["aro", 246, 1], + ["igaro", 247, 9], + ["taro", 247, 1], + ["zaro", 247, 1], + ["ero", 246, 1], + ["giro", 246, 1], + ["oro", 246, 1], + ["oso", -1, 1], + ["to", -1, 1], + ["tto", 255, 1], + ["zto", 255, 1], + ["txo", -1, 1], + ["tzo", -1, 1], + ["gintzo", 259, 1], + ["\u00F1o", -1, 1], + ["zp", -1, 1], + ["ar", -1, 1], + ["dar", 263, 1], + ["behar", 263, 1], + ["zehar", 263, 7], + ["liar", 263, 1], + ["tiar", 263, 1], + ["tar", 263, 1], + ["tzar", 263, 1], + ["or", -1, 2], + ["kor", 271, 1], + ["os", -1, 1], + ["ket", -1, 1], + ["du", -1, 1], + ["mendu", 275, 1], + ["ordu", 275, 1], + ["leku", -1, 1], + ["buru", -1, 2], + ["duru", -1, 1], + ["tsu", -1, 1], + ["tu", -1, 1], + ["tatu", 282, 4], + ["mentu", 282, 1], + ["estu", 282, 1], + ["txu", -1, 1], + ["zu", -1, 1], + ["tzu", 287, 1], + ["gintzu", 288, 1], + ["z", -1, 1], + ["ez", 290, 1], + ["eroz", 290, 1], + ["tz", 290, 1], + ["koitz", 293, 1] + ]; + + /** @const */ var a_2 = [ + ["zlea", -1, 2], + ["keria", -1, 1], + ["la", -1, 1], + ["era", -1, 1], + ["dade", -1, 1], + ["tade", -1, 1], + ["date", -1, 1], + ["tate", -1, 1], + ["gi", -1, 1], + ["ki", -1, 1], + ["ik", -1, 1], + ["lanik", 10, 1], + ["rik", 10, 1], + ["larik", 12, 1], + ["ztik", 10, 1], + ["go", -1, 1], + ["ro", -1, 1], + ["ero", 16, 1], + ["to", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number 
*/ I_pV = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab2; + } + lab3: { + var /** number */ v_3 = base.cursor; + lab4: { + if (!(base.out_grouping(g_v, 97, 117))) + { + break lab4; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab4; + } + base.cursor++; + } + break lab3; + } + base.cursor = v_3; + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab2; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 117))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + } + break lab1; + } + base.cursor = v_2; + if (!(base.out_grouping(g_v, 97, 117))) + { + break lab0; + } + lab9: { + var /** number */ v_6 = base.cursor; + lab10: { + if (!(base.out_grouping(g_v, 97, 117))) + { + break lab10; + } + golab11: while(true) + { + lab12: { + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab12; + } + break golab11; + } + if (base.cursor >= base.limit) + { + break lab10; + } + base.cursor++; + } + break lab9; + } + base.cursor = v_6; + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab0; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_pV = base.cursor; + } + base.cursor = v_1; + var /** number */ v_8 = base.cursor; + lab13: { + golab14: while(true) + { + lab15: { + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab15; + } + break golab14; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab16: while(true) + { + lab17: { + if (!(base.out_grouping(g_v, 97, 117))) + { + break lab17; + } + break golab16; + } + if (base.cursor >= base.limit) + { + 
break lab13; + } + base.cursor++; + } + I_p1 = base.cursor; + golab18: while(true) + { + lab19: { + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab19; + } + break golab18; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab20: while(true) + { + lab21: { + if (!(base.out_grouping(g_v, 97, 117))) + { + break lab21; + } + break golab20; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_8; + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_aditzak() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_from("atseden")) + { + return false; + } + break; + case 4: + if (!base.slice_from("arabera")) + { + return false; + } + break; + case 5: + if (!base.slice_from("baditu")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_izenak() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_1); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_from("jok")) + { + 
return false; + } + break; + case 4: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 5: + if (!base.slice_from("tra")) + { + return false; + } + break; + case 6: + if (!base.slice_from("minutu")) + { + return false; + } + break; + case 7: + if (!base.slice_from("zehar")) + { + return false; + } + break; + case 8: + if (!base.slice_from("geldi")) + { + return false; + } + break; + case 9: + if (!base.slice_from("igaro")) + { + return false; + } + break; + case 10: + if (!base.slice_from("aurka")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_adjetiboak() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("z")) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + while(true) + { + var /** number */ v_2 = base.limit - base.cursor; + lab0: { + if (!r_aditzak()) + { + break lab0; + } + continue; + } + base.cursor = base.limit - v_2; + break; + } + while(true) + { + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + if (!r_izenak()) + { + break lab1; + } + continue; + } + base.cursor = base.limit - v_3; + break; + } + var /** number */ v_4 = base.limit - base.cursor; + r_adjetiboak(); + base.cursor = base.limit - v_4; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['BasqueStemmer'] = BasqueStemmer; diff --git a/js/catalan-stemmer.js b/js/catalan-stemmer.js new file mode 
100644 index 0000000..34f535d --- /dev/null +++ b/js/catalan-stemmer.js @@ -0,0 +1,927 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var CatalanStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["", -1, 7], + ["\u00B7", 0, 6], + ["\u00E0", 0, 1], + ["\u00E1", 0, 1], + ["\u00E8", 0, 2], + ["\u00E9", 0, 2], + ["\u00EC", 0, 3], + ["\u00ED", 0, 3], + ["\u00EF", 0, 3], + ["\u00F2", 0, 4], + ["\u00F3", 0, 4], + ["\u00FA", 0, 5], + ["\u00FC", 0, 5] + ]; + + /** @const */ var a_1 = [ + ["la", -1, 1], + ["-la", 0, 1], + ["sela", 0, 1], + ["le", -1, 1], + ["me", -1, 1], + ["-me", 4, 1], + ["se", -1, 1], + ["-te", -1, 1], + ["hi", -1, 1], + ["'hi", 8, 1], + ["li", -1, 1], + ["-li", 10, 1], + ["'l", -1, 1], + ["'m", -1, 1], + ["-m", -1, 1], + ["'n", -1, 1], + ["-n", -1, 1], + ["ho", -1, 1], + ["'ho", 17, 1], + ["lo", -1, 1], + ["selo", 19, 1], + ["'s", -1, 1], + ["las", -1, 1], + ["selas", 22, 1], + ["les", -1, 1], + ["-les", 24, 1], + ["'ls", -1, 1], + ["-ls", -1, 1], + ["'ns", -1, 1], + ["-ns", -1, 1], + ["ens", -1, 1], + ["los", -1, 1], + ["selos", 31, 1], + ["nos", -1, 1], + ["-nos", 33, 1], + ["vos", -1, 1], + ["us", -1, 1], + ["-us", 36, 1], + ["'t", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["ica", -1, 4], + ["l\u00F3gica", 0, 3], + ["enca", -1, 1], + ["ada", -1, 2], + ["ancia", -1, 1], + ["encia", -1, 1], + ["\u00E8ncia", -1, 1], + ["\u00EDcia", -1, 1], + ["logia", -1, 3], + ["inia", -1, 1], + ["\u00EDinia", 9, 1], + ["eria", -1, 1], + ["\u00E0ria", -1, 1], + ["at\u00F2ria", -1, 1], + ["alla", -1, 1], + ["ella", -1, 1], + ["\u00EDvola", -1, 1], + ["ima", -1, 1], + ["\u00EDssima", 17, 1], + ["qu\u00EDssima", 18, 5], + ["ana", -1, 1], + ["ina", -1, 1], + ["era", -1, 1], + ["sfera", 22, 1], + ["ora", -1, 1], + ["dora", 24, 1], + ["adora", 25, 1], + ["adura", -1, 1], + ["esa", -1, 1], + ["osa", -1, 1], + ["assa", -1, 1], + ["essa", -1, 1], + ["issa", -1, 1], + ["eta", -1, 1], + ["ita", -1, 1], + 
["ota", -1, 1], + ["ista", -1, 1], + ["ialista", 36, 1], + ["ionista", 36, 1], + ["iva", -1, 1], + ["ativa", 39, 1], + ["n\u00E7a", -1, 1], + ["log\u00EDa", -1, 3], + ["ic", -1, 4], + ["\u00EDstic", 43, 1], + ["enc", -1, 1], + ["esc", -1, 1], + ["ud", -1, 1], + ["atge", -1, 1], + ["ble", -1, 1], + ["able", 49, 1], + ["ible", 49, 1], + ["isme", -1, 1], + ["ialisme", 52, 1], + ["ionisme", 52, 1], + ["ivisme", 52, 1], + ["aire", -1, 1], + ["icte", -1, 1], + ["iste", -1, 1], + ["ici", -1, 1], + ["\u00EDci", -1, 1], + ["logi", -1, 3], + ["ari", -1, 1], + ["tori", -1, 1], + ["al", -1, 1], + ["il", -1, 1], + ["all", -1, 1], + ["ell", -1, 1], + ["\u00EDvol", -1, 1], + ["isam", -1, 1], + ["issem", -1, 1], + ["\u00ECssem", -1, 1], + ["\u00EDssem", -1, 1], + ["\u00EDssim", -1, 1], + ["qu\u00EDssim", 73, 5], + ["amen", -1, 1], + ["\u00ECssin", -1, 1], + ["ar", -1, 1], + ["ificar", 77, 1], + ["egar", 77, 1], + ["ejar", 77, 1], + ["itar", 77, 1], + ["itzar", 77, 1], + ["fer", -1, 1], + ["or", -1, 1], + ["dor", 84, 1], + ["dur", -1, 1], + ["doras", -1, 1], + ["ics", -1, 4], + ["l\u00F3gics", 88, 3], + ["uds", -1, 1], + ["nces", -1, 1], + ["ades", -1, 2], + ["ancies", -1, 1], + ["encies", -1, 1], + ["\u00E8ncies", -1, 1], + ["\u00EDcies", -1, 1], + ["logies", -1, 3], + ["inies", -1, 1], + ["\u00EDnies", -1, 1], + ["eries", -1, 1], + ["\u00E0ries", -1, 1], + ["at\u00F2ries", -1, 1], + ["bles", -1, 1], + ["ables", 103, 1], + ["ibles", 103, 1], + ["imes", -1, 1], + ["\u00EDssimes", 106, 1], + ["qu\u00EDssimes", 107, 5], + ["formes", -1, 1], + ["ismes", -1, 1], + ["ialismes", 110, 1], + ["ines", -1, 1], + ["eres", -1, 1], + ["ores", -1, 1], + ["dores", 114, 1], + ["idores", 115, 1], + ["dures", -1, 1], + ["eses", -1, 1], + ["oses", -1, 1], + ["asses", -1, 1], + ["ictes", -1, 1], + ["ites", -1, 1], + ["otes", -1, 1], + ["istes", -1, 1], + ["ialistes", 124, 1], + ["ionistes", 124, 1], + ["iques", -1, 4], + ["l\u00F3giques", 127, 3], + ["ives", -1, 1], + ["atives", 129, 1], + 
["log\u00EDes", -1, 3], + ["alleng\u00FCes", -1, 1], + ["icis", -1, 1], + ["\u00EDcis", -1, 1], + ["logis", -1, 3], + ["aris", -1, 1], + ["toris", -1, 1], + ["ls", -1, 1], + ["als", 138, 1], + ["ells", 138, 1], + ["ims", -1, 1], + ["\u00EDssims", 141, 1], + ["qu\u00EDssims", 142, 5], + ["ions", -1, 1], + ["cions", 144, 1], + ["acions", 145, 2], + ["esos", -1, 1], + ["osos", -1, 1], + ["assos", -1, 1], + ["issos", -1, 1], + ["ers", -1, 1], + ["ors", -1, 1], + ["dors", 152, 1], + ["adors", 153, 1], + ["idors", 153, 1], + ["ats", -1, 1], + ["itats", 156, 1], + ["bilitats", 157, 1], + ["ivitats", 157, 1], + ["ativitats", 159, 1], + ["\u00EFtats", 156, 1], + ["ets", -1, 1], + ["ants", -1, 1], + ["ents", -1, 1], + ["ments", 164, 1], + ["aments", 165, 1], + ["ots", -1, 1], + ["uts", -1, 1], + ["ius", -1, 1], + ["trius", 169, 1], + ["atius", 169, 1], + ["\u00E8s", -1, 1], + ["\u00E9s", -1, 1], + ["\u00EDs", -1, 1], + ["d\u00EDs", 174, 1], + ["\u00F3s", -1, 1], + ["itat", -1, 1], + ["bilitat", 177, 1], + ["ivitat", 177, 1], + ["ativitat", 179, 1], + ["\u00EFtat", -1, 1], + ["et", -1, 1], + ["ant", -1, 1], + ["ent", -1, 1], + ["ient", 184, 1], + ["ment", 184, 1], + ["ament", 186, 1], + ["isament", 187, 1], + ["ot", -1, 1], + ["isseu", -1, 1], + ["\u00ECsseu", -1, 1], + ["\u00EDsseu", -1, 1], + ["triu", -1, 1], + ["\u00EDssiu", -1, 1], + ["atiu", -1, 1], + ["\u00F3", -1, 1], + ["i\u00F3", 196, 1], + ["ci\u00F3", 197, 1], + ["aci\u00F3", 198, 1] + ]; + + /** @const */ var a_3 = [ + ["aba", -1, 1], + ["esca", -1, 1], + ["isca", -1, 1], + ["\u00EFsca", -1, 1], + ["ada", -1, 1], + ["ida", -1, 1], + ["uda", -1, 1], + ["\u00EFda", -1, 1], + ["ia", -1, 1], + ["aria", 8, 1], + ["iria", 8, 1], + ["ara", -1, 1], + ["iera", -1, 1], + ["ira", -1, 1], + ["adora", -1, 1], + ["\u00EFra", -1, 1], + ["ava", -1, 1], + ["ixa", -1, 1], + ["itza", -1, 1], + ["\u00EDa", -1, 1], + ["ar\u00EDa", 19, 1], + ["er\u00EDa", 19, 1], + ["ir\u00EDa", 19, 1], + ["\u00EFa", -1, 1], + ["isc", -1, 1], + 
["\u00EFsc", -1, 1], + ["ad", -1, 1], + ["ed", -1, 1], + ["id", -1, 1], + ["ie", -1, 1], + ["re", -1, 1], + ["dre", 30, 1], + ["ase", -1, 1], + ["iese", -1, 1], + ["aste", -1, 1], + ["iste", -1, 1], + ["ii", -1, 1], + ["ini", -1, 1], + ["esqui", -1, 1], + ["eixi", -1, 1], + ["itzi", -1, 1], + ["am", -1, 1], + ["em", -1, 1], + ["arem", 42, 1], + ["irem", 42, 1], + ["\u00E0rem", 42, 1], + ["\u00EDrem", 42, 1], + ["\u00E0ssem", 42, 1], + ["\u00E9ssem", 42, 1], + ["iguem", 42, 1], + ["\u00EFguem", 42, 1], + ["avem", 42, 1], + ["\u00E0vem", 42, 1], + ["\u00E1vem", 42, 1], + ["ir\u00ECem", 42, 1], + ["\u00EDem", 42, 1], + ["ar\u00EDem", 55, 1], + ["ir\u00EDem", 55, 1], + ["assim", -1, 1], + ["essim", -1, 1], + ["issim", -1, 1], + ["\u00E0ssim", -1, 1], + ["\u00E8ssim", -1, 1], + ["\u00E9ssim", -1, 1], + ["\u00EDssim", -1, 1], + ["\u00EFm", -1, 1], + ["an", -1, 1], + ["aban", 66, 1], + ["arian", 66, 1], + ["aran", 66, 1], + ["ieran", 66, 1], + ["iran", 66, 1], + ["\u00EDan", 66, 1], + ["ar\u00EDan", 72, 1], + ["er\u00EDan", 72, 1], + ["ir\u00EDan", 72, 1], + ["en", -1, 1], + ["ien", 76, 1], + ["arien", 77, 1], + ["irien", 77, 1], + ["aren", 76, 1], + ["eren", 76, 1], + ["iren", 76, 1], + ["\u00E0ren", 76, 1], + ["\u00EFren", 76, 1], + ["asen", 76, 1], + ["iesen", 76, 1], + ["assen", 76, 1], + ["essen", 76, 1], + ["issen", 76, 1], + ["\u00E9ssen", 76, 1], + ["\u00EFssen", 76, 1], + ["esquen", 76, 1], + ["isquen", 76, 1], + ["\u00EFsquen", 76, 1], + ["aven", 76, 1], + ["ixen", 76, 1], + ["eixen", 96, 1], + ["\u00EFxen", 76, 1], + ["\u00EFen", 76, 1], + ["in", -1, 1], + ["inin", 100, 1], + ["sin", 100, 1], + ["isin", 102, 1], + ["assin", 102, 1], + ["essin", 102, 1], + ["issin", 102, 1], + ["\u00EFssin", 102, 1], + ["esquin", 100, 1], + ["eixin", 100, 1], + ["aron", -1, 1], + ["ieron", -1, 1], + ["ar\u00E1n", -1, 1], + ["er\u00E1n", -1, 1], + ["ir\u00E1n", -1, 1], + ["i\u00EFn", -1, 1], + ["ado", -1, 1], + ["ido", -1, 1], + ["ando", -1, 2], + ["iendo", -1, 1], + ["io", -1, 
1], + ["ixo", -1, 1], + ["eixo", 121, 1], + ["\u00EFxo", -1, 1], + ["itzo", -1, 1], + ["ar", -1, 1], + ["tzar", 125, 1], + ["er", -1, 1], + ["eixer", 127, 1], + ["ir", -1, 1], + ["ador", -1, 1], + ["as", -1, 1], + ["abas", 131, 1], + ["adas", 131, 1], + ["idas", 131, 1], + ["aras", 131, 1], + ["ieras", 131, 1], + ["\u00EDas", 131, 1], + ["ar\u00EDas", 137, 1], + ["er\u00EDas", 137, 1], + ["ir\u00EDas", 137, 1], + ["ids", -1, 1], + ["es", -1, 1], + ["ades", 142, 1], + ["ides", 142, 1], + ["udes", 142, 1], + ["\u00EFdes", 142, 1], + ["atges", 142, 1], + ["ies", 142, 1], + ["aries", 148, 1], + ["iries", 148, 1], + ["ares", 142, 1], + ["ires", 142, 1], + ["adores", 142, 1], + ["\u00EFres", 142, 1], + ["ases", 142, 1], + ["ieses", 142, 1], + ["asses", 142, 1], + ["esses", 142, 1], + ["isses", 142, 1], + ["\u00EFsses", 142, 1], + ["ques", 142, 1], + ["esques", 161, 1], + ["\u00EFsques", 161, 1], + ["aves", 142, 1], + ["ixes", 142, 1], + ["eixes", 165, 1], + ["\u00EFxes", 142, 1], + ["\u00EFes", 142, 1], + ["abais", -1, 1], + ["arais", -1, 1], + ["ierais", -1, 1], + ["\u00EDais", -1, 1], + ["ar\u00EDais", 172, 1], + ["er\u00EDais", 172, 1], + ["ir\u00EDais", 172, 1], + ["aseis", -1, 1], + ["ieseis", -1, 1], + ["asteis", -1, 1], + ["isteis", -1, 1], + ["inis", -1, 1], + ["sis", -1, 1], + ["isis", 181, 1], + ["assis", 181, 1], + ["essis", 181, 1], + ["issis", 181, 1], + ["\u00EFssis", 181, 1], + ["esquis", -1, 1], + ["eixis", -1, 1], + ["itzis", -1, 1], + ["\u00E1is", -1, 1], + ["ar\u00E9is", -1, 1], + ["er\u00E9is", -1, 1], + ["ir\u00E9is", -1, 1], + ["ams", -1, 1], + ["ados", -1, 1], + ["idos", -1, 1], + ["amos", -1, 1], + ["\u00E1bamos", 197, 1], + ["\u00E1ramos", 197, 1], + ["i\u00E9ramos", 197, 1], + ["\u00EDamos", 197, 1], + ["ar\u00EDamos", 201, 1], + ["er\u00EDamos", 201, 1], + ["ir\u00EDamos", 201, 1], + ["aremos", -1, 1], + ["eremos", -1, 1], + ["iremos", -1, 1], + ["\u00E1semos", -1, 1], + ["i\u00E9semos", -1, 1], + ["imos", -1, 1], + ["adors", -1, 1], + ["ass", 
-1, 1], + ["erass", 212, 1], + ["ess", -1, 1], + ["ats", -1, 1], + ["its", -1, 1], + ["ents", -1, 1], + ["\u00E0s", -1, 1], + ["ar\u00E0s", 218, 1], + ["ir\u00E0s", 218, 1], + ["ar\u00E1s", -1, 1], + ["er\u00E1s", -1, 1], + ["ir\u00E1s", -1, 1], + ["\u00E9s", -1, 1], + ["ar\u00E9s", 224, 1], + ["\u00EDs", -1, 1], + ["i\u00EFs", -1, 1], + ["at", -1, 1], + ["it", -1, 1], + ["ant", -1, 1], + ["ent", -1, 1], + ["int", -1, 1], + ["ut", -1, 1], + ["\u00EFt", -1, 1], + ["au", -1, 1], + ["erau", 235, 1], + ["ieu", -1, 1], + ["ineu", -1, 1], + ["areu", -1, 1], + ["ireu", -1, 1], + ["\u00E0reu", -1, 1], + ["\u00EDreu", -1, 1], + ["asseu", -1, 1], + ["esseu", -1, 1], + ["eresseu", 244, 1], + ["\u00E0sseu", -1, 1], + ["\u00E9sseu", -1, 1], + ["igueu", -1, 1], + ["\u00EFgueu", -1, 1], + ["\u00E0veu", -1, 1], + ["\u00E1veu", -1, 1], + ["itzeu", -1, 1], + ["\u00ECeu", -1, 1], + ["ir\u00ECeu", 253, 1], + ["\u00EDeu", -1, 1], + ["ar\u00EDeu", 255, 1], + ["ir\u00EDeu", 255, 1], + ["assiu", -1, 1], + ["issiu", -1, 1], + ["\u00E0ssiu", -1, 1], + ["\u00E8ssiu", -1, 1], + ["\u00E9ssiu", -1, 1], + ["\u00EDssiu", -1, 1], + ["\u00EFu", -1, 1], + ["ix", -1, 1], + ["eix", 265, 1], + ["\u00EFx", -1, 1], + ["itz", -1, 1], + ["i\u00E0", -1, 1], + ["ar\u00E0", -1, 1], + ["ir\u00E0", -1, 1], + ["itz\u00E0", -1, 1], + ["ar\u00E1", -1, 1], + ["er\u00E1", -1, 1], + ["ir\u00E1", -1, 1], + ["ir\u00E8", -1, 1], + ["ar\u00E9", -1, 1], + ["er\u00E9", -1, 1], + ["ir\u00E9", -1, 1], + ["\u00ED", -1, 1], + ["i\u00EF", -1, 1], + ["i\u00F3", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["a", -1, 1], + ["e", -1, 1], + ["i", -1, 1], + ["\u00EFn", -1, 1], + ["o", -1, 1], + ["ir", -1, 1], + ["s", -1, 1], + ["is", 6, 1], + ["os", 6, 1], + ["\u00EFs", 6, 1], + ["it", -1, 1], + ["eu", -1, 1], + ["iu", -1, 1], + ["iqu", -1, 2], + ["itz", -1, 1], + ["\u00E0", -1, 1], + ["\u00E1", -1, 1], + ["\u00E9", -1, 1], + ["\u00EC", -1, 1], + ["\u00ED", -1, 1], + ["\u00EF", -1, 1], + ["\u00F3", -1, 1] + ]; + + /** @const */ var 
/** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 129, 81, 6, 10]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + lab2: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab2; + } + break golab1; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab3: while(true) + { + lab4: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab4; + } + break golab3; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p1 = base.cursor; + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_cleaning() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("a")) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + case 3: + if (!base.slice_from("i")) + { + return false; + } + break; + case 4: + if (!base.slice_from("o")) + { + return false; + } + break; + case 5: + if (!base.slice_from("u")) + { + return false; + } + break; + case 6: + if (!base.slice_from(".")) + { + return false; + } + break; + case 7: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = 
v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_attached_pronoun() { + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("log")) + { + return false; + } + break; + case 4: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("ic")) + { + return false; + } + break; + case 5: + if (!r_R1()) + { + return false; + } + if (!base.slice_from("c")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_verb_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_residual_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + return false; 
+ } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R1()) + { + return false; + } + if (!base.slice_from("ic")) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_attached_pronoun(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + if (!r_standard_suffix()) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_4; + if (!r_verb_suffix()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_3; + var /** number */ v_5 = base.limit - base.cursor; + r_residual_suffix(); + base.cursor = base.limit - v_5; + base.cursor = base.limit_backward; + var /** number */ v_6 = base.cursor; + r_cleaning(); + base.cursor = v_6; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['CatalanStemmer'] = CatalanStemmer; diff --git a/js/danish-stemmer.js b/js/danish-stemmer.js new file mode 100644 index 0000000..4c8941b --- /dev/null +++ b/js/danish-stemmer.js @@ -0,0 +1,312 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var DanishStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["hed", -1, 1], + ["ethed", 0, 1], + ["ered", -1, 1], + ["e", -1, 1], + ["erede", 3, 1], + ["ende", 3, 1], + ["erende", 5, 1], + ["ene", 3, 1], + ["erne", 3, 1], + ["ere", 3, 1], + ["en", -1, 1], + ["heden", 10, 1], + ["eren", 10, 1], + ["er", -1, 1], + ["heder", 13, 1], + ["erer", 13, 1], + ["s", -1, 2], + ["heds", 16, 1], + ["es", 16, 1], + ["endes", 
18, 1], + ["erendes", 19, 1], + ["enes", 18, 1], + ["ernes", 18, 1], + ["eres", 18, 1], + ["ens", 16, 1], + ["hedens", 24, 1], + ["erens", 24, 1], + ["ers", 16, 1], + ["ets", 16, 1], + ["erets", 28, 1], + ["et", -1, 1], + ["eret", 30, 1] + ]; + + /** @const */ var a_1 = [ + ["gd", -1, -1], + ["dt", -1, -1], + ["gt", -1, -1], + ["kt", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["ig", -1, 1], + ["lig", 0, 1], + ["elig", 1, 1], + ["els", -1, 1], + ["l\u00F8st", -1, 2] + ]; + + /** @const */ var /** Array */ g_c = [119, 223, 119, 1]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128]; + + /** @const */ var /** Array */ g_s_ending = [239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16]; + + var /** number */ I_x = 0; + var /** number */ I_p1 = 0; + var /** string */ S_ch = ''; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + var /** number */ v_1 = base.cursor; + { + var /** number */ c1 = base.cursor + 3; + if (c1 > base.limit) + { + return false; + } + base.cursor = c1; + } + I_x = base.cursor; + base.cursor = v_1; + golab0: while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + if (!(base.in_grouping(g_v, 97, 248))) + { + break lab1; + } + base.cursor = v_2; + break golab0; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_v, 97, 248))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + lab4: { + if (I_p1 >= I_x) + { + break lab4; + } + I_p1 = I_x; + } + return true; + }; + + /** @return {boolean} */ + function r_main_suffix() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if 
(among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.in_grouping_b(g_s_ending, 97, 229))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_consonant_pair() { + var /** number */ v_1 = base.limit - base.cursor; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_3 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + base.limit_backward = v_3; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_3; + base.cursor = base.limit - v_1; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_other_suffix() { + var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("st"))) + { + break lab0; + } + base.bra = base.cursor; + if (!(base.eq_s_b("ig"))) + { + break lab0; + } + if (!base.slice_del()) + { + return false; + } + } + base.cursor = base.limit - v_1; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_3 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + base.limit_backward = v_3; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_3; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + r_consonant_pair(); + base.cursor = base.limit - v_4; + break; + case 2: + if (!base.slice_from("l\u00F8s")) + { + return false; + } + break; + } + 
return true; + }; + + /** @return {boolean} */ + function r_undouble() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (!(base.in_grouping_b(g_c, 98, 122))) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + S_ch = base.slice_to(); + if (S_ch == '') + { + return false; + } + base.limit_backward = v_2; + if (!(base.eq_s_b(S_ch))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_mark_regions(); + base.cursor = v_1; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_main_suffix(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_consonant_pair(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_other_suffix(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_undouble(); + base.cursor = base.limit - v_5; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['DanishStemmer'] = DanishStemmer; diff --git a/js/demo.js b/js/demo.js new file mode 100644 index 0000000..214b6c4 --- /dev/null +++ b/js/demo.js @@ -0,0 +1,71 @@ +var old_words; +function stem_demo(elt) { + var words = elt.value; + if (words === old_words) return; + old_words = words; + // Auto-expand height of the textarea + var lines = 1; + words.replace(/\n/g, function(){++lines;}); + elt.rows = lines; + var lang = document.getElementById('lang').value; + var s = stemmer_factory(lang); + if (s === null) return; + var result = ''; + var i = 0; + // FIXME: Generate this from UnicodeData. 
+ for (const m of words.matchAll(/([A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376-\u037D\u037F-\u0383\u0386\u0388-\u03F5\u03F7-\u0481\u048A-\u0559\u0561-\u0588\u05D0-\u05F2\u0620-\u064A\u066E-\u066F\u0671-\u06D3\u06D5\u06E5-\u06E6\u06EE-\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1-\u07BF\u07CA-\u07EA\u07F4-\u07F5\u07FA-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0-\u08E2\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0980\u0985-\u09BB\u09BD\u09CE-\u09D6\u09DC-\u09E1\u09F0-\u09F1\u0A05-\u0A3B\u0A59-\u0A65\u0A72-\u0A74\u0A85-\u0ABB\u0ABD\u0AD0-\u0AE1\u0AF9-\u0B00\u0B05-\u0B3B\u0B3D\u0B5C-\u0B61\u0B71\u0B83-\u0BBD\u0BD0-\u0BD6\u0C05-\u0C3D\u0C58-\u0C61\u0C85-\u0CBB\u0CBD\u0CDE-\u0CE1\u0CF1-\u0D00\u0D05-\u0D3D\u0D4E-\u0D56\u0D5F-\u0D61\u0D7A-\u0D81\u0D85-\u0DC9\u0E01-\u0E30\u0E32-\u0E33\u0E40-\u0E46\u0E81-\u0EB0\u0EB2-\u0EB3\u0EBD-\u0EC7\u0EDC-\u0F00\u0F40-\u0F70\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065-\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10FA\u10FC-\u135C\u1380-\u138F\u13A0-\u13FF\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u1771\u1780-\u17B3\u17D7\u17DC\u1820-\u18A8\u18AA-\u191F\u1950-\u19CF\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4F\u1B83-\u1BA0\u1BAE-\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5-\u1CF7\u1D00-\u1DBF\u1E00-\u1FBC\u1FBE\u1FC2-\u1FCC\u1FD0-\u1FDC\u1FE0-\u1FEC\u1FF2-\u1FFC\u2071-\u2073\u207F\u2090-\u209F\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183-\u2184\u2C00-\u2CE4\u2CEB-\u2CEE\u2CF2-\u2CF8\u2D00-\u2D6F\u2D80-\u2DDF\u2E2F\u3005-\u3006\u3031-\u3035\u303B-\u303C\u3041-\u3098\u309D-\u309F\u30A1-\u30FA\u30FC-\u318F\u31A0-\u31BF\u31F0-\u31FF\u3400-\u4DBF\u4E00-\uA48F\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A-\uA66E\uA67F-\uA69D\uA6A0-\uA6E5\uA717-\u
A71F\uA722-\uA788\uA78B-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD-\uA8FF\uA90A-\uA925\uA930-\uA946\uA960-\uA97F\uA984-\uA9B2\uA9CF\uA9E0-\uA9E4\uA9E6-\uA9EF\uA9FA-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5-\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB5A\uAB5C-\uABE2\uAC00-\uD7FF\uF900-\uFB1D\uFB1F-\uFB28\uFB2A-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDFB\uFE70-\uFEFE\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDF\u{10000}-\u{100FF}\u{10280}-\u{102DF}\u{10300}-\u{1031F}\u{10330}-\u{10340}\u{10342}-\u{10349}\u{10350}-\u{10375}\u{10380}-\u{1039E}\u{103A0}-\u{103CF}\u{10400}-\u{1049F}\u{10500}-\u{1056E}\u{10600}-\u{10856}\u{10860}-\u{10876}\u{10880}-\u{108A6}\u{108E0}-\u{108FA}\u{10900}-\u{10915}\u{10920}-\u{1093E}\u{10980}-\u{109BB}\u{109BE}-\u{109BF}\u{10A00}\u{10A10}-\u{10A37}\u{10A60}-\u{10A7C}\u{10A80}-\u{10A9C}\u{10AC0}-\u{10AC7}\u{10AC9}-\u{10AE4}\u{10B00}-\u{10B38}\u{10B40}-\u{10B57}\u{10B60}-\u{10B77}\u{10B80}-\u{10B98}\u{10C00}-\u{10CF9}\u{11003}-\u{11037}\u{11083}-\u{110AF}\u{110D0}-\u{110EF}\u{11103}-\u{11126}\u{11150}-\u{11172}\u{11176}-\u{1117F}\u{11183}-\u{111B2}\u{111C1}-\u{111C4}\u{111DA}\u{111DC}\u{11200}-\u{1122B}\u{11280}-\u{112A8}\u{112B0}-\u{112DE}\u{11305}-\u{1133B}\u{1133D}\u{11350}-\u{11356}\u{1135D}-\u{11361}\u{11480}-\u{114AF}\u{114C4}-\u{114C5}\u{114C7}-\u{114CF}\u{11580}-\u{115AE}\u{115D8}-\u{115DB}\u{11600}-\u{1162F}\u{11644}-\u{1164F}\u{11680}-\u{116AA}\u{11700}-\u{1171C}\u{118A0}-\u{118DF}\u{118FF}-\u{123FF}\u{12480}-\u{16A5F}\u{16AD0}-\u{16AEF}\u{16B00}-\u{16B2F}\u{16B40}-\u{16B43}\u{16B63}-\u{16F50}\u{16F93}-\u{1BC9B}\u{1D400}-\u{1D6C0}\u{1D6C2}-\u{1D6DA}\u{1D6DC}-\u{1D6FA}\u{1D6FC}-\u{1D714}\u{1D716}-\u{1D734}\u{1D736}-\u{1D74E}\u{1D750}-\u{1D76E}\u{1D770}-\u{1D788}\u{1D78A}-\u{1D7A8}\u{1D7AA}-\u{1D7C2}\u{1D7C4}-\u{1D7CD}\u{1E800}-\u{1E8C6}\u{1EE00}-\u{1EEEF}\u{20000}-\u{E0000}]+)/ug)) { + result += words.substring(i, m.index).replace(/[ 
&<>\n]/g, function(c){ + if (c === ' ') return ' '; + if (c === '<') return '<'; + if (c === '>') return '>'; + if (c === '&') return '&'; + // Zero width space means a trailing newline will get rendered. + return '
​'; + }); + const w = m[0]; + result += '' + s.stemWord(w.toLowerCase()) + ''; + i = m.index + w.length; + } + document.getElementById('results').innerHTML = result; +} +function lang_changed(elt) { + old_words = undefined; + var lang = elt.value; + var words_elt = document.getElementById('words'); + var results_elt = document.getElementById('results'); + if (lang === 'Arabic' || lang === 'Yiddish') { + words_elt.dir = results_elt.dir = 'rtl'; + } else { + if (lang === 'Porter') lang = 'English'; + words_elt.dir = results_elt.dir = 'ltr'; + } + words_elt.placeholder = "Enter some " + lang + " text"; + stem_demo(document.getElementById('words')); + location.hash=elt.options[elt.selectedIndex].value; +} +window.onload = function() { + let d=location.hash; + if (d.length) { + let c=null; + let s=document.getElementById('lang'); + let a=(s.selectedIndex>=0?s.options[s.selectedIndex]:null); + d = d.substring(1); + for (let i=0; i */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128]; + + /** @const */ var /** Array */ g_v_I = [1, 0, 0, 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128]; + + /** @const */ var /** Array */ g_v_j = [17, 67, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** boolean */ B_e_found = false; + + + /** @return {boolean} */ + function r_prelude() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + while(true) + { + var /** number */ v_2 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("a")) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + case 3: + if (!base.slice_from("i")) + { + return false; + } + break; + case 4: + if (!base.slice_from("o")) + { + return false; + } + break; + case 5: + if (!base.slice_from("u")) + { + return false; + } + break; + case 6: + if 
(base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_2; + break; + } + base.cursor = v_1; + var /** number */ v_3 = base.cursor; + lab1: { + base.bra = base.cursor; + if (!(base.eq_s("y"))) + { + base.cursor = v_3; + break lab1; + } + base.ket = base.cursor; + if (!base.slice_from("Y")) + { + return false; + } + } + while(true) + { + var /** number */ v_4 = base.cursor; + lab2: { + golab3: while(true) + { + var /** number */ v_5 = base.cursor; + lab4: { + if (!(base.in_grouping(g_v, 97, 232))) + { + break lab4; + } + base.bra = base.cursor; + lab5: { + var /** number */ v_6 = base.cursor; + lab6: { + if (!(base.eq_s("i"))) + { + break lab6; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 232))) + { + break lab6; + } + if (!base.slice_from("I")) + { + return false; + } + break lab5; + } + base.cursor = v_6; + if (!(base.eq_s("y"))) + { + break lab4; + } + base.ket = base.cursor; + if (!base.slice_from("Y")) + { + return false; + } + } + base.cursor = v_5; + break golab3; + } + base.cursor = v_5; + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + continue; + } + base.cursor = v_4; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + I_p2 = base.limit; + golab0: while(true) + { + lab1: { + if (!(base.in_grouping(g_v, 97, 232))) + { + break lab1; + } + break golab0; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_v, 97, 232))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + lab4: { + if (I_p1 >= 3) + { + break lab4; + } + I_p1 = 3; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 232))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + return false; + } + 
base.cursor++; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 232))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p2 = base.cursor; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_1); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("y")) + { + return false; + } + break; + case 2: + if (!base.slice_from("i")) + { + return false; + } + break; + case 3: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_undouble() { + var /** number */ v_1 = base.limit - base.cursor; + if (base.find_among_b(a_2) == 0) + { + return false; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_e_ending() { + B_e_found = false; + base.ket = base.cursor; + if (!(base.eq_s_b("e"))) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + if (!(base.out_grouping_b(g_v, 97, 232))) + { + return false; + } + base.cursor = base.limit - v_1; + if (!base.slice_del()) + { + return false; + } + B_e_found = true; + if (!r_undouble()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_en_ending() { + if (!r_R1()) + { + return 
false; + } + var /** number */ v_1 = base.limit - base.cursor; + if (!(base.out_grouping_b(g_v, 97, 232))) + { + return false; + } + base.cursor = base.limit - v_1; + { + var /** number */ v_2 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("gem"))) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_2; + } + if (!base.slice_del()) + { + return false; + } + if (!r_undouble()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + break lab0; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + break lab0; + } + if (!base.slice_from("heid")) + { + return false; + } + break; + case 2: + if (!r_en_ending()) + { + break lab0; + } + break; + case 3: + if (!r_R1()) + { + break lab0; + } + if (!(base.out_grouping_b(g_v_j, 97, 232))) + { + break lab0; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + base.cursor = base.limit - v_1; + var /** number */ v_2 = base.limit - base.cursor; + r_e_ending(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (!(base.eq_s_b("heid"))) + { + break lab1; + } + base.bra = base.cursor; + if (!r_R2()) + { + break lab1; + } + { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + if (!(base.eq_s_b("c"))) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_4; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("en"))) + { + break lab1; + } + base.bra = base.cursor; + if (!r_en_ending()) + { + break lab1; + } + } + base.cursor = base.limit - v_3; + var /** number */ v_5 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if 
(among_var == 0) + { + break lab3; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + break lab3; + } + if (!base.slice_del()) + { + return false; + } + lab4: { + var /** number */ v_6 = base.limit - base.cursor; + lab5: { + base.ket = base.cursor; + if (!(base.eq_s_b("ig"))) + { + break lab5; + } + base.bra = base.cursor; + if (!r_R2()) + { + break lab5; + } + { + var /** number */ v_7 = base.limit - base.cursor; + lab6: { + if (!(base.eq_s_b("e"))) + { + break lab6; + } + break lab5; + } + base.cursor = base.limit - v_7; + } + if (!base.slice_del()) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_6; + if (!r_undouble()) + { + break lab3; + } + } + break; + case 2: + if (!r_R2()) + { + break lab3; + } + { + var /** number */ v_8 = base.limit - base.cursor; + lab7: { + if (!(base.eq_s_b("e"))) + { + break lab7; + } + break lab3; + } + base.cursor = base.limit - v_8; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!r_R2()) + { + break lab3; + } + if (!base.slice_del()) + { + return false; + } + if (!r_e_ending()) + { + break lab3; + } + break; + case 4: + if (!r_R2()) + { + break lab3; + } + if (!base.slice_del()) + { + return false; + } + break; + case 5: + if (!r_R2()) + { + break lab3; + } + if (!B_e_found) + { + break lab3; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + base.cursor = base.limit - v_5; + var /** number */ v_9 = base.limit - base.cursor; + lab8: { + if (!(base.out_grouping_b(g_v_I, 73, 232))) + { + break lab8; + } + var /** number */ v_10 = base.limit - base.cursor; + if (base.find_among_b(a_5) == 0) + { + break lab8; + } + if (!(base.out_grouping_b(g_v, 97, 232))) + { + break lab8; + } + base.cursor = base.limit - v_10; + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + break lab8; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + base.cursor = base.limit - v_9; 
+ return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_prelude(); + base.cursor = v_1; + var /** number */ v_2 = base.cursor; + r_mark_regions(); + base.cursor = v_2; + base.limit_backward = base.cursor; base.cursor = base.limit; + r_standard_suffix(); + base.cursor = base.limit_backward; + var /** number */ v_4 = base.cursor; + r_postlude(); + base.cursor = v_4; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['DutchStemmer'] = DutchStemmer; diff --git a/js/english-stemmer.js b/js/english-stemmer.js new file mode 100644 index 0000000..0f9cd7d --- /dev/null +++ b/js/english-stemmer.js @@ -0,0 +1,1086 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var EnglishStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["arsen", -1, -1], + ["commun", -1, -1], + ["gener", -1, -1] + ]; + + /** @const */ var a_1 = [ + ["'", -1, 1], + ["'s'", 0, 1], + ["'s", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["ied", -1, 2], + ["s", -1, 3], + ["ies", 1, 2], + ["sses", 1, 1], + ["ss", 1, -1], + ["us", 1, -1] + ]; + + /** @const */ var a_3 = [ + ["", -1, 3], + ["bb", 0, 2], + ["dd", 0, 2], + ["ff", 0, 2], + ["gg", 0, 2], + ["bl", 0, 1], + ["mm", 0, 2], + ["nn", 0, 2], + ["pp", 0, 2], + ["rr", 0, 2], + ["at", 0, 1], + ["tt", 0, 2], + ["iz", 0, 1] + ]; + + /** @const */ var a_4 = [ + ["ed", -1, 2], + ["eed", 0, 1], + ["ing", -1, 2], + ["edly", -1, 2], + ["eedly", 3, 1], + ["ingly", -1, 2] + ]; + + /** @const */ var a_5 = [ + ["anci", -1, 3], + ["enci", -1, 2], + ["ogi", -1, 13], + ["li", -1, 15], + ["bli", 3, 12], + ["abli", 4, 4], + ["alli", 3, 8], + ["fulli", 3, 9], + ["lessli", 3, 14], + ["ousli", 3, 10], + ["entli", 3, 5], + ["aliti", -1, 8], + ["biliti", -1, 12], + ["iviti", -1, 11], + ["tional", -1, 1], + ["ational", 14, 7], 
+ ["alism", -1, 8], + ["ation", -1, 7], + ["ization", 17, 6], + ["izer", -1, 6], + ["ator", -1, 7], + ["iveness", -1, 11], + ["fulness", -1, 9], + ["ousness", -1, 10] + ]; + + /** @const */ var a_6 = [ + ["icate", -1, 4], + ["ative", -1, 6], + ["alize", -1, 3], + ["iciti", -1, 4], + ["ical", -1, 4], + ["tional", -1, 1], + ["ational", 5, 2], + ["ful", -1, 5], + ["ness", -1, 5] + ]; + + /** @const */ var a_7 = [ + ["ic", -1, 1], + ["ance", -1, 1], + ["ence", -1, 1], + ["able", -1, 1], + ["ible", -1, 1], + ["ate", -1, 1], + ["ive", -1, 1], + ["ize", -1, 1], + ["iti", -1, 1], + ["al", -1, 1], + ["ism", -1, 1], + ["ion", -1, 2], + ["er", -1, 1], + ["ous", -1, 1], + ["ant", -1, 1], + ["ent", -1, 1], + ["ment", 15, 1], + ["ement", 16, 1] + ]; + + /** @const */ var a_8 = [ + ["e", -1, 1], + ["l", -1, 2] + ]; + + /** @const */ var a_9 = [ + ["succeed", -1, -1], + ["proceed", -1, -1], + ["exceed", -1, -1], + ["canning", -1, -1], + ["inning", -1, -1], + ["earring", -1, -1], + ["herring", -1, -1], + ["outing", -1, -1] + ]; + + /** @const */ var a_10 = [ + ["andes", -1, -1], + ["atlas", -1, -1], + ["bias", -1, -1], + ["cosmos", -1, -1], + ["dying", -1, 3], + ["early", -1, 9], + ["gently", -1, 7], + ["howe", -1, -1], + ["idly", -1, 6], + ["lying", -1, 4], + ["news", -1, -1], + ["only", -1, 10], + ["singly", -1, 11], + ["skies", -1, 2], + ["skis", -1, 1], + ["sky", -1, -1], + ["tying", -1, 5], + ["ugly", -1, 8] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1]; + + /** @const */ var /** Array */ g_v_WXY = [1, 17, 65, 208, 1]; + + /** @const */ var /** Array */ g_valid_LI = [55, 141, 2]; + + var /** boolean */ B_Y_found = false; + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_prelude() { + B_Y_found = false; + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + if (!(base.eq_s("'"))) + { + break lab0; + } + base.ket = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + 
base.cursor = v_1; + var /** number */ v_2 = base.cursor; + lab1: { + base.bra = base.cursor; + if (!(base.eq_s("y"))) + { + break lab1; + } + base.ket = base.cursor; + if (!base.slice_from("Y")) + { + return false; + } + B_Y_found = true; + } + base.cursor = v_2; + var /** number */ v_3 = base.cursor; + lab2: { + while(true) + { + var /** number */ v_4 = base.cursor; + lab3: { + golab4: while(true) + { + var /** number */ v_5 = base.cursor; + lab5: { + if (!(base.in_grouping(g_v, 97, 121))) + { + break lab5; + } + base.bra = base.cursor; + if (!(base.eq_s("y"))) + { + break lab5; + } + base.ket = base.cursor; + base.cursor = v_5; + break golab4; + } + base.cursor = v_5; + if (base.cursor >= base.limit) + { + break lab3; + } + base.cursor++; + } + if (!base.slice_from("Y")) + { + return false; + } + B_Y_found = true; + continue; + } + base.cursor = v_4; + break; + } + } + base.cursor = v_3; + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (base.find_among(a_0) == 0) + { + break lab2; + } + break lab1; + } + base.cursor = v_2; + golab3: while(true) + { + lab4: { + if (!(base.in_grouping(g_v, 97, 121))) + { + break lab4; + } + break golab3; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab5: while(true) + { + lab6: { + if (!(base.out_grouping(g_v, 97, 121))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_p1 = base.cursor; + golab7: while(true) + { + lab8: { + if (!(base.in_grouping(g_v, 97, 121))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab9: while(true) + { + lab10: { + if (!(base.out_grouping(g_v, 97, 121))) + { + break lab10; + } + break golab9; + } + if (base.cursor >= base.limit) + { + 
break lab0; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_shortv() { + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.out_grouping_b(g_v_WXY, 89, 121))) + { + break lab1; + } + if (!(base.in_grouping_b(g_v, 97, 121))) + { + break lab1; + } + if (!(base.out_grouping_b(g_v, 97, 121))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.out_grouping_b(g_v, 97, 121))) + { + return false; + } + if (!(base.in_grouping_b(g_v, 97, 121))) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_Step_1a() { + var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("ss")) + { + return false; + } + break; + case 2: + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + { + var /** number */ c1 = base.cursor - 2; + if (c1 < base.limit_backward) + { + break lab2; + } + base.cursor = c1; + } + if (!base.slice_from("i")) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + if (!base.slice_from("ie")) + { + return false; + } + } + break; + case 3: + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + golab3: while(true) + { + lab4: { + if (!(base.in_grouping_b(g_v, 97, 121))) 
+ { + break lab4; + } + break golab3; + } + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_1b() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + return false; + } + if (!base.slice_from("ee")) + { + return false; + } + break; + case 2: + var /** number */ v_1 = base.limit - base.cursor; + golab0: while(true) + { + lab1: { + if (!(base.in_grouping_b(g_v, 97, 121))) + { + break lab1; + } + break golab0; + } + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + base.cursor = base.limit - v_1; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + among_var = base.find_among_b(a_3); + base.cursor = base.limit - v_3; + switch (among_var) { + case 1: + { + var /** number */ c1 = base.cursor; + base.insert(base.cursor, base.cursor, "e"); + base.cursor = c1; + } + break; + case 2: + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (base.cursor != I_p1) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + if (!r_shortv()) + { + return false; + } + base.cursor = base.limit - v_4; + { + var /** number */ c2 = base.cursor; + base.insert(base.cursor, base.cursor, "e"); + base.cursor = c2; + } + break; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_1c() { + base.ket = base.cursor; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("y"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - 
v_1; + if (!(base.eq_s_b("Y"))) + { + return false; + } + } + base.bra = base.cursor; + if (!(base.out_grouping_b(g_v, 97, 121))) + { + return false; + } + lab2: { + if (base.cursor > base.limit_backward) + { + break lab2; + } + return false; + } + if (!base.slice_from("i")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_2() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("tion")) + { + return false; + } + break; + case 2: + if (!base.slice_from("ence")) + { + return false; + } + break; + case 3: + if (!base.slice_from("ance")) + { + return false; + } + break; + case 4: + if (!base.slice_from("able")) + { + return false; + } + break; + case 5: + if (!base.slice_from("ent")) + { + return false; + } + break; + case 6: + if (!base.slice_from("ize")) + { + return false; + } + break; + case 7: + if (!base.slice_from("ate")) + { + return false; + } + break; + case 8: + if (!base.slice_from("al")) + { + return false; + } + break; + case 9: + if (!base.slice_from("ful")) + { + return false; + } + break; + case 10: + if (!base.slice_from("ous")) + { + return false; + } + break; + case 11: + if (!base.slice_from("ive")) + { + return false; + } + break; + case 12: + if (!base.slice_from("ble")) + { + return false; + } + break; + case 13: + if (!(base.eq_s_b("l"))) + { + return false; + } + if (!base.slice_from("og")) + { + return false; + } + break; + case 14: + if (!base.slice_from("less")) + { + return false; + } + break; + case 15: + if (!(base.in_grouping_b(g_valid_LI, 99, 116))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_3() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = 
base.find_among_b(a_6); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("tion")) + { + return false; + } + break; + case 2: + if (!base.slice_from("ate")) + { + return false; + } + break; + case 3: + if (!base.slice_from("al")) + { + return false; + } + break; + case 4: + if (!base.slice_from("ic")) + { + return false; + } + break; + case 5: + if (!base.slice_del()) + { + return false; + } + break; + case 6: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_4() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_7); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R2()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("s"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("t"))) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_5() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_8); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!r_R2()) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!r_R1()) + { + return false; + } + { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + if (!r_shortv()) + { + break lab2; + } + return false; + } + base.cursor = base.limit - v_2; + } + } + if (!base.slice_del()) + { + return false; + } + 
break; + case 2: + if (!r_R2()) + { + return false; + } + if (!(base.eq_s_b("l"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_exception2() { + base.ket = base.cursor; + if (base.find_among_b(a_9) == 0) + { + return false; + } + base.bra = base.cursor; + if (base.cursor > base.limit_backward) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_exception1() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_10); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + if (base.cursor < base.limit) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("ski")) + { + return false; + } + break; + case 2: + if (!base.slice_from("sky")) + { + return false; + } + break; + case 3: + if (!base.slice_from("die")) + { + return false; + } + break; + case 4: + if (!base.slice_from("lie")) + { + return false; + } + break; + case 5: + if (!base.slice_from("tie")) + { + return false; + } + break; + case 6: + if (!base.slice_from("idl")) + { + return false; + } + break; + case 7: + if (!base.slice_from("gentl")) + { + return false; + } + break; + case 8: + if (!base.slice_from("ugli")) + { + return false; + } + break; + case 9: + if (!base.slice_from("earli")) + { + return false; + } + break; + case 10: + if (!base.slice_from("onli")) + { + return false; + } + break; + case 11: + if (!base.slice_from("singl")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + if (!B_Y_found) + { + return false; + } + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + var /** number */ v_2 = base.cursor; + lab2: { + base.bra = base.cursor; + if (!(base.eq_s("Y"))) + { + break lab2; + } + base.ket = base.cursor; + base.cursor = v_2; + break golab1; + } + base.cursor = v_2; + if 
(base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + if (!base.slice_from("y")) + { + return false; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + lab0: { + var /** number */ v_1 = base.cursor; + lab1: { + if (!r_exception1()) + { + break lab1; + } + break lab0; + } + base.cursor = v_1; + lab2: { + { + var /** number */ v_2 = base.cursor; + lab3: { + { + var /** number */ c1 = base.cursor + 3; + if (c1 > base.limit) + { + break lab3; + } + base.cursor = c1; + } + break lab2; + } + base.cursor = v_2; + } + break lab0; + } + base.cursor = v_1; + r_prelude(); + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_5 = base.limit - base.cursor; + r_Step_1a(); + base.cursor = base.limit - v_5; + lab4: { + var /** number */ v_6 = base.limit - base.cursor; + lab5: { + if (!r_exception2()) + { + break lab5; + } + break lab4; + } + base.cursor = base.limit - v_6; + var /** number */ v_7 = base.limit - base.cursor; + r_Step_1b(); + base.cursor = base.limit - v_7; + var /** number */ v_8 = base.limit - base.cursor; + r_Step_1c(); + base.cursor = base.limit - v_8; + var /** number */ v_9 = base.limit - base.cursor; + r_Step_2(); + base.cursor = base.limit - v_9; + var /** number */ v_10 = base.limit - base.cursor; + r_Step_3(); + base.cursor = base.limit - v_10; + var /** number */ v_11 = base.limit - base.cursor; + r_Step_4(); + base.cursor = base.limit - v_11; + var /** number */ v_12 = base.limit - base.cursor; + r_Step_5(); + base.cursor = base.limit - v_12; + } + base.cursor = base.limit_backward; + var /** number */ v_13 = base.cursor; + r_postlude(); + base.cursor = v_13; + } + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['EnglishStemmer'] = EnglishStemmer; diff --git 
a/js/estonian-stemmer.js b/js/estonian-stemmer.js new file mode 100644 index 0000000..4b9ef4b --- /dev/null +++ b/js/estonian-stemmer.js @@ -0,0 +1,1112 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var EstonianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["gi", -1, 1], + ["ki", -1, 2] + ]; + + /** @const */ var a_1 = [ + ["da", -1, 3], + ["mata", -1, 1], + ["b", -1, 3], + ["ksid", -1, 1], + ["nuksid", 3, 1], + ["me", -1, 3], + ["sime", 5, 1], + ["ksime", 6, 1], + ["nuksime", 7, 1], + ["akse", -1, 2], + ["dakse", 9, 1], + ["takse", 9, 1], + ["site", -1, 1], + ["ksite", 12, 1], + ["nuksite", 13, 1], + ["n", -1, 3], + ["sin", 15, 1], + ["ksin", 16, 1], + ["nuksin", 17, 1], + ["daks", -1, 1], + ["taks", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["aa", -1, -1], + ["ee", -1, -1], + ["ii", -1, -1], + ["oo", -1, -1], + ["uu", -1, -1], + ["\u00E4\u00E4", -1, -1], + ["\u00F5\u00F5", -1, -1], + ["\u00F6\u00F6", -1, -1], + ["\u00FC\u00FC", -1, -1] + ]; + + /** @const */ var a_3 = [ + ["i", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["lane", -1, 1], + ["line", -1, 3], + ["mine", -1, 2], + ["lasse", -1, 1], + ["lisse", -1, 3], + ["misse", -1, 2], + ["lasi", -1, 1], + ["lisi", -1, 3], + ["misi", -1, 2], + ["last", -1, 1], + ["list", -1, 3], + ["mist", -1, 2] + ]; + + /** @const */ var a_5 = [ + ["ga", -1, 1], + ["ta", -1, 1], + ["le", -1, 1], + ["sse", -1, 1], + ["l", -1, 1], + ["s", -1, 1], + ["ks", 5, 1], + ["t", -1, 2], + ["lt", 7, 1], + ["st", 7, 1] + ]; + + /** @const */ var a_6 = [ + ["", -1, 2], + ["las", 0, 1], + ["lis", 0, 1], + ["mis", 0, 1], + ["t", 0, -1] + ]; + + /** @const */ var a_7 = [ + ["d", -1, 4], + ["sid", 0, 2], + ["de", -1, 4], + ["ikkude", 2, 1], + ["ike", -1, 1], + ["ikke", -1, 1], + ["te", -1, 3] + ]; + + /** @const */ var a_8 = [ + ["va", -1, -1], + ["du", -1, -1], + ["nu", -1, -1], + ["tu", -1, -1] + ]; + + /** @const */ var a_9 = [ + ["kk", -1, 1], + ["pp", -1, 2], + 
["tt", -1, 3] + ]; + + /** @const */ var a_10 = [ + ["ma", -1, 2], + ["mai", -1, 1], + ["m", -1, 1] + ]; + + /** @const */ var a_11 = [ + ["joob", -1, 1], + ["jood", -1, 1], + ["joodakse", 1, 1], + ["jooma", -1, 1], + ["joomata", 3, 1], + ["joome", -1, 1], + ["joon", -1, 1], + ["joote", -1, 1], + ["joovad", -1, 1], + ["juua", -1, 1], + ["juuakse", 9, 1], + ["j\u00E4i", -1, 12], + ["j\u00E4id", 11, 12], + ["j\u00E4ime", 11, 12], + ["j\u00E4in", 11, 12], + ["j\u00E4ite", 11, 12], + ["j\u00E4\u00E4b", -1, 12], + ["j\u00E4\u00E4d", -1, 12], + ["j\u00E4\u00E4da", 17, 12], + ["j\u00E4\u00E4dakse", 18, 12], + ["j\u00E4\u00E4di", 17, 12], + ["j\u00E4\u00E4ks", -1, 12], + ["j\u00E4\u00E4ksid", 21, 12], + ["j\u00E4\u00E4ksime", 21, 12], + ["j\u00E4\u00E4ksin", 21, 12], + ["j\u00E4\u00E4ksite", 21, 12], + ["j\u00E4\u00E4ma", -1, 12], + ["j\u00E4\u00E4mata", 26, 12], + ["j\u00E4\u00E4me", -1, 12], + ["j\u00E4\u00E4n", -1, 12], + ["j\u00E4\u00E4te", -1, 12], + ["j\u00E4\u00E4vad", -1, 12], + ["j\u00F5i", -1, 1], + ["j\u00F5id", 32, 1], + ["j\u00F5ime", 32, 1], + ["j\u00F5in", 32, 1], + ["j\u00F5ite", 32, 1], + ["keeb", -1, 4], + ["keed", -1, 4], + ["keedakse", 38, 4], + ["keeks", -1, 4], + ["keeksid", 40, 4], + ["keeksime", 40, 4], + ["keeksin", 40, 4], + ["keeksite", 40, 4], + ["keema", -1, 4], + ["keemata", 45, 4], + ["keeme", -1, 4], + ["keen", -1, 4], + ["kees", -1, 4], + ["keeta", -1, 4], + ["keete", -1, 4], + ["keevad", -1, 4], + ["k\u00E4ia", -1, 8], + ["k\u00E4iakse", 53, 8], + ["k\u00E4ib", -1, 8], + ["k\u00E4id", -1, 8], + ["k\u00E4idi", 56, 8], + ["k\u00E4iks", -1, 8], + ["k\u00E4iksid", 58, 8], + ["k\u00E4iksime", 58, 8], + ["k\u00E4iksin", 58, 8], + ["k\u00E4iksite", 58, 8], + ["k\u00E4ima", -1, 8], + ["k\u00E4imata", 63, 8], + ["k\u00E4ime", -1, 8], + ["k\u00E4in", -1, 8], + ["k\u00E4is", -1, 8], + ["k\u00E4ite", -1, 8], + ["k\u00E4ivad", -1, 8], + ["laob", -1, 16], + ["laod", -1, 16], + ["laoks", -1, 16], + ["laoksid", 72, 16], + ["laoksime", 72, 16], + 
["laoksin", 72, 16], + ["laoksite", 72, 16], + ["laome", -1, 16], + ["laon", -1, 16], + ["laote", -1, 16], + ["laovad", -1, 16], + ["loeb", -1, 14], + ["loed", -1, 14], + ["loeks", -1, 14], + ["loeksid", 83, 14], + ["loeksime", 83, 14], + ["loeksin", 83, 14], + ["loeksite", 83, 14], + ["loeme", -1, 14], + ["loen", -1, 14], + ["loete", -1, 14], + ["loevad", -1, 14], + ["loob", -1, 7], + ["lood", -1, 7], + ["loodi", 93, 7], + ["looks", -1, 7], + ["looksid", 95, 7], + ["looksime", 95, 7], + ["looksin", 95, 7], + ["looksite", 95, 7], + ["looma", -1, 7], + ["loomata", 100, 7], + ["loome", -1, 7], + ["loon", -1, 7], + ["loote", -1, 7], + ["loovad", -1, 7], + ["luua", -1, 7], + ["luuakse", 106, 7], + ["l\u00F5i", -1, 6], + ["l\u00F5id", 108, 6], + ["l\u00F5ime", 108, 6], + ["l\u00F5in", 108, 6], + ["l\u00F5ite", 108, 6], + ["l\u00F6\u00F6b", -1, 5], + ["l\u00F6\u00F6d", -1, 5], + ["l\u00F6\u00F6dakse", 114, 5], + ["l\u00F6\u00F6di", 114, 5], + ["l\u00F6\u00F6ks", -1, 5], + ["l\u00F6\u00F6ksid", 117, 5], + ["l\u00F6\u00F6ksime", 117, 5], + ["l\u00F6\u00F6ksin", 117, 5], + ["l\u00F6\u00F6ksite", 117, 5], + ["l\u00F6\u00F6ma", -1, 5], + ["l\u00F6\u00F6mata", 122, 5], + ["l\u00F6\u00F6me", -1, 5], + ["l\u00F6\u00F6n", -1, 5], + ["l\u00F6\u00F6te", -1, 5], + ["l\u00F6\u00F6vad", -1, 5], + ["l\u00FC\u00FCa", -1, 5], + ["l\u00FC\u00FCakse", 128, 5], + ["m\u00FC\u00FCa", -1, 13], + ["m\u00FC\u00FCakse", 130, 13], + ["m\u00FC\u00FCb", -1, 13], + ["m\u00FC\u00FCd", -1, 13], + ["m\u00FC\u00FCdi", 133, 13], + ["m\u00FC\u00FCks", -1, 13], + ["m\u00FC\u00FCksid", 135, 13], + ["m\u00FC\u00FCksime", 135, 13], + ["m\u00FC\u00FCksin", 135, 13], + ["m\u00FC\u00FCksite", 135, 13], + ["m\u00FC\u00FCma", -1, 13], + ["m\u00FC\u00FCmata", 140, 13], + ["m\u00FC\u00FCme", -1, 13], + ["m\u00FC\u00FCn", -1, 13], + ["m\u00FC\u00FCs", -1, 13], + ["m\u00FC\u00FCte", -1, 13], + ["m\u00FC\u00FCvad", -1, 13], + ["n\u00E4eb", -1, 18], + ["n\u00E4ed", -1, 18], + ["n\u00E4eks", -1, 18], + ["n\u00E4eksid", 
149, 18], + ["n\u00E4eksime", 149, 18], + ["n\u00E4eksin", 149, 18], + ["n\u00E4eksite", 149, 18], + ["n\u00E4eme", -1, 18], + ["n\u00E4en", -1, 18], + ["n\u00E4ete", -1, 18], + ["n\u00E4evad", -1, 18], + ["n\u00E4gema", -1, 18], + ["n\u00E4gemata", 158, 18], + ["n\u00E4ha", -1, 18], + ["n\u00E4hakse", 160, 18], + ["n\u00E4hti", -1, 18], + ["p\u00F5eb", -1, 15], + ["p\u00F5ed", -1, 15], + ["p\u00F5eks", -1, 15], + ["p\u00F5eksid", 165, 15], + ["p\u00F5eksime", 165, 15], + ["p\u00F5eksin", 165, 15], + ["p\u00F5eksite", 165, 15], + ["p\u00F5eme", -1, 15], + ["p\u00F5en", -1, 15], + ["p\u00F5ete", -1, 15], + ["p\u00F5evad", -1, 15], + ["saab", -1, 2], + ["saad", -1, 2], + ["saada", 175, 2], + ["saadakse", 176, 2], + ["saadi", 175, 2], + ["saaks", -1, 2], + ["saaksid", 179, 2], + ["saaksime", 179, 2], + ["saaksin", 179, 2], + ["saaksite", 179, 2], + ["saama", -1, 2], + ["saamata", 184, 2], + ["saame", -1, 2], + ["saan", -1, 2], + ["saate", -1, 2], + ["saavad", -1, 2], + ["sai", -1, 2], + ["said", 190, 2], + ["saime", 190, 2], + ["sain", 190, 2], + ["saite", 190, 2], + ["s\u00F5i", -1, 9], + ["s\u00F5id", 195, 9], + ["s\u00F5ime", 195, 9], + ["s\u00F5in", 195, 9], + ["s\u00F5ite", 195, 9], + ["s\u00F6\u00F6b", -1, 9], + ["s\u00F6\u00F6d", -1, 9], + ["s\u00F6\u00F6dakse", 201, 9], + ["s\u00F6\u00F6di", 201, 9], + ["s\u00F6\u00F6ks", -1, 9], + ["s\u00F6\u00F6ksid", 204, 9], + ["s\u00F6\u00F6ksime", 204, 9], + ["s\u00F6\u00F6ksin", 204, 9], + ["s\u00F6\u00F6ksite", 204, 9], + ["s\u00F6\u00F6ma", -1, 9], + ["s\u00F6\u00F6mata", 209, 9], + ["s\u00F6\u00F6me", -1, 9], + ["s\u00F6\u00F6n", -1, 9], + ["s\u00F6\u00F6te", -1, 9], + ["s\u00F6\u00F6vad", -1, 9], + ["s\u00FC\u00FCa", -1, 9], + ["s\u00FC\u00FCakse", 215, 9], + ["teeb", -1, 17], + ["teed", -1, 17], + ["teeks", -1, 17], + ["teeksid", 219, 17], + ["teeksime", 219, 17], + ["teeksin", 219, 17], + ["teeksite", 219, 17], + ["teeme", -1, 17], + ["teen", -1, 17], + ["teete", -1, 17], + ["teevad", -1, 17], + ["tegema", -1, 
17], + ["tegemata", 228, 17], + ["teha", -1, 17], + ["tehakse", 230, 17], + ["tehti", -1, 17], + ["toob", -1, 10], + ["tood", -1, 10], + ["toodi", 234, 10], + ["tooks", -1, 10], + ["tooksid", 236, 10], + ["tooksime", 236, 10], + ["tooksin", 236, 10], + ["tooksite", 236, 10], + ["tooma", -1, 10], + ["toomata", 241, 10], + ["toome", -1, 10], + ["toon", -1, 10], + ["toote", -1, 10], + ["toovad", -1, 10], + ["tuua", -1, 10], + ["tuuakse", 247, 10], + ["t\u00F5i", -1, 10], + ["t\u00F5id", 249, 10], + ["t\u00F5ime", 249, 10], + ["t\u00F5in", 249, 10], + ["t\u00F5ite", 249, 10], + ["viia", -1, 3], + ["viiakse", 254, 3], + ["viib", -1, 3], + ["viid", -1, 3], + ["viidi", 257, 3], + ["viiks", -1, 3], + ["viiksid", 259, 3], + ["viiksime", 259, 3], + ["viiksin", 259, 3], + ["viiksite", 259, 3], + ["viima", -1, 3], + ["viimata", 264, 3], + ["viime", -1, 3], + ["viin", -1, 3], + ["viisime", -1, 3], + ["viisin", -1, 3], + ["viisite", -1, 3], + ["viite", -1, 3], + ["viivad", -1, 3], + ["v\u00F5ib", -1, 11], + ["v\u00F5id", -1, 11], + ["v\u00F5ida", 274, 11], + ["v\u00F5idakse", 275, 11], + ["v\u00F5idi", 274, 11], + ["v\u00F5iks", -1, 11], + ["v\u00F5iksid", 278, 11], + ["v\u00F5iksime", 278, 11], + ["v\u00F5iksin", 278, 11], + ["v\u00F5iksite", 278, 11], + ["v\u00F5ima", -1, 11], + ["v\u00F5imata", 283, 11], + ["v\u00F5ime", -1, 11], + ["v\u00F5in", -1, 11], + ["v\u00F5is", -1, 11], + ["v\u00F5ite", -1, 11], + ["v\u00F5ivad", -1, 11] + ]; + + /** @const */ var /** Array */ g_V1 = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 48, 8]; + + /** @const */ var /** Array */ g_RV = [17, 65, 16]; + + /** @const */ var /** Array */ g_KI = [117, 66, 6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 16]; + + /** @const */ var /** Array */ g_GI = [21, 123, 243, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 48, 8]; + + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + 
golab0: while(true) + { + var /** number */ v_1 = base.cursor; + lab1: { + if (!(base.in_grouping(g_V1, 97, 252))) + { + break lab1; + } + base.cursor = v_1; + break golab0; + } + base.cursor = v_1; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_V1, 97, 252))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + return true; + }; + + /** @return {boolean} */ + function r_emphasis() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + var /** number */ v_3 = base.limit - base.cursor; + { + var /** number */ c1 = base.cursor - 4; + if (c1 < base.limit_backward) + { + return false; + } + base.cursor = c1; + } + base.cursor = base.limit - v_3; + switch (among_var) { + case 1: + var /** number */ v_4 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_GI, 97, 252))) + { + return false; + } + base.cursor = base.limit - v_4; + { + var /** number */ v_5 = base.limit - base.cursor; + lab0: { + if (!r_LONGV()) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_5; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.in_grouping_b(g_KI, 98, 382))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_verb() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_1); + if (among_var == 
0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("a")) + { + return false; + } + break; + case 3: + if (!(base.in_grouping_b(g_V1, 97, 252))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_LONGV() { + if (base.find_among_b(a_2) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_i_plural() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + if (!(base.in_grouping_b(g_RV, 97, 117))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_special_noun_endings() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!base.slice_from("lase")) + { + return false; + } + break; + case 2: + if (!base.slice_from("mise")) + { + return false; + } + break; + case 3: + if (!base.slice_from("lise")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_case_ending() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = 
base.find_among_b(a_5); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + lab0: { + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + if (!(base.in_grouping_b(g_RV, 97, 117))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_3; + if (!r_LONGV()) + { + return false; + } + } + break; + case 2: + var /** number */ v_4 = base.limit - base.cursor; + { + var /** number */ c1 = base.cursor - 4; + if (c1 < base.limit_backward) + { + return false; + } + base.cursor = c1; + } + base.cursor = base.limit - v_4; + break; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_plural_three_first_cases() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_7); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!base.slice_from("iku")) + { + return false; + } + break; + case 2: + { + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + if (!r_LONGV()) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_3; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + var /** number */ v_5 = base.limit - base.cursor; + { + var /** number */ c1 = base.cursor - 4; + if (c1 < base.limit_backward) + { + break lab2; + } + base.cursor = c1; + } + base.cursor = base.limit - v_5; + among_var = base.find_among_b(a_6); + switch (among_var) { + case 1: + if (!base.slice_from("e")) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + break lab1; 
+ } + base.cursor = base.limit - v_4; + if (!base.slice_from("t")) + { + return false; + } + } + break; + case 4: + lab3: { + var /** number */ v_6 = base.limit - base.cursor; + lab4: { + if (!(base.in_grouping_b(g_RV, 97, 117))) + { + break lab4; + } + break lab3; + } + base.cursor = base.limit - v_6; + if (!r_LONGV()) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_nu() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_8) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_undouble_kpt() { + var /** number */ among_var; + if (!(base.in_grouping_b(g_V1, 97, 252))) + { + return false; + } + if (I_p1 > base.cursor) + { + return false; + } + base.ket = base.cursor; + among_var = base.find_among_b(a_9); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("k")) + { + return false; + } + break; + case 2: + if (!base.slice_from("p")) + { + return false; + } + break; + case 3: + if (!base.slice_from("t")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_degrees() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_10); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!(base.in_grouping_b(g_RV, 97, 117))) + { + return false; + } + if (!base.slice_del()) + { + return 
false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_substantive() { + var /** number */ v_1 = base.limit - base.cursor; + r_special_noun_endings(); + base.cursor = base.limit - v_1; + var /** number */ v_2 = base.limit - base.cursor; + r_case_ending(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_plural_three_first_cases(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_degrees(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_i_plural(); + base.cursor = base.limit - v_5; + var /** number */ v_6 = base.limit - base.cursor; + r_nu(); + base.cursor = base.limit - v_6; + return true; + }; + + /** @return {boolean} */ + function r_verb_exceptions() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_11); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + if (base.cursor < base.limit) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("joo")) + { + return false; + } + break; + case 2: + if (!base.slice_from("saa")) + { + return false; + } + break; + case 3: + if (!base.slice_from("viima")) + { + return false; + } + break; + case 4: + if (!base.slice_from("keesi")) + { + return false; + } + break; + case 5: + if (!base.slice_from("l\u00F6\u00F6")) + { + return false; + } + break; + case 6: + if (!base.slice_from("l\u00F5i")) + { + return false; + } + break; + case 7: + if (!base.slice_from("loo")) + { + return false; + } + break; + case 8: + if (!base.slice_from("k\u00E4isi")) + { + return false; + } + break; + case 9: + if (!base.slice_from("s\u00F6\u00F6")) + { + return false; + } + break; + case 10: + if (!base.slice_from("too")) + { + return false; + } + break; + case 11: + if (!base.slice_from("v\u00F5isi")) + { + return false; + } + break; + case 
12: + if (!base.slice_from("j\u00E4\u00E4ma")) + { + return false; + } + break; + case 13: + if (!base.slice_from("m\u00FC\u00FCsi")) + { + return false; + } + break; + case 14: + if (!base.slice_from("luge")) + { + return false; + } + break; + case 15: + if (!base.slice_from("p\u00F5de")) + { + return false; + } + break; + case 16: + if (!base.slice_from("ladu")) + { + return false; + } + break; + case 17: + if (!base.slice_from("tegi")) + { + return false; + } + break; + case 18: + if (!base.slice_from("n\u00E4gi")) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + { + var /** number */ v_1 = base.cursor; + lab0: { + if (!r_verb_exceptions()) + { + break lab0; + } + return false; + } + base.cursor = v_1; + } + var /** number */ v_2 = base.cursor; + r_mark_regions(); + base.cursor = v_2; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_3 = base.limit - base.cursor; + r_emphasis(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + lab1: { + lab2: { + var /** number */ v_5 = base.limit - base.cursor; + lab3: { + if (!r_verb()) + { + break lab3; + } + break lab2; + } + base.cursor = base.limit - v_5; + r_substantive(); + } + } + base.cursor = base.limit - v_4; + var /** number */ v_6 = base.limit - base.cursor; + r_undouble_kpt(); + base.cursor = base.limit - v_6; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['EstonianStemmer'] = EstonianStemmer; diff --git a/js/finnish-stemmer.js b/js/finnish-stemmer.js new file mode 100644 index 0000000..d78ceff --- /dev/null +++ b/js/finnish-stemmer.js @@ -0,0 +1,788 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var FinnishStemmer = function() { + var base = new BaseStemmer(); + /** @const 
*/ var a_0 = [ + ["pa", -1, 1], + ["sti", -1, 2], + ["kaan", -1, 1], + ["han", -1, 1], + ["kin", -1, 1], + ["h\u00E4n", -1, 1], + ["k\u00E4\u00E4n", -1, 1], + ["ko", -1, 1], + ["p\u00E4", -1, 1], + ["k\u00F6", -1, 1] + ]; + + /** @const */ var a_1 = [ + ["lla", -1, -1], + ["na", -1, -1], + ["ssa", -1, -1], + ["ta", -1, -1], + ["lta", 3, -1], + ["sta", 3, -1] + ]; + + /** @const */ var a_2 = [ + ["ll\u00E4", -1, -1], + ["n\u00E4", -1, -1], + ["ss\u00E4", -1, -1], + ["t\u00E4", -1, -1], + ["lt\u00E4", 3, -1], + ["st\u00E4", 3, -1] + ]; + + /** @const */ var a_3 = [ + ["lle", -1, -1], + ["ine", -1, -1] + ]; + + /** @const */ var a_4 = [ + ["nsa", -1, 3], + ["mme", -1, 3], + ["nne", -1, 3], + ["ni", -1, 2], + ["si", -1, 1], + ["an", -1, 4], + ["en", -1, 6], + ["\u00E4n", -1, 5], + ["ns\u00E4", -1, 3] + ]; + + /** @const */ var a_5 = [ + ["aa", -1, -1], + ["ee", -1, -1], + ["ii", -1, -1], + ["oo", -1, -1], + ["uu", -1, -1], + ["\u00E4\u00E4", -1, -1], + ["\u00F6\u00F6", -1, -1] + ]; + + /** @const */ var a_6 = [ + ["a", -1, 8], + ["lla", 0, -1], + ["na", 0, -1], + ["ssa", 0, -1], + ["ta", 0, -1], + ["lta", 4, -1], + ["sta", 4, -1], + ["tta", 4, 2], + ["lle", -1, -1], + ["ine", -1, -1], + ["ksi", -1, -1], + ["n", -1, 7], + ["han", 11, 1], + ["den", 11, -1, r_VI], + ["seen", 11, -1, r_LONG], + ["hen", 11, 2], + ["tten", 11, -1, r_VI], + ["hin", 11, 3], + ["siin", 11, -1, r_VI], + ["hon", 11, 4], + ["h\u00E4n", 11, 5], + ["h\u00F6n", 11, 6], + ["\u00E4", -1, 8], + ["ll\u00E4", 22, -1], + ["n\u00E4", 22, -1], + ["ss\u00E4", 22, -1], + ["t\u00E4", 22, -1], + ["lt\u00E4", 26, -1], + ["st\u00E4", 26, -1], + ["tt\u00E4", 26, 2] + ]; + + /** @const */ var a_7 = [ + ["eja", -1, -1], + ["mma", -1, 1], + ["imma", 1, -1], + ["mpa", -1, 1], + ["impa", 3, -1], + ["mmi", -1, 1], + ["immi", 5, -1], + ["mpi", -1, 1], + ["impi", 7, -1], + ["ej\u00E4", -1, -1], + ["mm\u00E4", -1, 1], + ["imm\u00E4", 10, -1], + ["mp\u00E4", -1, 1], + ["imp\u00E4", 12, -1] + ]; + + /** @const */ var a_8 = [ 
+ ["i", -1, -1], + ["j", -1, -1] + ]; + + /** @const */ var a_9 = [ + ["mma", -1, 1], + ["imma", 0, -1] + ]; + + /** @const */ var /** Array */ g_AEI = [17, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8]; + + /** @const */ var /** Array */ g_C = [119, 223, 119, 1]; + + /** @const */ var /** Array */ g_V1 = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32]; + + /** @const */ var /** Array */ g_V2 = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32]; + + /** @const */ var /** Array */ g_particle_end = [17, 97, 24, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32]; + + var /** boolean */ B_ending_removed = false; + var /** string */ S_x = ''; + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + I_p2 = base.limit; + golab0: while(true) + { + var /** number */ v_1 = base.cursor; + lab1: { + if (!(base.in_grouping(g_V1, 97, 246))) + { + break lab1; + } + base.cursor = v_1; + break golab0; + } + base.cursor = v_1; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_V1, 97, 246))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + golab4: while(true) + { + var /** number */ v_3 = base.cursor; + lab5: { + if (!(base.in_grouping(g_V1, 97, 246))) + { + break lab5; + } + base.cursor = v_3; + break golab4; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab6: while(true) + { + lab7: { + if (!(base.out_grouping(g_V1, 97, 246))) + { + break lab7; + } + break golab6; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p2 = base.cursor; + return true; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_particle_etc() { + 
var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!(base.in_grouping_b(g_particle_end, 97, 246))) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + break; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_possessive() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + { + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("k"))) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_3; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("kse"))) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_from("ksi")) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + break; + case 4: + if (base.find_among_b(a_1) == 0) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 5: + if (base.find_among_b(a_2) == 0) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 6: + if (base.find_among_b(a_3) == 0) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return 
{boolean} */ + function r_LONG() { + if (base.find_among_b(a_5) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_VI() { + if (!(base.eq_s_b("i"))) + { + return false; + } + if (!(base.in_grouping_b(g_V2, 97, 246))) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_case_ending() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!(base.eq_s_b("a"))) + { + return false; + } + break; + case 2: + if (!(base.eq_s_b("e"))) + { + return false; + } + break; + case 3: + if (!(base.eq_s_b("i"))) + { + return false; + } + break; + case 4: + if (!(base.eq_s_b("o"))) + { + return false; + } + break; + case 5: + if (!(base.eq_s_b("\u00E4"))) + { + return false; + } + break; + case 6: + if (!(base.eq_s_b("\u00F6"))) + { + return false; + } + break; + case 7: + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + var /** number */ v_4 = base.limit - base.cursor; + lab1: { + var /** number */ v_5 = base.limit - base.cursor; + lab2: { + if (!r_LONG()) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_5; + if (!(base.eq_s_b("ie"))) + { + base.cursor = base.limit - v_3; + break lab0; + } + } + base.cursor = base.limit - v_4; + if (base.cursor <= base.limit_backward) + { + base.cursor = base.limit - v_3; + break lab0; + } + base.cursor--; + base.bra = base.cursor; + } + break; + case 8: + if (!(base.in_grouping_b(g_V1, 97, 246))) + { + return false; + } + if (!(base.in_grouping_b(g_C, 98, 122))) + { + return false; + } + break; + } + if (!base.slice_del()) + { + return false; + } + B_ending_removed = true; + return true; + }; + + /** @return 
{boolean} */ + function r_other_endings() { + var /** number */ among_var; + if (base.cursor < I_p2) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p2; + base.ket = base.cursor; + among_var = base.find_among_b(a_7); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + { + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("po"))) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_3; + } + break; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_i_plural() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_8) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_t_plural() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (!(base.eq_s_b("t"))) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + var /** number */ v_3 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_V1, 97, 246))) + { + base.limit_backward = v_2; + return false; + } + base.cursor = base.limit - v_3; + if (!base.slice_del()) + { + return false; + } + base.limit_backward = v_2; + if (base.cursor < I_p2) + { + return false; + } + var /** number */ v_5 = base.limit_backward; + base.limit_backward = I_p2; + base.ket = base.cursor; + among_var = base.find_among_b(a_9); + if (among_var == 0) + { + base.limit_backward = v_5; + return false; + } + base.bra = 
base.cursor; + base.limit_backward = v_5; + switch (among_var) { + case 1: + { + var /** number */ v_6 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("po"))) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_6; + } + break; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_tidy() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + var /** number */ v_4 = base.limit - base.cursor; + if (!r_LONG()) + { + break lab0; + } + base.cursor = base.limit - v_4; + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + break lab0; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + base.cursor = base.limit - v_3; + var /** number */ v_5 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (!(base.in_grouping_b(g_AEI, 97, 228))) + { + break lab1; + } + base.bra = base.cursor; + if (!(base.in_grouping_b(g_C, 98, 122))) + { + break lab1; + } + if (!base.slice_del()) + { + return false; + } + } + base.cursor = base.limit - v_5; + var /** number */ v_6 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + if (!(base.eq_s_b("j"))) + { + break lab2; + } + base.bra = base.cursor; + lab3: { + var /** number */ v_7 = base.limit - base.cursor; + lab4: { + if (!(base.eq_s_b("o"))) + { + break lab4; + } + break lab3; + } + base.cursor = base.limit - v_7; + if (!(base.eq_s_b("u"))) + { + break lab2; + } + } + if (!base.slice_del()) + { + return false; + } + } + base.cursor = base.limit - v_6; + var /** number */ v_8 = base.limit - base.cursor; + lab5: { + base.ket = base.cursor; + if (!(base.eq_s_b("o"))) + { + break lab5; + } + base.bra = base.cursor; + if (!(base.eq_s_b("j"))) + { + break lab5; + } + if (!base.slice_del()) + { + return false; + } + } + 
base.cursor = base.limit - v_8; + base.limit_backward = v_2; + golab6: while(true) + { + var /** number */ v_9 = base.limit - base.cursor; + lab7: { + if (!(base.out_grouping_b(g_V1, 97, 246))) + { + break lab7; + } + base.cursor = base.limit - v_9; + break golab6; + } + base.cursor = base.limit - v_9; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + base.ket = base.cursor; + if (!(base.in_grouping_b(g_C, 98, 122))) + { + return false; + } + base.bra = base.cursor; + S_x = base.slice_to(); + if (S_x == '') + { + return false; + } + if (!(base.eq_s_b(S_x))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_mark_regions(); + base.cursor = v_1; + B_ending_removed = false; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_particle_etc(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_possessive(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_case_ending(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_other_endings(); + base.cursor = base.limit - v_5; + lab0: { + lab1: { + if (!B_ending_removed) + { + break lab1; + } + var /** number */ v_7 = base.limit - base.cursor; + r_i_plural(); + base.cursor = base.limit - v_7; + break lab0; + } + var /** number */ v_8 = base.limit - base.cursor; + r_t_plural(); + base.cursor = base.limit - v_8; + } + var /** number */ v_9 = base.limit - base.cursor; + r_tidy(); + base.cursor = base.limit - v_9; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['FinnishStemmer'] = FinnishStemmer; diff --git 
a/js/french-stemmer.js b/js/french-stemmer.js new file mode 100644 index 0000000..36475be --- /dev/null +++ b/js/french-stemmer.js @@ -0,0 +1,1311 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var FrenchStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["col", -1, -1], + ["par", -1, -1], + ["tap", -1, -1] + ]; + + /** @const */ var a_1 = [ + ["", -1, 7], + ["H", 0, 6], + ["He", 1, 4], + ["Hi", 1, 5], + ["I", 0, 1], + ["U", 0, 2], + ["Y", 0, 3] + ]; + + /** @const */ var a_2 = [ + ["iqU", -1, 3], + ["abl", -1, 3], + ["I\u00E8r", -1, 4], + ["i\u00E8r", -1, 4], + ["eus", -1, 2], + ["iv", -1, 1] + ]; + + /** @const */ var a_3 = [ + ["ic", -1, 2], + ["abil", -1, 1], + ["iv", -1, 3] + ]; + + /** @const */ var a_4 = [ + ["iqUe", -1, 1], + ["atrice", -1, 2], + ["ance", -1, 1], + ["ence", -1, 5], + ["logie", -1, 3], + ["able", -1, 1], + ["isme", -1, 1], + ["euse", -1, 11], + ["iste", -1, 1], + ["ive", -1, 8], + ["if", -1, 8], + ["usion", -1, 4], + ["ation", -1, 2], + ["ution", -1, 4], + ["ateur", -1, 2], + ["iqUes", -1, 1], + ["atrices", -1, 2], + ["ances", -1, 1], + ["ences", -1, 5], + ["logies", -1, 3], + ["ables", -1, 1], + ["ismes", -1, 1], + ["euses", -1, 11], + ["istes", -1, 1], + ["ives", -1, 8], + ["ifs", -1, 8], + ["usions", -1, 4], + ["ations", -1, 2], + ["utions", -1, 4], + ["ateurs", -1, 2], + ["ments", -1, 15], + ["ements", 30, 6], + ["issements", 31, 12], + ["it\u00E9s", -1, 7], + ["ment", -1, 15], + ["ement", 34, 6], + ["issement", 35, 12], + ["amment", 34, 13], + ["emment", 34, 14], + ["aux", -1, 10], + ["eaux", 39, 9], + ["eux", -1, 1], + ["it\u00E9", -1, 7] + ]; + + /** @const */ var a_5 = [ + ["ira", -1, 1], + ["ie", -1, 1], + ["isse", -1, 1], + ["issante", -1, 1], + ["i", -1, 1], + ["irai", 4, 1], + ["ir", -1, 1], + ["iras", -1, 1], + ["ies", -1, 1], + ["\u00EEmes", -1, 1], + ["isses", -1, 1], + ["issantes", -1, 1], + ["\u00EEtes", -1, 1], + ["is", -1, 1], + ["irais", 13, 1], + 
["issais", 13, 1], + ["irions", -1, 1], + ["issions", -1, 1], + ["irons", -1, 1], + ["issons", -1, 1], + ["issants", -1, 1], + ["it", -1, 1], + ["irait", 21, 1], + ["issait", 21, 1], + ["issant", -1, 1], + ["iraIent", -1, 1], + ["issaIent", -1, 1], + ["irent", -1, 1], + ["issent", -1, 1], + ["iront", -1, 1], + ["\u00EEt", -1, 1], + ["iriez", -1, 1], + ["issiez", -1, 1], + ["irez", -1, 1], + ["issez", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["a", -1, 3], + ["era", 0, 2], + ["asse", -1, 3], + ["ante", -1, 3], + ["\u00E9e", -1, 2], + ["ai", -1, 3], + ["erai", 5, 2], + ["er", -1, 2], + ["as", -1, 3], + ["eras", 8, 2], + ["\u00E2mes", -1, 3], + ["asses", -1, 3], + ["antes", -1, 3], + ["\u00E2tes", -1, 3], + ["\u00E9es", -1, 2], + ["ais", -1, 3], + ["erais", 15, 2], + ["ions", -1, 1], + ["erions", 17, 2], + ["assions", 17, 3], + ["erons", -1, 2], + ["ants", -1, 3], + ["\u00E9s", -1, 2], + ["ait", -1, 3], + ["erait", 23, 2], + ["ant", -1, 3], + ["aIent", -1, 3], + ["eraIent", 26, 2], + ["\u00E8rent", -1, 2], + ["assent", -1, 3], + ["eront", -1, 2], + ["\u00E2t", -1, 3], + ["ez", -1, 2], + ["iez", 32, 2], + ["eriez", 33, 2], + ["assiez", 33, 3], + ["erez", 32, 2], + ["\u00E9", -1, 2] + ]; + + /** @const */ var a_7 = [ + ["e", -1, 3], + ["I\u00E8re", 0, 2], + ["i\u00E8re", 0, 2], + ["ion", -1, 1], + ["Ier", -1, 2], + ["ier", -1, 2] + ]; + + /** @const */ var a_8 = [ + ["ell", -1, -1], + ["eill", -1, -1], + ["enn", -1, -1], + ["onn", -1, -1], + ["ett", -1, -1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 130, 103, 8, 5]; + + /** @const */ var /** Array */ g_keep_with_s = [1, 65, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_prelude() { + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + var /** number */ v_2 = base.cursor; + lab2: { + 
lab3: { + var /** number */ v_3 = base.cursor; + lab4: { + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab4; + } + base.bra = base.cursor; + lab5: { + var /** number */ v_4 = base.cursor; + lab6: { + if (!(base.eq_s("u"))) + { + break lab6; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab6; + } + if (!base.slice_from("U")) + { + return false; + } + break lab5; + } + base.cursor = v_4; + lab7: { + if (!(base.eq_s("i"))) + { + break lab7; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab7; + } + if (!base.slice_from("I")) + { + return false; + } + break lab5; + } + base.cursor = v_4; + if (!(base.eq_s("y"))) + { + break lab4; + } + base.ket = base.cursor; + if (!base.slice_from("Y")) + { + return false; + } + } + break lab3; + } + base.cursor = v_3; + lab8: { + base.bra = base.cursor; + if (!(base.eq_s("\u00EB"))) + { + break lab8; + } + base.ket = base.cursor; + if (!base.slice_from("He")) + { + return false; + } + break lab3; + } + base.cursor = v_3; + lab9: { + base.bra = base.cursor; + if (!(base.eq_s("\u00EF"))) + { + break lab9; + } + base.ket = base.cursor; + if (!base.slice_from("Hi")) + { + return false; + } + break lab3; + } + base.cursor = v_3; + lab10: { + base.bra = base.cursor; + if (!(base.eq_s("y"))) + { + break lab10; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab10; + } + if (!base.slice_from("Y")) + { + return false; + } + break lab3; + } + base.cursor = v_3; + if (!(base.eq_s("q"))) + { + break lab2; + } + base.bra = base.cursor; + if (!(base.eq_s("u"))) + { + break lab2; + } + base.ket = base.cursor; + if (!base.slice_from("U")) + { + return false; + } + } + base.cursor = v_2; + break golab1; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = 
base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab2; + } + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab2; + } + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + break lab1; + } + base.cursor = v_2; + lab3: { + if (base.find_among(a_0) == 0) + { + break lab3; + } + break lab1; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + golab4: while(true) + { + lab5: { + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab5; + } + break golab4; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_pV = base.cursor; + } + base.cursor = v_1; + var /** number */ v_4 = base.cursor; + lab6: { + golab7: while(true) + { + lab8: { + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab6; + } + base.cursor++; + } + golab9: while(true) + { + lab10: { + if (!(base.out_grouping(g_v, 97, 251))) + { + break lab10; + } + break golab9; + } + if (base.cursor >= base.limit) + { + break lab6; + } + base.cursor++; + } + I_p1 = base.cursor; + golab11: while(true) + { + lab12: { + if (!(base.in_grouping(g_v, 97, 251))) + { + break lab12; + } + break golab11; + } + if (base.cursor >= base.limit) + { + break lab6; + } + base.cursor++; + } + golab13: while(true) + { + lab14: { + if (!(base.out_grouping(g_v, 97, 251))) + { + break lab14; + } + break golab13; + } + if (base.cursor >= base.limit) + { + break lab6; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_4; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_1); + base.ket = base.cursor; + 
switch (among_var) { + case 1: + if (!base.slice_from("i")) + { + return false; + } + break; + case 2: + if (!base.slice_from("u")) + { + return false; + } + break; + case 3: + if (!base.slice_from("y")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u00EB")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u00EF")) + { + return false; + } + break; + case 6: + if (!base.slice_del()) + { + return false; + } + break; + case 7: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("ic"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + if (!r_R2()) + { + break lab2; + } + if (!base.slice_del()) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + if (!base.slice_from("iqU")) + { + return false; + } + } + } + break; + case 3: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("log")) + { + return false; + } + break; + case 4: + if (!r_R2()) + { + return false; + } + if 
(!base.slice_from("u")) + { + return false; + } + break; + case 5: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("ent")) + { + return false; + } + break; + case 6: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + base.cursor = base.limit - v_3; + break lab3; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + base.cursor = base.limit - v_3; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_3; + break lab3; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_3; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + lab4: { + var /** number */ v_4 = base.limit - base.cursor; + lab5: { + if (!r_R2()) + { + break lab5; + } + if (!base.slice_del()) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_4; + if (!r_R1()) + { + base.cursor = base.limit - v_3; + break lab3; + } + if (!base.slice_from("eux")) + { + return false; + } + } + break; + case 3: + if (!r_R2()) + { + base.cursor = base.limit - v_3; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + break; + case 4: + if (!r_RV()) + { + base.cursor = base.limit - v_3; + break lab3; + } + if (!base.slice_from("i")) + { + return false; + } + break; + } + } + break; + case 7: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_5 = base.limit - base.cursor; + lab6: { + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + base.cursor = base.limit - v_5; + break lab6; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab7: { + var /** number */ v_6 = base.limit - base.cursor; + 
lab8: { + if (!r_R2()) + { + break lab8; + } + if (!base.slice_del()) + { + return false; + } + break lab7; + } + base.cursor = base.limit - v_6; + if (!base.slice_from("abl")) + { + return false; + } + } + break; + case 2: + lab9: { + var /** number */ v_7 = base.limit - base.cursor; + lab10: { + if (!r_R2()) + { + break lab10; + } + if (!base.slice_del()) + { + return false; + } + break lab9; + } + base.cursor = base.limit - v_7; + if (!base.slice_from("iqU")) + { + return false; + } + } + break; + case 3: + if (!r_R2()) + { + base.cursor = base.limit - v_5; + break lab6; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + break; + case 8: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_8 = base.limit - base.cursor; + lab11: { + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_8; + break lab11; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_8; + break lab11; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("ic"))) + { + base.cursor = base.limit - v_8; + break lab11; + } + base.bra = base.cursor; + lab12: { + var /** number */ v_9 = base.limit - base.cursor; + lab13: { + if (!r_R2()) + { + break lab13; + } + if (!base.slice_del()) + { + return false; + } + break lab12; + } + base.cursor = base.limit - v_9; + if (!base.slice_from("iqU")) + { + return false; + } + } + } + break; + case 9: + if (!base.slice_from("eau")) + { + return false; + } + break; + case 10: + if (!r_R1()) + { + return false; + } + if (!base.slice_from("al")) + { + return false; + } + break; + case 11: + lab14: { + var /** number */ v_10 = base.limit - base.cursor; + lab15: { + if (!r_R2()) + { + break lab15; + } + if (!base.slice_del()) + { + return false; + } + break lab14; + } + base.cursor = base.limit - v_10; + if (!r_R1()) + { + return false; + } + if (!base.slice_from("eux")) + { + return 
false; + } + } + break; + case 12: + if (!r_R1()) + { + return false; + } + if (!(base.out_grouping_b(g_v, 97, 251))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 13: + if (!r_RV()) + { + return false; + } + if (!base.slice_from("ant")) + { + return false; + } + return false; + case 14: + if (!r_RV()) + { + return false; + } + if (!base.slice_from("ent")) + { + return false; + } + return false; + case 15: + var /** number */ v_11 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_v, 97, 251))) + { + return false; + } + if (!r_RV()) + { + return false; + } + base.cursor = base.limit - v_11; + if (!base.slice_del()) + { + return false; + } + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_i_verb_suffix() { + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + if (base.find_among_b(a_5) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + { + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("H"))) + { + break lab0; + } + base.limit_backward = v_2; + return false; + } + base.cursor = base.limit - v_3; + } + if (!(base.out_grouping_b(g_v, 97, 251))) + { + base.limit_backward = v_2; + return false; + } + if (!base.slice_del()) + { + return false; + } + base.limit_backward = v_2; + return true; + }; + + /** @return {boolean} */ + function r_verb_suffix() { + var /** number */ among_var; + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + base.limit_backward = v_2; + return false; + } + if (!base.slice_del()) + { + return false; + } + break; 
+ case 2: + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("e"))) + { + base.cursor = base.limit - v_3; + break lab0; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + break; + } + base.limit_backward = v_2; + return true; + }; + + /** @return {boolean} */ + function r_residual_suffix() { + var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("s"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + if (!(base.eq_s_b("Hi"))) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_3; + if (!(base.out_grouping_b(g_keep_with_s, 97, 232))) + { + base.cursor = base.limit - v_1; + break lab0; + } + } + base.cursor = base.limit - v_2; + if (!base.slice_del()) + { + return false; + } + } + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_5 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + among_var = base.find_among_b(a_7); + if (among_var == 0) + { + base.limit_backward = v_5; + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + base.limit_backward = v_5; + return false; + } + lab3: { + var /** number */ v_6 = base.limit - base.cursor; + lab4: { + if (!(base.eq_s_b("s"))) + { + break lab4; + } + break lab3; + } + base.cursor = base.limit - v_6; + if (!(base.eq_s_b("t"))) + { + base.limit_backward = v_5; + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("i")) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + 
break; + } + base.limit_backward = v_5; + return true; + }; + + /** @return {boolean} */ + function r_un_double() { + var /** number */ v_1 = base.limit - base.cursor; + if (base.find_among_b(a_8) == 0) + { + return false; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_un_accent() { + { + var v_1 = 1; + while(true) + { + lab0: { + if (!(base.out_grouping_b(g_v, 97, 251))) + { + break lab0; + } + v_1--; + continue; + } + break; + } + if (v_1 > 0) + { + return false; + } + } + base.ket = base.cursor; + lab1: { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + if (!(base.eq_s_b("\u00E9"))) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_3; + if (!(base.eq_s_b("\u00E8"))) + { + return false; + } + } + base.bra = base.cursor; + if (!base.slice_from("e")) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_prelude(); + base.cursor = v_1; + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + var /** number */ v_5 = base.limit - base.cursor; + lab3: { + var /** number */ v_6 = base.limit - base.cursor; + lab4: { + if (!r_standard_suffix()) + { + break lab4; + } + break lab3; + } + base.cursor = base.limit - v_6; + lab5: { + if (!r_i_verb_suffix()) + { + break lab5; + } + break lab3; + } + base.cursor = base.limit - v_6; + if (!r_verb_suffix()) + { + break lab2; + } + } + base.cursor = base.limit - v_5; + var /** number */ v_7 = base.limit - base.cursor; + lab6: { + base.ket = base.cursor; + lab7: { + var /** number */ v_8 = base.limit - base.cursor; + 
lab8: { + if (!(base.eq_s_b("Y"))) + { + break lab8; + } + base.bra = base.cursor; + if (!base.slice_from("i")) + { + return false; + } + break lab7; + } + base.cursor = base.limit - v_8; + if (!(base.eq_s_b("\u00E7"))) + { + base.cursor = base.limit - v_7; + break lab6; + } + base.bra = base.cursor; + if (!base.slice_from("c")) + { + return false; + } + } + } + break lab1; + } + base.cursor = base.limit - v_4; + if (!r_residual_suffix()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_3; + var /** number */ v_9 = base.limit - base.cursor; + r_un_double(); + base.cursor = base.limit - v_9; + var /** number */ v_10 = base.limit - base.cursor; + r_un_accent(); + base.cursor = base.limit - v_10; + base.cursor = base.limit_backward; + var /** number */ v_11 = base.cursor; + r_postlude(); + base.cursor = v_11; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['FrenchStemmer'] = FrenchStemmer; diff --git a/js/german-stemmer.js b/js/german-stemmer.js new file mode 100644 index 0000000..93f92f8 --- /dev/null +++ b/js/german-stemmer.js @@ -0,0 +1,591 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var GermanStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["", -1, 5], + ["ae", 0, 2], + ["oe", 0, 3], + ["qu", 0, -1], + ["ue", 0, 4], + ["\u00DF", 0, 1] + ]; + + /** @const */ var a_1 = [ + ["", -1, 5], + ["U", 0, 2], + ["Y", 0, 1], + ["\u00E4", 0, 3], + ["\u00F6", 0, 4], + ["\u00FC", 0, 2] + ]; + + /** @const */ var a_2 = [ + ["e", -1, 2], + ["em", -1, 1], + ["en", -1, 2], + ["ern", -1, 1], + ["er", -1, 1], + ["s", -1, 3], + ["es", 5, 2] + ]; + + /** @const */ var a_3 = [ + ["en", -1, 1], + ["er", -1, 1], + ["st", -1, 2], + ["est", 2, 1] + ]; + + /** @const */ var a_4 = [ + ["ig", -1, 1], + ["lich", -1, 1] + ]; + + /** @const */ var a_5 = [ + ["end", -1, 1], + 
["ig", -1, 2], + ["ung", -1, 1], + ["lich", -1, 3], + ["isch", -1, 2], + ["ik", -1, 2], + ["heit", -1, 3], + ["keit", -1, 4] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8]; + + /** @const */ var /** Array */ g_s_ending = [117, 30, 5]; + + /** @const */ var /** Array */ g_st_ending = [117, 30, 4]; + + var /** number */ I_x = 0; + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_prelude() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + while(true) + { + var /** number */ v_2 = base.cursor; + lab0: { + golab1: while(true) + { + var /** number */ v_3 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab2; + } + base.bra = base.cursor; + lab3: { + var /** number */ v_4 = base.cursor; + lab4: { + if (!(base.eq_s("u"))) + { + break lab4; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab4; + } + if (!base.slice_from("U")) + { + return false; + } + break lab3; + } + base.cursor = v_4; + if (!(base.eq_s("y"))) + { + break lab2; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab2; + } + if (!base.slice_from("Y")) + { + return false; + } + } + base.cursor = v_3; + break golab1; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + continue; + } + base.cursor = v_2; + break; + } + base.cursor = v_1; + while(true) + { + var /** number */ v_5 = base.cursor; + lab5: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("ss")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u00E4")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u00F6")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u00FC")) + { + return false; + } + break; + case 5: + if 
(base.cursor >= base.limit) + { + break lab5; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_5; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + { + var /** number */ c1 = base.cursor + 3; + if (c1 > base.limit) + { + return false; + } + base.cursor = c1; + } + I_x = base.cursor; + base.cursor = v_1; + golab0: while(true) + { + lab1: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab1; + } + break golab0; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + lab4: { + if (I_p1 >= I_x) + { + break lab4; + } + I_p1 = I_x; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p2 = base.cursor; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_1); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("y")) + { + return false; + } + break; + case 2: + if (!base.slice_from("u")) + { + return false; + } + break; + case 3: + if (!base.slice_from("a")) + { + return false; + } + break; + case 4: + if (!base.slice_from("o")) + { + return false; + } + break; + case 5: + if (base.cursor >= base.limit) + { + break lab0; + } + 
base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!r_R1()) + { + break lab0; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (!(base.eq_s_b("s"))) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra = base.cursor; + if (!(base.eq_s_b("nis"))) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 3: + if (!(base.in_grouping_b(g_s_ending, 98, 116))) + { + break lab0; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + base.cursor = base.limit - v_1; + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + break lab2; + } + base.bra = base.cursor; + if (!r_R1()) + { + break lab2; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.in_grouping_b(g_st_ending, 98, 116))) + { + break lab2; + } + { + var /** number */ c1 = base.cursor - 3; + if (c1 < base.limit_backward) + { + break lab2; + } + base.cursor = c1; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + among_var 
= base.find_among_b(a_5); + if (among_var == 0) + { + break lab3; + } + base.bra = base.cursor; + if (!r_R2()) + { + break lab3; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + var /** number */ v_5 = base.limit - base.cursor; + lab4: { + base.ket = base.cursor; + if (!(base.eq_s_b("ig"))) + { + base.cursor = base.limit - v_5; + break lab4; + } + base.bra = base.cursor; + { + var /** number */ v_6 = base.limit - base.cursor; + lab5: { + if (!(base.eq_s_b("e"))) + { + break lab5; + } + base.cursor = base.limit - v_5; + break lab4; + } + base.cursor = base.limit - v_6; + } + if (!r_R2()) + { + base.cursor = base.limit - v_5; + break lab4; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 2: + { + var /** number */ v_7 = base.limit - base.cursor; + lab6: { + if (!(base.eq_s_b("e"))) + { + break lab6; + } + break lab3; + } + base.cursor = base.limit - v_7; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + var /** number */ v_8 = base.limit - base.cursor; + lab7: { + base.ket = base.cursor; + lab8: { + var /** number */ v_9 = base.limit - base.cursor; + lab9: { + if (!(base.eq_s_b("er"))) + { + break lab9; + } + break lab8; + } + base.cursor = base.limit - v_9; + if (!(base.eq_s_b("en"))) + { + base.cursor = base.limit - v_8; + break lab7; + } + } + base.bra = base.cursor; + if (!r_R1()) + { + base.cursor = base.limit - v_8; + break lab7; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 4: + if (!base.slice_del()) + { + return false; + } + var /** number */ v_10 = base.limit - base.cursor; + lab10: { + base.ket = base.cursor; + if (base.find_among_b(a_4) == 0) + { + base.cursor = base.limit - v_10; + break lab10; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_10; + break lab10; + } + if (!base.slice_del()) + { + return false; + } + } + break; + } + } + base.cursor = base.limit - 
v_4; + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_prelude(); + base.cursor = v_1; + var /** number */ v_2 = base.cursor; + r_mark_regions(); + base.cursor = v_2; + base.limit_backward = base.cursor; base.cursor = base.limit; + r_standard_suffix(); + base.cursor = base.limit_backward; + var /** number */ v_4 = base.cursor; + r_postlude(); + base.cursor = v_4; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['GermanStemmer'] = GermanStemmer; diff --git a/js/greek-stemmer.js b/js/greek-stemmer.js new file mode 100644 index 0000000..865b0ea --- /dev/null +++ b/js/greek-stemmer.js @@ -0,0 +1,2870 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var GreekStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["", -1, 25], + ["\u0386", 0, 1], + ["\u0388", 0, 5], + ["\u0389", 0, 7], + ["\u038A", 0, 9], + ["\u038C", 0, 15], + ["\u038E", 0, 20], + ["\u038F", 0, 24], + ["\u0390", 0, 7], + ["\u0391", 0, 1], + ["\u0392", 0, 2], + ["\u0393", 0, 3], + ["\u0394", 0, 4], + ["\u0395", 0, 5], + ["\u0396", 0, 6], + ["\u0397", 0, 7], + ["\u0398", 0, 8], + ["\u0399", 0, 9], + ["\u039A", 0, 10], + ["\u039B", 0, 11], + ["\u039C", 0, 12], + ["\u039D", 0, 13], + ["\u039E", 0, 14], + ["\u039F", 0, 15], + ["\u03A0", 0, 16], + ["\u03A1", 0, 17], + ["\u03A3", 0, 18], + ["\u03A4", 0, 19], + ["\u03A5", 0, 20], + ["\u03A6", 0, 21], + ["\u03A7", 0, 22], + ["\u03A8", 0, 23], + ["\u03A9", 0, 24], + ["\u03AA", 0, 9], + ["\u03AB", 0, 20], + ["\u03AC", 0, 1], + ["\u03AD", 0, 5], + ["\u03AE", 0, 7], + ["\u03AF", 0, 9], + ["\u03B0", 0, 20], + ["\u03C2", 0, 18], + ["\u03CA", 0, 7], + ["\u03CB", 0, 20], + ["\u03CC", 0, 15], + ["\u03CD", 0, 20], + ["\u03CE", 0, 24] + ]; + + /** @const */ var a_1 = [ + ["\u03C3\u03BA\u03B1\u03B3\u03B9\u03B1", 
-1, 2], + ["\u03C6\u03B1\u03B3\u03B9\u03B1", -1, 1], + ["\u03BF\u03BB\u03BF\u03B3\u03B9\u03B1", -1, 3], + ["\u03C3\u03BF\u03B3\u03B9\u03B1", -1, 4], + ["\u03C4\u03B1\u03C4\u03BF\u03B3\u03B9\u03B1", -1, 5], + ["\u03BA\u03C1\u03B5\u03B1\u03C4\u03B1", -1, 6], + ["\u03C0\u03B5\u03C1\u03B1\u03C4\u03B1", -1, 7], + ["\u03C4\u03B5\u03C1\u03B1\u03C4\u03B1", -1, 8], + ["\u03B3\u03B5\u03B3\u03BF\u03BD\u03BF\u03C4\u03B1", -1, 11], + ["\u03BA\u03B1\u03B8\u03B5\u03C3\u03C4\u03C9\u03C4\u03B1", -1, 10], + ["\u03C6\u03C9\u03C4\u03B1", -1, 9], + ["\u03C0\u03B5\u03C1\u03B1\u03C4\u03B7", -1, 7], + ["\u03C3\u03BA\u03B1\u03B3\u03B9\u03C9\u03BD", -1, 2], + ["\u03C6\u03B1\u03B3\u03B9\u03C9\u03BD", -1, 1], + ["\u03BF\u03BB\u03BF\u03B3\u03B9\u03C9\u03BD", -1, 3], + ["\u03C3\u03BF\u03B3\u03B9\u03C9\u03BD", -1, 4], + ["\u03C4\u03B1\u03C4\u03BF\u03B3\u03B9\u03C9\u03BD", -1, 5], + ["\u03BA\u03C1\u03B5\u03B1\u03C4\u03C9\u03BD", -1, 6], + ["\u03C0\u03B5\u03C1\u03B1\u03C4\u03C9\u03BD", -1, 7], + ["\u03C4\u03B5\u03C1\u03B1\u03C4\u03C9\u03BD", -1, 8], + ["\u03B3\u03B5\u03B3\u03BF\u03BD\u03BF\u03C4\u03C9\u03BD", -1, 11], + ["\u03BA\u03B1\u03B8\u03B5\u03C3\u03C4\u03C9\u03C4\u03C9\u03BD", -1, 10], + ["\u03C6\u03C9\u03C4\u03C9\u03BD", -1, 9], + ["\u03BA\u03C1\u03B5\u03B1\u03C3", -1, 6], + ["\u03C0\u03B5\u03C1\u03B1\u03C3", -1, 7], + ["\u03C4\u03B5\u03C1\u03B1\u03C3", -1, 8], + ["\u03B3\u03B5\u03B3\u03BF\u03BD\u03BF\u03C3", -1, 11], + ["\u03BA\u03C1\u03B5\u03B1\u03C4\u03BF\u03C3", -1, 6], + ["\u03C0\u03B5\u03C1\u03B1\u03C4\u03BF\u03C3", -1, 7], + ["\u03C4\u03B5\u03C1\u03B1\u03C4\u03BF\u03C3", -1, 8], + ["\u03B3\u03B5\u03B3\u03BF\u03BD\u03BF\u03C4\u03BF\u03C3", -1, 11], + ["\u03BA\u03B1\u03B8\u03B5\u03C3\u03C4\u03C9\u03C4\u03BF\u03C3", -1, 10], + ["\u03C6\u03C9\u03C4\u03BF\u03C3", -1, 9], + ["\u03BA\u03B1\u03B8\u03B5\u03C3\u03C4\u03C9\u03C3", -1, 10], + ["\u03C6\u03C9\u03C3", -1, 9], + ["\u03C3\u03BA\u03B1\u03B3\u03B9\u03BF\u03C5", -1, 2], + ["\u03C6\u03B1\u03B3\u03B9\u03BF\u03C5", -1, 1], + 
["\u03BF\u03BB\u03BF\u03B3\u03B9\u03BF\u03C5", -1, 3], + ["\u03C3\u03BF\u03B3\u03B9\u03BF\u03C5", -1, 4], + ["\u03C4\u03B1\u03C4\u03BF\u03B3\u03B9\u03BF\u03C5", -1, 5] + ]; + + /** @const */ var a_2 = [ + ["\u03C0\u03B1", -1, 1], + ["\u03BE\u03B1\u03BD\u03B1\u03C0\u03B1", 0, 1], + ["\u03B5\u03C0\u03B1", 0, 1], + ["\u03C0\u03B5\u03C1\u03B9\u03C0\u03B1", 0, 1], + ["\u03B1\u03BD\u03B1\u03BC\u03C0\u03B1", 0, 1], + ["\u03B5\u03BC\u03C0\u03B1", 0, 1], + ["\u03B2", -1, 2], + ["\u03B4\u03B1\u03BD\u03B5", -1, 1], + ["\u03B2\u03B1\u03B8\u03C5\u03C1\u03B9", -1, 2], + ["\u03B2\u03B1\u03C1\u03BA", -1, 2], + ["\u03BC\u03B1\u03C1\u03BA", -1, 2], + ["\u03BB", -1, 2], + ["\u03BC", -1, 2], + ["\u03BA\u03BF\u03C1\u03BD", -1, 2], + ["\u03B1\u03B8\u03C1\u03BF", -1, 1], + ["\u03C3\u03C5\u03BD\u03B1\u03B8\u03C1\u03BF", 14, 1], + ["\u03C0", -1, 2], + ["\u03B9\u03BC\u03C0", 16, 2], + ["\u03C1", -1, 2], + ["\u03BC\u03B1\u03C1", 18, 2], + ["\u03B1\u03BC\u03C0\u03B1\u03C1", 18, 2], + ["\u03B3\u03BA\u03C1", 18, 2], + ["\u03B2\u03BF\u03BB\u03B2\u03BF\u03C1", 18, 2], + ["\u03B3\u03BB\u03C5\u03BA\u03BF\u03C1", 18, 2], + ["\u03C0\u03B9\u03C0\u03B5\u03C1\u03BF\u03C1", 18, 2], + ["\u03C0\u03C1", 18, 2], + ["\u03BC\u03C0\u03C1", 25, 2], + ["\u03B1\u03C1\u03C1", 18, 2], + ["\u03B3\u03BB\u03C5\u03BA\u03C5\u03C1", 18, 2], + ["\u03C0\u03BF\u03BB\u03C5\u03C1", 18, 2], + ["\u03BB\u03BF\u03C5", -1, 2] + ]; + + /** @const */ var a_3 = [ + ["\u03B9\u03B6\u03B1", -1, 1], + ["\u03B9\u03B6\u03B5", -1, 1], + ["\u03B9\u03B6\u03B1\u03BC\u03B5", -1, 1], + ["\u03B9\u03B6\u03BF\u03C5\u03BC\u03B5", -1, 1], + ["\u03B9\u03B6\u03B1\u03BD\u03B5", -1, 1], + ["\u03B9\u03B6\u03BF\u03C5\u03BD\u03B5", -1, 1], + ["\u03B9\u03B6\u03B1\u03C4\u03B5", -1, 1], + ["\u03B9\u03B6\u03B5\u03C4\u03B5", -1, 1], + ["\u03B9\u03B6\u03B5\u03B9", -1, 1], + ["\u03B9\u03B6\u03B1\u03BD", -1, 1], + ["\u03B9\u03B6\u03BF\u03C5\u03BD", -1, 1], + ["\u03B9\u03B6\u03B5\u03C3", -1, 1], + ["\u03B9\u03B6\u03B5\u03B9\u03C3", -1, 1], + ["\u03B9\u03B6\u03C9", 
-1, 1] + ]; + + /** @const */ var a_4 = [ + ["\u03B2\u03B9", -1, 1], + ["\u03BB\u03B9", -1, 1], + ["\u03B1\u03BB", -1, 1], + ["\u03B5\u03BD", -1, 1], + ["\u03C3", -1, 1], + ["\u03C7", -1, 1], + ["\u03C5\u03C8", -1, 1], + ["\u03B6\u03C9", -1, 1] + ]; + + /** @const */ var a_5 = [ + ["\u03C9\u03B8\u03B7\u03BA\u03B1", -1, 1], + ["\u03C9\u03B8\u03B7\u03BA\u03B5", -1, 1], + ["\u03C9\u03B8\u03B7\u03BA\u03B1\u03BC\u03B5", -1, 1], + ["\u03C9\u03B8\u03B7\u03BA\u03B1\u03BD\u03B5", -1, 1], + ["\u03C9\u03B8\u03B7\u03BA\u03B1\u03C4\u03B5", -1, 1], + ["\u03C9\u03B8\u03B7\u03BA\u03B1\u03BD", -1, 1], + ["\u03C9\u03B8\u03B7\u03BA\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["\u03BE\u03B1\u03BD\u03B1\u03C0\u03B1", -1, 1], + ["\u03B5\u03C0\u03B1", -1, 1], + ["\u03C0\u03B5\u03C1\u03B9\u03C0\u03B1", -1, 1], + ["\u03B1\u03BD\u03B1\u03BC\u03C0\u03B1", -1, 1], + ["\u03B5\u03BC\u03C0\u03B1", -1, 1], + ["\u03C7\u03B1\u03C1\u03C4\u03BF\u03C0\u03B1", -1, 1], + ["\u03B5\u03BE\u03B1\u03C1\u03C7\u03B1", -1, 1], + ["\u03B3\u03B5", -1, 2], + ["\u03B3\u03BA\u03B5", -1, 2], + ["\u03BA\u03BB\u03B5", -1, 1], + ["\u03B5\u03BA\u03BB\u03B5", 9, 1], + ["\u03B1\u03C0\u03B5\u03BA\u03BB\u03B5", 10, 1], + ["\u03B1\u03C0\u03BF\u03BA\u03BB\u03B5", 9, 1], + ["\u03B5\u03C3\u03C9\u03BA\u03BB\u03B5", 9, 1], + ["\u03B4\u03B1\u03BD\u03B5", -1, 1], + ["\u03C0\u03B5", -1, 1], + ["\u03B5\u03C0\u03B5", 15, 1], + ["\u03BC\u03B5\u03C4\u03B5\u03C0\u03B5", 16, 1], + ["\u03B5\u03C3\u03B5", -1, 1], + ["\u03B3\u03BA", -1, 2], + ["\u03BC", -1, 2], + ["\u03C0\u03BF\u03C5\u03BA\u03B1\u03BC", 20, 2], + ["\u03BA\u03BF\u03BC", 20, 2], + ["\u03B1\u03BD", -1, 2], + ["\u03BF\u03BB\u03BF", -1, 2], + ["\u03B1\u03B8\u03C1\u03BF", -1, 1], + ["\u03C3\u03C5\u03BD\u03B1\u03B8\u03C1\u03BF", 25, 1], + ["\u03C0", -1, 2], + ["\u03BB\u03B1\u03C1", -1, 2], + ["\u03B4\u03B7\u03BC\u03BF\u03BA\u03C1\u03B1\u03C4", -1, 2], + ["\u03B1\u03C6", -1, 2], + ["\u03B3\u03B9\u03B3\u03B1\u03BD\u03C4\u03BF\u03B1\u03C6", 30, 2] + ]; + + /** @const */ 
var a_7 = [ + ["\u03B9\u03C3\u03B1", -1, 1], + ["\u03B9\u03C3\u03B1\u03BC\u03B5", -1, 1], + ["\u03B9\u03C3\u03B1\u03BD\u03B5", -1, 1], + ["\u03B9\u03C3\u03B5", -1, 1], + ["\u03B9\u03C3\u03B1\u03C4\u03B5", -1, 1], + ["\u03B9\u03C3\u03B1\u03BD", -1, 1], + ["\u03B9\u03C3\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_8 = [ + ["\u03BE\u03B1\u03BD\u03B1\u03C0\u03B1", -1, 1], + ["\u03B5\u03C0\u03B1", -1, 1], + ["\u03C0\u03B5\u03C1\u03B9\u03C0\u03B1", -1, 1], + ["\u03B1\u03BD\u03B1\u03BC\u03C0\u03B1", -1, 1], + ["\u03B5\u03BC\u03C0\u03B1", -1, 1], + ["\u03C7\u03B1\u03C1\u03C4\u03BF\u03C0\u03B1", -1, 1], + ["\u03B5\u03BE\u03B1\u03C1\u03C7\u03B1", -1, 1], + ["\u03BA\u03BB\u03B5", -1, 1], + ["\u03B5\u03BA\u03BB\u03B5", 7, 1], + ["\u03B1\u03C0\u03B5\u03BA\u03BB\u03B5", 8, 1], + ["\u03B1\u03C0\u03BF\u03BA\u03BB\u03B5", 7, 1], + ["\u03B5\u03C3\u03C9\u03BA\u03BB\u03B5", 7, 1], + ["\u03B4\u03B1\u03BD\u03B5", -1, 1], + ["\u03C0\u03B5", -1, 1], + ["\u03B5\u03C0\u03B5", 13, 1], + ["\u03BC\u03B5\u03C4\u03B5\u03C0\u03B5", 14, 1], + ["\u03B5\u03C3\u03B5", -1, 1], + ["\u03B1\u03B8\u03C1\u03BF", -1, 1], + ["\u03C3\u03C5\u03BD\u03B1\u03B8\u03C1\u03BF", 17, 1] + ]; + + /** @const */ var a_9 = [ + ["\u03B9\u03C3\u03BF\u03C5\u03BC\u03B5", -1, 1], + ["\u03B9\u03C3\u03BF\u03C5\u03BD\u03B5", -1, 1], + ["\u03B9\u03C3\u03B5\u03C4\u03B5", -1, 1], + ["\u03B9\u03C3\u03B5\u03B9", -1, 1], + ["\u03B9\u03C3\u03BF\u03C5\u03BD", -1, 1], + ["\u03B9\u03C3\u03B5\u03B9\u03C3", -1, 1], + ["\u03B9\u03C3\u03C9", -1, 1] + ]; + + /** @const */ var a_10 = [ + ["\u03B1\u03C4\u03B1", -1, 2], + ["\u03C6\u03B1", -1, 2], + ["\u03B7\u03C6\u03B1", 1, 2], + ["\u03BC\u03B5\u03B3", -1, 2], + ["\u03BB\u03C5\u03B3", -1, 2], + ["\u03B7\u03B4", -1, 2], + ["\u03BA\u03BB\u03B5", -1, 1], + ["\u03B5\u03C3\u03C9\u03BA\u03BB\u03B5", 6, 1], + ["\u03C0\u03BB\u03B5", -1, 1], + ["\u03B4\u03B1\u03BD\u03B5", -1, 1], + ["\u03C3\u03B5", -1, 1], + ["\u03B1\u03C3\u03B5", 10, 1], + ["\u03BA\u03B1\u03B8", -1, 2], + ["\u03B5\u03C7\u03B8", -1, 
2], + ["\u03BA\u03B1\u03BA", -1, 2], + ["\u03BC\u03B1\u03BA", -1, 2], + ["\u03C3\u03BA", -1, 2], + ["\u03C6\u03B9\u03BB", -1, 2], + ["\u03BA\u03C5\u03BB", -1, 2], + ["\u03BC", -1, 2], + ["\u03B3\u03B5\u03BC", 19, 2], + ["\u03B1\u03C7\u03BD", -1, 2], + ["\u03C3\u03C5\u03BD\u03B1\u03B8\u03C1\u03BF", -1, 1], + ["\u03C0", -1, 2], + ["\u03B1\u03C0", 23, 2], + ["\u03B5\u03BC\u03C0", 23, 2], + ["\u03B5\u03C5\u03C0", 23, 2], + ["\u03B1\u03C1", -1, 2], + ["\u03B1\u03BF\u03C1", -1, 2], + ["\u03B3\u03C5\u03C1", -1, 2], + ["\u03C7\u03C1", -1, 2], + ["\u03C7\u03C9\u03C1", -1, 2], + ["\u03BA\u03C4", -1, 2], + ["\u03B1\u03BA\u03C4", 32, 2], + ["\u03C7\u03C4", -1, 2], + ["\u03B1\u03C7\u03C4", 34, 2], + ["\u03C4\u03B1\u03C7", -1, 2], + ["\u03C3\u03C7", -1, 2], + ["\u03B1\u03C3\u03C7", 37, 2], + ["\u03C5\u03C8", -1, 2] + ]; + + /** @const */ var a_11 = [ + ["\u03B9\u03C3\u03C4\u03B1", -1, 1], + ["\u03B9\u03C3\u03C4\u03B5", -1, 1], + ["\u03B9\u03C3\u03C4\u03B7", -1, 1], + ["\u03B9\u03C3\u03C4\u03BF\u03B9", -1, 1], + ["\u03B9\u03C3\u03C4\u03C9\u03BD", -1, 1], + ["\u03B9\u03C3\u03C4\u03BF", -1, 1], + ["\u03B9\u03C3\u03C4\u03B5\u03C3", -1, 1], + ["\u03B9\u03C3\u03C4\u03B7\u03C3", -1, 1], + ["\u03B9\u03C3\u03C4\u03BF\u03C3", -1, 1], + ["\u03B9\u03C3\u03C4\u03BF\u03C5\u03C3", -1, 1], + ["\u03B9\u03C3\u03C4\u03BF\u03C5", -1, 1] + ]; + + /** @const */ var a_12 = [ + ["\u03B5\u03B3\u03BA\u03BB\u03B5", -1, 1], + ["\u03B1\u03C0\u03BF\u03BA\u03BB\u03B5", -1, 1], + ["\u03B4\u03B1\u03BD\u03B5", -1, 2], + ["\u03B1\u03BD\u03C4\u03B9\u03B4\u03B1\u03BD\u03B5", 2, 2], + ["\u03C3\u03B5", -1, 1], + ["\u03BC\u03B5\u03C4\u03B1\u03C3\u03B5", 4, 1], + ["\u03BC\u03B9\u03BA\u03C1\u03BF\u03C3\u03B5", 4, 1] + ]; + + /** @const */ var a_13 = [ + ["\u03B1\u03C4\u03BF\u03BC\u03B9\u03BA", -1, 2], + ["\u03B5\u03B8\u03BD\u03B9\u03BA", -1, 4], + ["\u03C4\u03BF\u03C0\u03B9\u03BA", -1, 7], + ["\u03B5\u03BA\u03BB\u03B5\u03BA\u03C4\u03B9\u03BA", -1, 5], + ["\u03C3\u03BA\u03B5\u03C0\u03C4\u03B9\u03BA", -1, 6], + 
["\u03B3\u03BD\u03C9\u03C3\u03C4\u03B9\u03BA", -1, 3], + ["\u03B1\u03B3\u03BD\u03C9\u03C3\u03C4\u03B9\u03BA", 5, 1], + ["\u03B1\u03BB\u03B5\u03BE\u03B1\u03BD\u03B4\u03C1\u03B9\u03BD", -1, 8], + ["\u03B8\u03B5\u03B1\u03C4\u03C1\u03B9\u03BD", -1, 10], + ["\u03B2\u03C5\u03B6\u03B1\u03BD\u03C4\u03B9\u03BD", -1, 9] + ]; + + /** @const */ var a_14 = [ + ["\u03B9\u03C3\u03BC\u03BF\u03B9", -1, 1], + ["\u03B9\u03C3\u03BC\u03C9\u03BD", -1, 1], + ["\u03B9\u03C3\u03BC\u03BF", -1, 1], + ["\u03B9\u03C3\u03BC\u03BF\u03C3", -1, 1], + ["\u03B9\u03C3\u03BC\u03BF\u03C5\u03C3", -1, 1], + ["\u03B9\u03C3\u03BC\u03BF\u03C5", -1, 1] + ]; + + /** @const */ var a_15 = [ + ["\u03C3", -1, 1], + ["\u03C7", -1, 1] + ]; + + /** @const */ var a_16 = [ + ["\u03BF\u03C5\u03B4\u03B1\u03BA\u03B9\u03B1", -1, 1], + ["\u03B1\u03C1\u03B1\u03BA\u03B9\u03B1", -1, 1], + ["\u03BF\u03C5\u03B4\u03B1\u03BA\u03B9", -1, 1], + ["\u03B1\u03C1\u03B1\u03BA\u03B9", -1, 1] + ]; + + /** @const */ var a_17 = [ + ["\u03B2", -1, 2], + ["\u03B2\u03B1\u03BC\u03B2", 0, 1], + ["\u03C3\u03BB\u03BF\u03B2", 0, 1], + ["\u03C4\u03C3\u03B5\u03C7\u03BF\u03C3\u03BB\u03BF\u03B2", 2, 1], + ["\u03BA\u03B1\u03C1\u03B4", -1, 2], + ["\u03B6", -1, 2], + ["\u03C4\u03B6", 5, 1], + ["\u03BA", -1, 1], + ["\u03BA\u03B1\u03C0\u03B1\u03BA", 7, 1], + ["\u03C3\u03BF\u03BA", 7, 1], + ["\u03C3\u03BA", 7, 1], + ["\u03B2\u03B1\u03BB", -1, 2], + ["\u03BC\u03B1\u03BB", -1, 1], + ["\u03B3\u03BB", -1, 2], + ["\u03C4\u03C1\u03B9\u03C0\u03BF\u03BB", -1, 2], + ["\u03C0\u03BB", -1, 1], + ["\u03BB\u03BF\u03C5\u03BB", -1, 1], + ["\u03C6\u03C5\u03BB", -1, 1], + ["\u03BA\u03B1\u03B9\u03BC", -1, 1], + ["\u03BA\u03BB\u03B9\u03BC", -1, 1], + ["\u03C6\u03B1\u03C1\u03BC", -1, 1], + ["\u03B3\u03B9\u03B1\u03BD", -1, 2], + ["\u03C3\u03C0\u03B1\u03BD", -1, 1], + ["\u03B7\u03B3\u03BF\u03C5\u03BC\u03B5\u03BD", -1, 2], + ["\u03BA\u03BF\u03BD", -1, 1], + ["\u03BC\u03B1\u03BA\u03C1\u03C5\u03BD", -1, 2], + ["\u03C0", -1, 2], + ["\u03BA\u03B1\u03C4\u03C1\u03B1\u03C0", 26, 1], + 
["\u03C1", -1, 1], + ["\u03B2\u03C1", 28, 1], + ["\u03BB\u03B1\u03B2\u03C1", 29, 1], + ["\u03B1\u03BC\u03B2\u03C1", 29, 1], + ["\u03BC\u03B5\u03C1", 28, 1], + ["\u03C0\u03B1\u03C4\u03B5\u03C1", 28, 2], + ["\u03B1\u03BD\u03B8\u03C1", 28, 1], + ["\u03BA\u03BF\u03C1", 28, 1], + ["\u03C3", -1, 1], + ["\u03BD\u03B1\u03B3\u03BA\u03B1\u03C3", 36, 1], + ["\u03C4\u03BF\u03C3", 36, 2], + ["\u03BC\u03BF\u03C5\u03C3\u03C4", -1, 1], + ["\u03C1\u03C5", -1, 1], + ["\u03C6", -1, 1], + ["\u03C3\u03C6", 41, 1], + ["\u03B1\u03BB\u03B9\u03C3\u03C6", 42, 1], + ["\u03BD\u03C5\u03C6", 41, 2], + ["\u03C7", -1, 1] + ]; + + /** @const */ var a_18 = [ + ["\u03B1\u03BA\u03B9\u03B1", -1, 1], + ["\u03B1\u03C1\u03B1\u03BA\u03B9\u03B1", 0, 1], + ["\u03B9\u03C4\u03C3\u03B1", -1, 1], + ["\u03B1\u03BA\u03B9", -1, 1], + ["\u03B1\u03C1\u03B1\u03BA\u03B9", 3, 1], + ["\u03B9\u03C4\u03C3\u03C9\u03BD", -1, 1], + ["\u03B9\u03C4\u03C3\u03B1\u03C3", -1, 1], + ["\u03B9\u03C4\u03C3\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_19 = [ + ["\u03C8\u03B1\u03BB", -1, 1], + ["\u03B1\u03B9\u03C6\u03BD", -1, 1], + ["\u03BF\u03BB\u03BF", -1, 1], + ["\u03B9\u03C1", -1, 1] + ]; + + /** @const */ var a_20 = [ + ["\u03B5", -1, 1], + ["\u03C0\u03B1\u03B9\u03C7\u03BD", -1, 1] + ]; + + /** @const */ var a_21 = [ + ["\u03B9\u03B4\u03B9\u03B1", -1, 1], + ["\u03B9\u03B4\u03B9\u03C9\u03BD", -1, 1], + ["\u03B9\u03B4\u03B9\u03BF", -1, 1] + ]; + + /** @const */ var a_22 = [ + ["\u03B9\u03B2", -1, 1], + ["\u03B4", -1, 1], + ["\u03C6\u03C1\u03B1\u03B3\u03BA", -1, 1], + ["\u03BB\u03C5\u03BA", -1, 1], + ["\u03BF\u03B2\u03B5\u03BB", -1, 1], + ["\u03BC\u03B7\u03BD", -1, 1], + ["\u03C1", -1, 1] + ]; + + /** @const */ var a_23 = [ + ["\u03B9\u03C3\u03BA\u03B5", -1, 1], + ["\u03B9\u03C3\u03BA\u03BF", -1, 1], + ["\u03B9\u03C3\u03BA\u03BF\u03C3", -1, 1], + ["\u03B9\u03C3\u03BA\u03BF\u03C5", -1, 1] + ]; + + /** @const */ var a_24 = [ + ["\u03B1\u03B4\u03C9\u03BD", -1, 1], + ["\u03B1\u03B4\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_25 
= [ + ["\u03B3\u03B9\u03B1\u03B3\u03B9", -1, -1], + ["\u03B8\u03B5\u03B9", -1, -1], + ["\u03BF\u03BA", -1, -1], + ["\u03BC\u03B1\u03BC", -1, -1], + ["\u03BC\u03B1\u03BD", -1, -1], + ["\u03BC\u03C0\u03B1\u03BC\u03C0", -1, -1], + ["\u03C0\u03B5\u03B8\u03B5\u03C1", -1, -1], + ["\u03C0\u03B1\u03C4\u03B5\u03C1", -1, -1], + ["\u03BA\u03C5\u03C1", -1, -1], + ["\u03BD\u03C4\u03B1\u03BD\u03C4", -1, -1] + ]; + + /** @const */ var a_26 = [ + ["\u03B5\u03B4\u03C9\u03BD", -1, 1], + ["\u03B5\u03B4\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_27 = [ + ["\u03BC\u03B9\u03BB", -1, 1], + ["\u03B4\u03B1\u03C0", -1, 1], + ["\u03B3\u03B7\u03C0", -1, 1], + ["\u03B9\u03C0", -1, 1], + ["\u03B5\u03BC\u03C0", -1, 1], + ["\u03BF\u03C0", -1, 1], + ["\u03BA\u03C1\u03B1\u03C3\u03C0", -1, 1], + ["\u03C5\u03C0", -1, 1] + ]; + + /** @const */ var a_28 = [ + ["\u03BF\u03C5\u03B4\u03C9\u03BD", -1, 1], + ["\u03BF\u03C5\u03B4\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_29 = [ + ["\u03C4\u03C1\u03B1\u03B3", -1, 1], + ["\u03C6\u03B5", -1, 1], + ["\u03BA\u03B1\u03BB\u03B9\u03B1\u03BA", -1, 1], + ["\u03B1\u03C1\u03BA", -1, 1], + ["\u03C3\u03BA", -1, 1], + ["\u03C0\u03B5\u03C4\u03B1\u03BB", -1, 1], + ["\u03B2\u03B5\u03BB", -1, 1], + ["\u03BB\u03BF\u03C5\u03BB", -1, 1], + ["\u03C6\u03BB", -1, 1], + ["\u03C7\u03BD", -1, 1], + ["\u03C0\u03BB\u03B5\u03BE", -1, 1], + ["\u03C3\u03C0", -1, 1], + ["\u03C6\u03C1", -1, 1], + ["\u03C3", -1, 1], + ["\u03BB\u03B9\u03C7", -1, 1] + ]; + + /** @const */ var a_30 = [ + ["\u03B5\u03C9\u03BD", -1, 1], + ["\u03B5\u03C9\u03C3", -1, 1] + ]; + + /** @const */ var a_31 = [ + ["\u03B4", -1, 1], + ["\u03B9\u03B4", 0, 1], + ["\u03B8", -1, 1], + ["\u03B3\u03B1\u03BB", -1, 1], + ["\u03B5\u03BB", -1, 1], + ["\u03BD", -1, 1], + ["\u03C0", -1, 1], + ["\u03C0\u03B1\u03C1", -1, 1] + ]; + + /** @const */ var a_32 = [ + ["\u03B9\u03B1", -1, 1], + ["\u03B9\u03C9\u03BD", -1, 1], + ["\u03B9\u03BF\u03C5", -1, 1] + ]; + + /** @const */ var a_33 = [ + ["\u03B9\u03BA\u03B1", -1, 1], + 
["\u03B9\u03BA\u03C9\u03BD", -1, 1], + ["\u03B9\u03BA\u03BF", -1, 1], + ["\u03B9\u03BA\u03BF\u03C5", -1, 1] + ]; + + /** @const */ var a_34 = [ + ["\u03B1\u03B4", -1, 1], + ["\u03C3\u03C5\u03BD\u03B1\u03B4", 0, 1], + ["\u03BA\u03B1\u03C4\u03B1\u03B4", 0, 1], + ["\u03B1\u03BD\u03C4\u03B9\u03B4", -1, 1], + ["\u03B5\u03BD\u03B4", -1, 1], + ["\u03C6\u03C5\u03BB\u03BF\u03B4", -1, 1], + ["\u03C5\u03C0\u03BF\u03B4", -1, 1], + ["\u03C0\u03C1\u03C9\u03C4\u03BF\u03B4", -1, 1], + ["\u03B5\u03BE\u03C9\u03B4", -1, 1], + ["\u03B7\u03B8", -1, 1], + ["\u03B1\u03BD\u03B7\u03B8", 9, 1], + ["\u03BE\u03B9\u03BA", -1, 1], + ["\u03B1\u03BB", -1, 1], + ["\u03B1\u03BC\u03BC\u03BF\u03C7\u03B1\u03BB", 12, 1], + ["\u03C3\u03C5\u03BD\u03BF\u03BC\u03B7\u03BB", -1, 1], + ["\u03BC\u03C0\u03BF\u03BB", -1, 1], + ["\u03BC\u03BF\u03C5\u03BB", -1, 1], + ["\u03C4\u03C3\u03B1\u03BC", -1, 1], + ["\u03B2\u03C1\u03C9\u03BC", -1, 1], + ["\u03B1\u03BC\u03B1\u03BD", -1, 1], + ["\u03BC\u03C0\u03B1\u03BD", -1, 1], + ["\u03BA\u03B1\u03BB\u03BB\u03B9\u03BD", -1, 1], + ["\u03C0\u03BF\u03C3\u03C4\u03B5\u03BB\u03BD", -1, 1], + ["\u03C6\u03B9\u03BB\u03BF\u03BD", -1, 1], + ["\u03BA\u03B1\u03BB\u03C0", -1, 1], + ["\u03B3\u03B5\u03C1", -1, 1], + ["\u03C7\u03B1\u03C3", -1, 1], + ["\u03BC\u03C0\u03BF\u03C3", -1, 1], + ["\u03C0\u03BB\u03B9\u03B1\u03C4\u03C3", -1, 1], + ["\u03C0\u03B5\u03C4\u03C3", -1, 1], + ["\u03C0\u03B9\u03C4\u03C3", -1, 1], + ["\u03C6\u03C5\u03C3", -1, 1], + ["\u03BC\u03C0\u03B1\u03B3\u03B9\u03B1\u03C4", -1, 1], + ["\u03BD\u03B9\u03C4", -1, 1], + ["\u03C0\u03B9\u03BA\u03B1\u03BD\u03C4", -1, 1], + ["\u03C3\u03B5\u03C1\u03C4", -1, 1] + ]; + + /** @const */ var a_35 = [ + ["\u03B1\u03B3\u03B1\u03BC\u03B5", -1, 1], + ["\u03B7\u03BA\u03B1\u03BC\u03B5", -1, 1], + ["\u03B7\u03B8\u03B7\u03BA\u03B1\u03BC\u03B5", 1, 1], + ["\u03B7\u03C3\u03B1\u03BC\u03B5", -1, 1], + ["\u03BF\u03C5\u03C3\u03B1\u03BC\u03B5", -1, 1] + ]; + + /** @const */ var a_36 = [ + ["\u03B2\u03BF\u03C5\u03B2", -1, 1], + ["\u03BE\u03B5\u03B8", 
-1, 1], + ["\u03C0\u03B5\u03B8", -1, 1], + ["\u03B1\u03C0\u03BF\u03B8", -1, 1], + ["\u03B1\u03C0\u03BF\u03BA", -1, 1], + ["\u03BF\u03C5\u03BB", -1, 1], + ["\u03B1\u03BD\u03B1\u03C0", -1, 1], + ["\u03C0\u03B9\u03BA\u03C1", -1, 1], + ["\u03C0\u03BF\u03C4", -1, 1], + ["\u03B1\u03C0\u03BF\u03C3\u03C4", -1, 1], + ["\u03C7", -1, 1], + ["\u03C3\u03B9\u03C7", 10, 1] + ]; + + /** @const */ var a_37 = [ + ["\u03C4\u03C1", -1, 1], + ["\u03C4\u03C3", -1, 1] + ]; + + /** @const */ var a_38 = [ + ["\u03B1\u03B3\u03B1\u03BD\u03B5", -1, 1], + ["\u03B7\u03BA\u03B1\u03BD\u03B5", -1, 1], + ["\u03B7\u03B8\u03B7\u03BA\u03B1\u03BD\u03B5", 1, 1], + ["\u03B7\u03C3\u03B1\u03BD\u03B5", -1, 1], + ["\u03BF\u03C5\u03C3\u03B1\u03BD\u03B5", -1, 1], + ["\u03BF\u03BD\u03C4\u03B1\u03BD\u03B5", -1, 1], + ["\u03B9\u03BF\u03BD\u03C4\u03B1\u03BD\u03B5", 5, 1], + ["\u03BF\u03C5\u03BD\u03C4\u03B1\u03BD\u03B5", -1, 1], + ["\u03B9\u03BF\u03C5\u03BD\u03C4\u03B1\u03BD\u03B5", 7, 1], + ["\u03BF\u03C4\u03B1\u03BD\u03B5", -1, 1], + ["\u03B9\u03BF\u03C4\u03B1\u03BD\u03B5", 9, 1] + ]; + + /** @const */ var a_39 = [ + ["\u03C4\u03B1\u03B2", -1, 1], + ["\u03BD\u03C4\u03B1\u03B2", 0, 1], + ["\u03C8\u03B7\u03BB\u03BF\u03C4\u03B1\u03B2", 0, 1], + ["\u03BB\u03B9\u03B2", -1, 1], + ["\u03BA\u03BB\u03B9\u03B2", 3, 1], + ["\u03BE\u03B7\u03C1\u03BF\u03BA\u03BB\u03B9\u03B2", 4, 1], + ["\u03B3", -1, 1], + ["\u03B1\u03B3", 6, 1], + ["\u03C4\u03C1\u03B1\u03B3", 7, 1], + ["\u03C4\u03C3\u03B1\u03B3", 7, 1], + ["\u03B1\u03B8\u03B9\u03B3\u03B3", 6, 1], + ["\u03C4\u03C3\u03B9\u03B3\u03B3", 6, 1], + ["\u03B1\u03C4\u03C3\u03B9\u03B3\u03B3", 11, 1], + ["\u03C3\u03C4\u03B5\u03B3", 6, 1], + ["\u03B1\u03C0\u03B7\u03B3", 6, 1], + ["\u03C3\u03B9\u03B3", 6, 1], + ["\u03B1\u03BD\u03BF\u03C1\u03B3", 6, 1], + ["\u03B5\u03BD\u03BF\u03C1\u03B3", 6, 1], + ["\u03BA\u03B1\u03BB\u03C0\u03BF\u03C5\u03B6", -1, 1], + ["\u03B8", -1, 1], + ["\u03BC\u03C9\u03B1\u03BC\u03B5\u03B8", 19, 1], + ["\u03C0\u03B9\u03B8", 19, 1], + ["\u03B1\u03C0\u03B9\u03B8", 21, 
1], + ["\u03B4\u03B5\u03BA", -1, 1], + ["\u03C0\u03B5\u03BB\u03B5\u03BA", -1, 1], + ["\u03B9\u03BA", -1, 1], + ["\u03B1\u03BD\u03B9\u03BA", 25, 1], + ["\u03B2\u03BF\u03C5\u03BB\u03BA", -1, 1], + ["\u03B2\u03B1\u03C3\u03BA", -1, 1], + ["\u03B2\u03C1\u03B1\u03C7\u03C5\u03BA", -1, 1], + ["\u03B3\u03B1\u03BB", -1, 1], + ["\u03BA\u03B1\u03C4\u03B1\u03B3\u03B1\u03BB", 30, 1], + ["\u03BF\u03BB\u03BF\u03B3\u03B1\u03BB", 30, 1], + ["\u03B2\u03B1\u03B8\u03C5\u03B3\u03B1\u03BB", 30, 1], + ["\u03BC\u03B5\u03BB", -1, 1], + ["\u03BA\u03B1\u03C3\u03C4\u03B5\u03BB", -1, 1], + ["\u03C0\u03BF\u03C1\u03C4\u03BF\u03BB", -1, 1], + ["\u03C0\u03BB", -1, 1], + ["\u03B4\u03B9\u03C0\u03BB", 37, 1], + ["\u03BB\u03B1\u03BF\u03C0\u03BB", 37, 1], + ["\u03C8\u03C5\u03C7\u03BF\u03C0\u03BB", 37, 1], + ["\u03BF\u03C5\u03BB", -1, 1], + ["\u03BC", -1, 1], + ["\u03BF\u03BB\u03B9\u03B3\u03BF\u03B4\u03B1\u03BC", 42, 1], + ["\u03BC\u03BF\u03C5\u03C3\u03BF\u03C5\u03BB\u03BC", 42, 1], + ["\u03B4\u03C1\u03B1\u03B4\u03BF\u03C5\u03BC", 42, 1], + ["\u03B2\u03C1\u03B1\u03C7\u03BC", 42, 1], + ["\u03BD", -1, 1], + ["\u03B1\u03BC\u03B5\u03C1\u03B9\u03BA\u03B1\u03BD", 47, 1], + ["\u03C0", -1, 1], + ["\u03B1\u03B4\u03B1\u03C0", 49, 1], + ["\u03C7\u03B1\u03BC\u03B7\u03BB\u03BF\u03B4\u03B1\u03C0", 49, 1], + ["\u03C0\u03BF\u03BB\u03C5\u03B4\u03B1\u03C0", 49, 1], + ["\u03BA\u03BF\u03C0", 49, 1], + ["\u03C5\u03C0\u03BF\u03BA\u03BF\u03C0", 53, 1], + ["\u03C4\u03C3\u03BF\u03C0", 49, 1], + ["\u03C3\u03C0", 49, 1], + ["\u03B5\u03C1", -1, 1], + ["\u03B3\u03B5\u03C1", 57, 1], + ["\u03B2\u03B5\u03C4\u03B5\u03C1", 57, 1], + ["\u03BB\u03BF\u03C5\u03B8\u03B7\u03C1", -1, 1], + ["\u03BA\u03BF\u03C1\u03BC\u03BF\u03C1", -1, 1], + ["\u03C0\u03B5\u03C1\u03B9\u03C4\u03C1", -1, 1], + ["\u03BF\u03C5\u03C1", -1, 1], + ["\u03C3", -1, 1], + ["\u03B2\u03B1\u03C3", 64, 1], + ["\u03C0\u03BF\u03BB\u03B9\u03C3", 64, 1], + ["\u03C3\u03B1\u03C1\u03B1\u03BA\u03B1\u03C4\u03C3", 64, 1], + ["\u03B8\u03C5\u03C3", 64, 1], + ["\u03B4\u03B9\u03B1\u03C4", 
-1, 1], + ["\u03C0\u03BB\u03B1\u03C4", -1, 1], + ["\u03C4\u03C3\u03B1\u03C1\u03BB\u03B1\u03C4", -1, 1], + ["\u03C4\u03B5\u03C4", -1, 1], + ["\u03C0\u03BF\u03C5\u03C1\u03B9\u03C4", -1, 1], + ["\u03C3\u03BF\u03C5\u03BB\u03C4", -1, 1], + ["\u03BC\u03B1\u03B9\u03BD\u03C4", -1, 1], + ["\u03B6\u03C9\u03BD\u03C4", -1, 1], + ["\u03BA\u03B1\u03C3\u03C4", -1, 1], + ["\u03C6", -1, 1], + ["\u03B4\u03B9\u03B1\u03C6", 78, 1], + ["\u03C3\u03C4\u03B5\u03C6", 78, 1], + ["\u03C6\u03C9\u03C4\u03BF\u03C3\u03C4\u03B5\u03C6", 80, 1], + ["\u03C0\u03B5\u03C1\u03B7\u03C6", 78, 1], + ["\u03C5\u03C0\u03B5\u03C1\u03B7\u03C6", 82, 1], + ["\u03BA\u03BF\u03B9\u03BB\u03B1\u03C1\u03C6", 78, 1], + ["\u03C0\u03B5\u03BD\u03C4\u03B1\u03C1\u03C6", 78, 1], + ["\u03BF\u03C1\u03C6", 78, 1], + ["\u03C7", -1, 1], + ["\u03B1\u03BC\u03B7\u03C7", 87, 1], + ["\u03B2\u03B9\u03BF\u03BC\u03B7\u03C7", 87, 1], + ["\u03BC\u03B5\u03B3\u03BB\u03BF\u03B2\u03B9\u03BF\u03BC\u03B7\u03C7", 89, 1], + ["\u03BA\u03B1\u03C0\u03BD\u03BF\u03B2\u03B9\u03BF\u03BC\u03B7\u03C7", 89, 1], + ["\u03BC\u03B9\u03BA\u03C1\u03BF\u03B2\u03B9\u03BF\u03BC\u03B7\u03C7", 89, 1], + ["\u03C0\u03BF\u03BB\u03C5\u03BC\u03B7\u03C7", 87, 1], + ["\u03BB\u03B9\u03C7", 87, 1] + ]; + + /** @const */ var a_40 = [ + ["\u03B7\u03C3\u03B5\u03C4\u03B5", -1, 1] + ]; + + /** @const */ var a_41 = [ + ["\u03B5\u03BD\u03B4", -1, 1], + ["\u03C3\u03C5\u03BD\u03B4", -1, 1], + ["\u03BF\u03B4", -1, 1], + ["\u03B4\u03B9\u03B1\u03B8", -1, 1], + ["\u03BA\u03B1\u03B8", -1, 1], + ["\u03C1\u03B1\u03B8", -1, 1], + ["\u03C4\u03B1\u03B8", -1, 1], + ["\u03C4\u03B9\u03B8", -1, 1], + ["\u03B5\u03BA\u03B8", -1, 1], + ["\u03B5\u03BD\u03B8", -1, 1], + ["\u03C3\u03C5\u03BD\u03B8", -1, 1], + ["\u03C1\u03BF\u03B8", -1, 1], + ["\u03C5\u03C0\u03B5\u03C1\u03B8", -1, 1], + ["\u03C3\u03B8", -1, 1], + ["\u03B5\u03C5\u03B8", -1, 1], + ["\u03B1\u03C1\u03BA", -1, 1], + ["\u03C9\u03C6\u03B5\u03BB", -1, 1], + ["\u03B2\u03BF\u03BB", -1, 1], + ["\u03B1\u03B9\u03BD", -1, 1], + ["\u03C0\u03BF\u03BD", -1, 
1], + ["\u03C1\u03BF\u03BD", -1, 1], + ["\u03C3\u03C5\u03BD", -1, 1], + ["\u03B2\u03B1\u03C1", -1, 1], + ["\u03B2\u03C1", -1, 1], + ["\u03B1\u03B9\u03C1", -1, 1], + ["\u03C6\u03BF\u03C1", -1, 1], + ["\u03B5\u03C5\u03C1", -1, 1], + ["\u03C0\u03C5\u03C1", -1, 1], + ["\u03C7\u03C9\u03C1", -1, 1], + ["\u03BD\u03B5\u03C4", -1, 1], + ["\u03C3\u03C7", -1, 1] + ]; + + /** @const */ var a_42 = [ + ["\u03C0\u03B1\u03B3", -1, 1], + ["\u03B4", -1, 1], + ["\u03B1\u03B4", 1, 1], + ["\u03B8", -1, 1], + ["\u03B1\u03B8", 3, 1], + ["\u03C4\u03BF\u03BA", -1, 1], + ["\u03C3\u03BA", -1, 1], + ["\u03C0\u03B1\u03C1\u03B1\u03BA\u03B1\u03BB", -1, 1], + ["\u03C3\u03BA\u03B5\u03BB", -1, 1], + ["\u03B1\u03C0\u03BB", -1, 1], + ["\u03B5\u03BC", -1, 1], + ["\u03B1\u03BD", -1, 1], + ["\u03B2\u03B5\u03BD", -1, 1], + ["\u03B2\u03B1\u03C1\u03BF\u03BD", -1, 1], + ["\u03BA\u03BF\u03C0", -1, 1], + ["\u03C3\u03B5\u03C1\u03C0", -1, 1], + ["\u03B1\u03B2\u03B1\u03C1", -1, 1], + ["\u03B5\u03BD\u03B1\u03C1", -1, 1], + ["\u03B1\u03B2\u03C1", -1, 1], + ["\u03BC\u03C0\u03BF\u03C1", -1, 1], + ["\u03B8\u03B1\u03C1\u03C1", -1, 1], + ["\u03BD\u03C4\u03C1", -1, 1], + ["\u03C5", -1, 1], + ["\u03BD\u03B9\u03C6", -1, 1], + ["\u03C3\u03C5\u03C1\u03C6", -1, 1] + ]; + + /** @const */ var a_43 = [ + ["\u03BF\u03BD\u03C4\u03B1\u03C3", -1, 1], + ["\u03C9\u03BD\u03C4\u03B1\u03C3", -1, 1] + ]; + + /** @const */ var a_44 = [ + ["\u03BF\u03BC\u03B1\u03C3\u03C4\u03B5", -1, 1], + ["\u03B9\u03BF\u03BC\u03B1\u03C3\u03C4\u03B5", 0, 1] + ]; + + /** @const */ var a_45 = [ + ["\u03C0", -1, 1], + ["\u03B1\u03C0", 0, 1], + ["\u03B1\u03BA\u03B1\u03C4\u03B1\u03C0", 1, 1], + ["\u03C3\u03C5\u03BC\u03C0", 0, 1], + ["\u03B1\u03C3\u03C5\u03BC\u03C0", 3, 1], + ["\u03B1\u03BC\u03B5\u03C4\u03B1\u03BC\u03C6", -1, 1] + ]; + + /** @const */ var a_46 = [ + ["\u03B6", -1, 1], + ["\u03B1\u03BB", -1, 1], + ["\u03C0\u03B1\u03C1\u03B1\u03BA\u03B1\u03BB", 1, 1], + ["\u03B5\u03BA\u03C4\u03B5\u03BB", -1, 1], + ["\u03BC", -1, 1], + ["\u03BE", -1, 1], + 
["\u03C0\u03C1\u03BF", -1, 1], + ["\u03B1\u03C1", -1, 1], + ["\u03BD\u03B9\u03C3", -1, 1] + ]; + + /** @const */ var a_47 = [ + ["\u03B7\u03B8\u03B7\u03BA\u03B1", -1, 1], + ["\u03B7\u03B8\u03B7\u03BA\u03B5", -1, 1], + ["\u03B7\u03B8\u03B7\u03BA\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_48 = [ + ["\u03C0\u03B9\u03B8", -1, 1], + ["\u03BF\u03B8", -1, 1], + ["\u03BD\u03B1\u03C1\u03B8", -1, 1], + ["\u03C3\u03BA\u03BF\u03C5\u03BB", -1, 1], + ["\u03C3\u03BA\u03C9\u03BB", -1, 1], + ["\u03C3\u03C6", -1, 1] + ]; + + /** @const */ var a_49 = [ + ["\u03B8", -1, 1], + ["\u03B4\u03B9\u03B1\u03B8", 0, 1], + ["\u03C0\u03B1\u03C1\u03B1\u03BA\u03B1\u03C4\u03B1\u03B8", 0, 1], + ["\u03C3\u03C5\u03BD\u03B8", 0, 1], + ["\u03C0\u03C1\u03BF\u03C3\u03B8", 0, 1] + ]; + + /** @const */ var a_50 = [ + ["\u03B7\u03BA\u03B1", -1, 1], + ["\u03B7\u03BA\u03B5", -1, 1], + ["\u03B7\u03BA\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_51 = [ + ["\u03C6\u03B1\u03B3", -1, 1], + ["\u03BB\u03B7\u03B3", -1, 1], + ["\u03C6\u03C1\u03C5\u03B4", -1, 1], + ["\u03BC\u03B1\u03BD\u03C4\u03B9\u03BB", -1, 1], + ["\u03BC\u03B1\u03BB\u03BB", -1, 1], + ["\u03BF\u03BC", -1, 1], + ["\u03B2\u03BB\u03B5\u03C0", -1, 1], + ["\u03C0\u03BF\u03B4\u03B1\u03C1", -1, 1], + ["\u03BA\u03C5\u03BC\u03B1\u03C4", -1, 1], + ["\u03C0\u03C1\u03C9\u03C4", -1, 1], + ["\u03BB\u03B1\u03C7", -1, 1], + ["\u03C0\u03B1\u03BD\u03C4\u03B1\u03C7", -1, 1] + ]; + + /** @const */ var a_52 = [ + ["\u03C4\u03C3\u03B1", -1, 1], + ["\u03C7\u03B1\u03B4", -1, 1], + ["\u03BC\u03B5\u03B4", -1, 1], + ["\u03BB\u03B1\u03BC\u03C0\u03B9\u03B4", -1, 1], + ["\u03B4\u03B5", -1, 1], + ["\u03C0\u03BB\u03B5", -1, 1], + ["\u03BC\u03B5\u03C3\u03B1\u03B6", -1, 1], + ["\u03B4\u03B5\u03C3\u03C0\u03BF\u03B6", -1, 1], + ["\u03B1\u03B9\u03B8", -1, 1], + ["\u03C6\u03B1\u03C1\u03BC\u03B1\u03BA", -1, 1], + ["\u03B1\u03B3\u03BA", -1, 1], + ["\u03B1\u03BD\u03B7\u03BA", -1, 1], + ["\u03BB", -1, 1], + ["\u03BC", -1, 1], + ["\u03B1\u03BC", 13, 1], + 
["\u03B2\u03C1\u03BF\u03BC", 13, 1], + ["\u03C5\u03C0\u03BF\u03C4\u03B5\u03B9\u03BD", -1, 1], + ["\u03B5\u03BA\u03BB\u03B9\u03C0", -1, 1], + ["\u03C1", -1, 1], + ["\u03B5\u03BD\u03B4\u03B9\u03B1\u03C6\u03B5\u03C1", 18, 1], + ["\u03B1\u03BD\u03B1\u03C1\u03C1", 18, 1], + ["\u03C0\u03B1\u03C4", -1, 1], + ["\u03BA\u03B1\u03B8\u03B1\u03C1\u03B5\u03C5", -1, 1], + ["\u03B4\u03B5\u03C5\u03C4\u03B5\u03C1\u03B5\u03C5", -1, 1], + ["\u03BB\u03B5\u03C7", -1, 1] + ]; + + /** @const */ var a_53 = [ + ["\u03BF\u03C5\u03C3\u03B1", -1, 1], + ["\u03BF\u03C5\u03C3\u03B5", -1, 1], + ["\u03BF\u03C5\u03C3\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_54 = [ + ["\u03C0\u03B5\u03BB", -1, 1], + ["\u03BB\u03BB", -1, 1], + ["\u03C3\u03BC\u03B7\u03BD", -1, 1], + ["\u03C1\u03C0", -1, 1], + ["\u03C0\u03C1", -1, 1], + ["\u03C6\u03C1", -1, 1], + ["\u03C7\u03BF\u03C1\u03C4", -1, 1], + ["\u03BF\u03C6", -1, 1], + ["\u03C8\u03BF\u03C6", 7, -1], + ["\u03C3\u03C6", -1, 1], + ["\u03BB\u03BF\u03C7", -1, 1], + ["\u03BD\u03B1\u03C5\u03BB\u03BF\u03C7", 10, -1] + ]; + + /** @const */ var a_55 = [ + ["\u03B1\u03BC\u03B1\u03BB\u03BB\u03B9", -1, 1], + ["\u03BB", -1, 1], + ["\u03B1\u03BC\u03B1\u03BB", 1, 1], + ["\u03BC", -1, 1], + ["\u03BF\u03C5\u03BB\u03B1\u03BC", 3, 1], + ["\u03B5\u03BD", -1, 1], + ["\u03B4\u03B5\u03C1\u03B2\u03B5\u03BD", 5, 1], + ["\u03C0", -1, 1], + ["\u03B1\u03B5\u03B9\u03C0", 7, 1], + ["\u03B1\u03C1\u03C4\u03B9\u03C0", 7, 1], + ["\u03C3\u03C5\u03BC\u03C0", 7, 1], + ["\u03BD\u03B5\u03BF\u03C0", 7, 1], + ["\u03BA\u03C1\u03BF\u03BA\u03B1\u03BB\u03BF\u03C0", 7, 1], + ["\u03BF\u03BB\u03BF\u03C0", 7, 1], + ["\u03C0\u03C1\u03BF\u03C3\u03C9\u03C0\u03BF\u03C0", 7, 1], + ["\u03C3\u03B9\u03B4\u03B7\u03C1\u03BF\u03C0", 7, 1], + ["\u03B4\u03C1\u03BF\u03C3\u03BF\u03C0", 7, 1], + ["\u03B1\u03C3\u03C0", 7, 1], + ["\u03B1\u03BD\u03C5\u03C0", 7, 1], + ["\u03C1", -1, 1], + ["\u03B1\u03C3\u03C0\u03B1\u03C1", 19, 1], + ["\u03C7\u03B1\u03C1", 19, 1], + ["\u03B1\u03C7\u03B1\u03C1", 21, 1], + 
["\u03B1\u03C0\u03B5\u03C1", 19, 1], + ["\u03C4\u03C1", 19, 1], + ["\u03BF\u03C5\u03C1", 19, 1], + ["\u03C4", -1, 1], + ["\u03B4\u03B9\u03B1\u03C4", 26, 1], + ["\u03B5\u03C0\u03B9\u03C4", 26, 1], + ["\u03C3\u03C5\u03BD\u03C4", 26, 1], + ["\u03BF\u03BC\u03BF\u03C4", 26, 1], + ["\u03BD\u03BF\u03BC\u03BF\u03C4", 30, 1], + ["\u03B1\u03C0\u03BF\u03C4", 26, 1], + ["\u03C5\u03C0\u03BF\u03C4", 26, 1], + ["\u03B1\u03B2\u03B1\u03C3\u03C4", 26, 1], + ["\u03B1\u03B9\u03BC\u03BF\u03C3\u03C4", 26, 1], + ["\u03C0\u03C1\u03BF\u03C3\u03C4", 26, 1], + ["\u03B1\u03BD\u03C5\u03C3\u03C4", 26, 1], + ["\u03BD\u03B1\u03C5", -1, 1], + ["\u03B1\u03C6", -1, 1], + ["\u03BE\u03B5\u03C6", -1, 1], + ["\u03B1\u03B4\u03B7\u03C6", -1, 1], + ["\u03C0\u03B1\u03BC\u03C6", -1, 1], + ["\u03C0\u03BF\u03BB\u03C5\u03C6", -1, 1] + ]; + + /** @const */ var a_56 = [ + ["\u03B1\u03B3\u03B1", -1, 1], + ["\u03B1\u03B3\u03B5", -1, 1], + ["\u03B1\u03B3\u03B5\u03C3", -1, 1] + ]; + + /** @const */ var a_57 = [ + ["\u03B7\u03C3\u03B1", -1, 1], + ["\u03B7\u03C3\u03B5", -1, 1], + ["\u03B7\u03C3\u03BF\u03C5", -1, 1] + ]; + + /** @const */ var a_58 = [ + ["\u03BD", -1, 1], + ["\u03B4\u03C9\u03B4\u03B5\u03BA\u03B1\u03BD", 0, 1], + ["\u03B5\u03C0\u03C4\u03B1\u03BD", 0, 1], + ["\u03BC\u03B5\u03B3\u03B1\u03BB\u03BF\u03BD", 0, 1], + ["\u03B5\u03C1\u03B7\u03BC\u03BF\u03BD", 0, 1], + ["\u03C7\u03B5\u03C1\u03C3\u03BF\u03BD", 0, 1] + ]; + + /** @const */ var a_59 = [ + ["\u03B7\u03C3\u03C4\u03B5", -1, 1] + ]; + + /** @const */ var a_60 = [ + ["\u03C3\u03B2", -1, 1], + ["\u03B1\u03C3\u03B2", 0, 1], + ["\u03B1\u03C0\u03BB", -1, 1], + ["\u03B1\u03B5\u03B9\u03BC\u03BD", -1, 1], + ["\u03C7\u03C1", -1, 1], + ["\u03B1\u03C7\u03C1", 4, 1], + ["\u03BA\u03BF\u03B9\u03BD\u03BF\u03C7\u03C1", 4, 1], + ["\u03B4\u03C5\u03C3\u03C7\u03C1", 4, 1], + ["\u03B5\u03C5\u03C7\u03C1", 4, 1], + ["\u03C0\u03B1\u03BB\u03B9\u03BC\u03C8", -1, 1] + ]; + + /** @const */ var a_61 = [ + ["\u03BF\u03C5\u03BD\u03B5", -1, 1], + 
["\u03B7\u03B8\u03BF\u03C5\u03BD\u03B5", 0, 1], + ["\u03B7\u03C3\u03BF\u03C5\u03BD\u03B5", 0, 1] + ]; + + /** @const */ var a_62 = [ + ["\u03C3\u03C0\u03B9", -1, 1], + ["\u03BD", -1, 1], + ["\u03B5\u03BE\u03C9\u03BD", 1, 1], + ["\u03C1", -1, 1], + ["\u03C3\u03C4\u03C1\u03B1\u03B2\u03BF\u03BC\u03BF\u03C5\u03C4\u03C3", -1, 1], + ["\u03BA\u03B1\u03BA\u03BF\u03BC\u03BF\u03C5\u03C4\u03C3", -1, 1] + ]; + + /** @const */ var a_63 = [ + ["\u03BF\u03C5\u03BC\u03B5", -1, 1], + ["\u03B7\u03B8\u03BF\u03C5\u03BC\u03B5", 0, 1], + ["\u03B7\u03C3\u03BF\u03C5\u03BC\u03B5", 0, 1] + ]; + + /** @const */ var a_64 = [ + ["\u03B1\u03B6", -1, 1], + ["\u03C9\u03C1\u03B9\u03BF\u03C0\u03BB", -1, 1], + ["\u03B1\u03C3\u03BF\u03C5\u03C3", -1, 1], + ["\u03C0\u03B1\u03C1\u03B1\u03C3\u03BF\u03C5\u03C3", 2, 1], + ["\u03B1\u03BB\u03BB\u03BF\u03C3\u03BF\u03C5\u03C3", -1, 1], + ["\u03C6", -1, 1], + ["\u03C7", -1, 1] + ]; + + /** @const */ var a_65 = [ + ["\u03BC\u03B1\u03C4\u03B1", -1, 1], + ["\u03BC\u03B1\u03C4\u03C9\u03BD", -1, 1], + ["\u03BC\u03B1\u03C4\u03BF\u03C3", -1, 1] + ]; + + /** @const */ var a_66 = [ + ["\u03B1", -1, 1], + ["\u03B9\u03BF\u03C5\u03BC\u03B1", 0, 1], + ["\u03BF\u03BC\u03BF\u03C5\u03BD\u03B1", 0, 1], + ["\u03B9\u03BF\u03BC\u03BF\u03C5\u03BD\u03B1", 2, 1], + ["\u03BF\u03C3\u03BF\u03C5\u03BD\u03B1", 0, 1], + ["\u03B9\u03BF\u03C3\u03BF\u03C5\u03BD\u03B1", 4, 1], + ["\u03B5", -1, 1], + ["\u03B1\u03B3\u03B1\u03C4\u03B5", 6, 1], + ["\u03B7\u03BA\u03B1\u03C4\u03B5", 6, 1], + ["\u03B7\u03B8\u03B7\u03BA\u03B1\u03C4\u03B5", 8, 1], + ["\u03B7\u03C3\u03B1\u03C4\u03B5", 6, 1], + ["\u03BF\u03C5\u03C3\u03B1\u03C4\u03B5", 6, 1], + ["\u03B5\u03B9\u03C4\u03B5", 6, 1], + ["\u03B7\u03B8\u03B5\u03B9\u03C4\u03B5", 12, 1], + ["\u03B9\u03B5\u03BC\u03B1\u03C3\u03C4\u03B5", 6, 1], + ["\u03BF\u03C5\u03BC\u03B1\u03C3\u03C4\u03B5", 6, 1], + ["\u03B9\u03BF\u03C5\u03BC\u03B1\u03C3\u03C4\u03B5", 15, 1], + ["\u03B9\u03B5\u03C3\u03B1\u03C3\u03C4\u03B5", 6, 1], + ["\u03BF\u03C3\u03B1\u03C3\u03C4\u03B5", 6, 1], 
+ ["\u03B9\u03BF\u03C3\u03B1\u03C3\u03C4\u03B5", 18, 1], + ["\u03B7", -1, 1], + ["\u03B9", -1, 1], + ["\u03B1\u03BC\u03B1\u03B9", 21, 1], + ["\u03B9\u03B5\u03BC\u03B1\u03B9", 21, 1], + ["\u03BF\u03BC\u03B1\u03B9", 21, 1], + ["\u03BF\u03C5\u03BC\u03B1\u03B9", 21, 1], + ["\u03B1\u03C3\u03B1\u03B9", 21, 1], + ["\u03B5\u03C3\u03B1\u03B9", 21, 1], + ["\u03B9\u03B5\u03C3\u03B1\u03B9", 27, 1], + ["\u03B1\u03C4\u03B1\u03B9", 21, 1], + ["\u03B5\u03C4\u03B1\u03B9", 21, 1], + ["\u03B9\u03B5\u03C4\u03B1\u03B9", 30, 1], + ["\u03BF\u03BD\u03C4\u03B1\u03B9", 21, 1], + ["\u03BF\u03C5\u03BD\u03C4\u03B1\u03B9", 21, 1], + ["\u03B9\u03BF\u03C5\u03BD\u03C4\u03B1\u03B9", 33, 1], + ["\u03B5\u03B9", 21, 1], + ["\u03B1\u03B5\u03B9", 35, 1], + ["\u03B7\u03B8\u03B5\u03B9", 35, 1], + ["\u03B7\u03C3\u03B5\u03B9", 35, 1], + ["\u03BF\u03B9", 21, 1], + ["\u03B1\u03BD", -1, 1], + ["\u03B1\u03B3\u03B1\u03BD", 40, 1], + ["\u03B7\u03BA\u03B1\u03BD", 40, 1], + ["\u03B7\u03B8\u03B7\u03BA\u03B1\u03BD", 42, 1], + ["\u03B7\u03C3\u03B1\u03BD", 40, 1], + ["\u03BF\u03C5\u03C3\u03B1\u03BD", 40, 1], + ["\u03BF\u03BD\u03C4\u03BF\u03C5\u03C3\u03B1\u03BD", 45, 1], + ["\u03B9\u03BF\u03BD\u03C4\u03BF\u03C5\u03C3\u03B1\u03BD", 46, 1], + ["\u03BF\u03BD\u03C4\u03B1\u03BD", 40, 1], + ["\u03B9\u03BF\u03BD\u03C4\u03B1\u03BD", 48, 1], + ["\u03BF\u03C5\u03BD\u03C4\u03B1\u03BD", 40, 1], + ["\u03B9\u03BF\u03C5\u03BD\u03C4\u03B1\u03BD", 50, 1], + ["\u03BF\u03C4\u03B1\u03BD", 40, 1], + ["\u03B9\u03BF\u03C4\u03B1\u03BD", 52, 1], + ["\u03BF\u03BC\u03B1\u03C3\u03C4\u03B1\u03BD", 40, 1], + ["\u03B9\u03BF\u03BC\u03B1\u03C3\u03C4\u03B1\u03BD", 54, 1], + ["\u03BF\u03C3\u03B1\u03C3\u03C4\u03B1\u03BD", 40, 1], + ["\u03B9\u03BF\u03C3\u03B1\u03C3\u03C4\u03B1\u03BD", 56, 1], + ["\u03BF\u03C5\u03BD", -1, 1], + ["\u03B7\u03B8\u03BF\u03C5\u03BD", 58, 1], + ["\u03BF\u03BC\u03BF\u03C5\u03BD", 58, 1], + ["\u03B9\u03BF\u03BC\u03BF\u03C5\u03BD", 60, 1], + ["\u03B7\u03C3\u03BF\u03C5\u03BD", 58, 1], + ["\u03BF\u03C3\u03BF\u03C5\u03BD", 58, 1], + 
["\u03B9\u03BF\u03C3\u03BF\u03C5\u03BD", 63, 1], + ["\u03C9\u03BD", -1, 1], + ["\u03B7\u03B4\u03C9\u03BD", 65, 1], + ["\u03BF", -1, 1], + ["\u03B1\u03C3", -1, 1], + ["\u03B5\u03C3", -1, 1], + ["\u03B7\u03B4\u03B5\u03C3", 69, 1], + ["\u03B7\u03C3\u03B5\u03C3", 69, 1], + ["\u03B7\u03C3", -1, 1], + ["\u03B5\u03B9\u03C3", -1, 1], + ["\u03B7\u03B8\u03B5\u03B9\u03C3", 73, 1], + ["\u03BF\u03C3", -1, 1], + ["\u03C5\u03C3", -1, 1], + ["\u03BF\u03C5\u03C3", 76, 1], + ["\u03C5", -1, 1], + ["\u03BF\u03C5", 78, 1], + ["\u03C9", -1, 1], + ["\u03B1\u03C9", 80, 1], + ["\u03B7\u03B8\u03C9", 80, 1], + ["\u03B7\u03C3\u03C9", 80, 1] + ]; + + /** @const */ var a_67 = [ + ["\u03BF\u03C4\u03B5\u03C1", -1, 1], + ["\u03B5\u03C3\u03C4\u03B5\u03C1", -1, 1], + ["\u03C5\u03C4\u03B5\u03C1", -1, 1], + ["\u03C9\u03C4\u03B5\u03C1", -1, 1], + ["\u03BF\u03C4\u03B1\u03C4", -1, 1], + ["\u03B5\u03C3\u03C4\u03B1\u03C4", -1, 1], + ["\u03C5\u03C4\u03B1\u03C4", -1, 1], + ["\u03C9\u03C4\u03B1\u03C4", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [81, 65, 16, 1]; + + /** @const */ var /** Array */ g_v2 = [81, 65, 0, 1]; + + var /** boolean */ B_test1 = false; + + + /** @return {boolean} */ + function r_has_min_length() { + return base.current.length >= 3; + }; + + /** @return {boolean} */ + function r_tolower() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u03B1")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B2")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u03B3")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u03B4")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u03B5")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u03B6")) + { + return false; + } + break; + case 7: + if 
(!base.slice_from("\u03B7")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u03B8")) + { + return false; + } + break; + case 9: + if (!base.slice_from("\u03B9")) + { + return false; + } + break; + case 10: + if (!base.slice_from("\u03BA")) + { + return false; + } + break; + case 11: + if (!base.slice_from("\u03BB")) + { + return false; + } + break; + case 12: + if (!base.slice_from("\u03BC")) + { + return false; + } + break; + case 13: + if (!base.slice_from("\u03BD")) + { + return false; + } + break; + case 14: + if (!base.slice_from("\u03BE")) + { + return false; + } + break; + case 15: + if (!base.slice_from("\u03BF")) + { + return false; + } + break; + case 16: + if (!base.slice_from("\u03C0")) + { + return false; + } + break; + case 17: + if (!base.slice_from("\u03C1")) + { + return false; + } + break; + case 18: + if (!base.slice_from("\u03C3")) + { + return false; + } + break; + case 19: + if (!base.slice_from("\u03C4")) + { + return false; + } + break; + case 20: + if (!base.slice_from("\u03C5")) + { + return false; + } + break; + case 21: + if (!base.slice_from("\u03C6")) + { + return false; + } + break; + case 22: + if (!base.slice_from("\u03C7")) + { + return false; + } + break; + case 23: + if (!base.slice_from("\u03C8")) + { + return false; + } + break; + case 24: + if (!base.slice_from("\u03C9")) + { + return false; + } + break; + case 25: + if (base.cursor <= base.limit_backward) + { + break lab0; + } + base.cursor--; + break; + } + continue; + } + base.cursor = base.limit - v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_step1() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_1); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u03C6\u03B1")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03C3\u03BA\u03B1")) + { + return false; + } + break; + case 3: + 
if (!base.slice_from("\u03BF\u03BB\u03BF")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u03C3\u03BF")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u03C4\u03B1\u03C4\u03BF")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u03BA\u03C1\u03B5")) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u03C0\u03B5\u03C1")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u03C4\u03B5\u03C1")) + { + return false; + } + break; + case 9: + if (!base.slice_from("\u03C6\u03C9")) + { + return false; + } + break; + case 10: + if (!base.slice_from("\u03BA\u03B1\u03B8\u03B5\u03C3\u03C4")) + { + return false; + } + break; + case 11: + if (!base.slice_from("\u03B3\u03B5\u03B3\u03BF\u03BD")) + { + return false; + } + break; + } + B_test1 = false; + return true; + }; + + /** @return {boolean} */ + function r_steps1() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("\u03B9")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B9\u03B6")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_steps2() { + base.ket = base.cursor; + if (base.find_among_b(a_5) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_4) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03C9\u03BD")) + { + return false; + 
} + return true; + }; + + /** @return {boolean} */ + function r_steps3() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_7) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("\u03B9\u03C3\u03B1"))) + { + break lab1; + } + if (base.cursor > base.limit_backward) + { + break lab1; + } + if (!base.slice_from("\u03B9\u03C3")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + base.bra = base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("\u03B9")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B9\u03C3")) + { + return false; + } + break; + } + } + return true; + }; + + /** @return {boolean} */ + function r_steps4() { + base.ket = base.cursor; + if (base.find_among_b(a_9) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_8) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B9")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_steps5() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_11) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + among_var = base.find_among_b(a_10); + if (among_var == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + switch (among_var) { + case 1: 
+ if (!base.slice_from("\u03B9")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B9\u03C3\u03C4")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_steps6() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_14) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + among_var = base.find_among_b(a_12); + if (among_var == 0) + { + break lab1; + } + if (base.cursor > base.limit_backward) + { + break lab1; + } + switch (among_var) { + case 1: + if (!base.slice_from("\u03B9\u03C3\u03BC")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B9")) + { + return false; + } + break; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + among_var = base.find_among_b(a_13); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u03B1\u03B3\u03BD\u03C9\u03C3\u03C4")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B1\u03C4\u03BF\u03BC")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u03B3\u03BD\u03C9\u03C3\u03C4")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u03B5\u03B8\u03BD")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u03B5\u03BA\u03BB\u03B5\u03BA\u03C4")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u03C3\u03BA\u03B5\u03C0\u03C4")) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u03C4\u03BF\u03C0")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u03B1\u03BB\u03B5\u03BE\u03B1\u03BD\u03B4\u03C1")) + { + return false; + } + break; + case 9: + if 
(!base.slice_from("\u03B2\u03C5\u03B6\u03B1\u03BD\u03C4")) + { + return false; + } + break; + case 10: + if (!base.slice_from("\u03B8\u03B5\u03B1\u03C4\u03C1")) + { + return false; + } + break; + } + } + return true; + }; + + /** @return {boolean} */ + function r_steps7() { + base.ket = base.cursor; + if (base.find_among_b(a_16) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_15) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B1\u03C1\u03B1\u03BA")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_steps8() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_18) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + among_var = base.find_among_b(a_17); + if (among_var == 0) + { + break lab1; + } + if (base.cursor > base.limit_backward) + { + break lab1; + } + switch (among_var) { + case 1: + if (!base.slice_from("\u03B1\u03BA")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u03B9\u03C4\u03C3")) + { + return false; + } + break; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.eq_s_b("\u03BA\u03BF\u03C1"))) + { + return false; + } + if (!base.slice_from("\u03B9\u03C4\u03C3")) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_steps9() { + base.ket = base.cursor; + if (base.find_among_b(a_21) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = 
base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_19) == 0) + { + break lab1; + } + if (base.cursor > base.limit_backward) + { + break lab1; + } + if (!base.slice_from("\u03B9\u03B4")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_20) == 0) + { + return false; + } + if (!base.slice_from("\u03B9\u03B4")) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_steps10() { + base.ket = base.cursor; + if (base.find_among_b(a_23) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_22) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B9\u03C3\u03BA")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step2a() { + base.ket = base.cursor; + if (base.find_among_b(a_24) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (base.find_among_b(a_25) == 0) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_1; + } + { + var /** number */ c1 = base.cursor; + base.insert(base.cursor, base.cursor, "\u03B1\u03B4"); + base.cursor = c1; + } + return true; + }; + + /** @return {boolean} */ + function r_step2b() { + base.ket = base.cursor; + if (base.find_among_b(a_26) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_27) == 0) + { + return false; + } + if (!base.slice_from("\u03B5\u03B4")) + { + return false; + } + return true; + }; + + /** @return 
{boolean} */ + function r_step2c() { + base.ket = base.cursor; + if (base.find_among_b(a_28) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_29) == 0) + { + return false; + } + if (!base.slice_from("\u03BF\u03C5\u03B4")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step2d() { + base.ket = base.cursor; + if (base.find_among_b(a_30) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_31) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B5")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step3() { + base.ket = base.cursor; + if (base.find_among_b(a_32) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.in_grouping_b(g_v, 945, 969))) + { + return false; + } + if (!base.slice_from("\u03B9")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step4() { + base.ket = base.cursor; + if (base.find_among_b(a_33) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.in_grouping_b(g_v, 945, 969))) + { + break lab1; + } + if (!base.slice_from("\u03B9\u03BA")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + } + base.bra = base.cursor; + if (base.find_among_b(a_34) == 0) + { + return false; + } + if (base.cursor > 
base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B9\u03BA")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5a() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("\u03B1\u03B3\u03B1\u03BC\u03B5"))) + { + break lab0; + } + if (base.cursor > base.limit_backward) + { + break lab0; + } + if (!base.slice_from("\u03B1\u03B3\u03B1\u03BC")) + { + return false; + } + } + base.cursor = base.limit - v_1; + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (base.find_among_b(a_35) == 0) + { + break lab1; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + } + base.cursor = base.limit - v_2; + base.ket = base.cursor; + if (!(base.eq_s_b("\u03B1\u03BC\u03B5"))) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_36) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B1\u03BC")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5b() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (base.find_among_b(a_38) == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_37) == 0) + { + break lab0; + } + if (base.cursor > base.limit_backward) + { + break lab0; + } + if (!base.slice_from("\u03B1\u03B3\u03B1\u03BD")) + { + return false; + } + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (!(base.eq_s_b("\u03B1\u03BD\u03B5"))) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + 
lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.in_grouping_b(g_v2, 945, 969))) + { + break lab2; + } + if (!base.slice_from("\u03B1\u03BD")) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + base.ket = base.cursor; + } + base.bra = base.cursor; + if (base.find_among_b(a_39) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B1\u03BD")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5c() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (base.find_among_b(a_40) == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (!(base.eq_s_b("\u03B5\u03C4\u03B5"))) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.in_grouping_b(g_v2, 945, 969))) + { + break lab2; + } + if (!base.slice_from("\u03B5\u03C4")) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + lab3: { + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_41) == 0) + { + break lab3; + } + if (!base.slice_from("\u03B5\u03C4")) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + base.ket = base.cursor; + } + base.bra = base.cursor; + if (base.find_among_b(a_42) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B5\u03C4")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5d() { + base.ket = base.cursor; + if 
(base.find_among_b(a_43) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.eq_s_b("\u03B1\u03C1\u03C7"))) + { + break lab1; + } + if (base.cursor > base.limit_backward) + { + break lab1; + } + if (!base.slice_from("\u03BF\u03BD\u03C4")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.eq_s_b("\u03BA\u03C1\u03B5"))) + { + return false; + } + if (!base.slice_from("\u03C9\u03BD\u03C4")) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_step5e() { + base.ket = base.cursor; + if (base.find_among_b(a_44) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.eq_s_b("\u03BF\u03BD"))) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03BF\u03BC\u03B1\u03C3\u03C4")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5f() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("\u03B9\u03B5\u03C3\u03C4\u03B5"))) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_45) == 0) + { + break lab0; + } + if (base.cursor > base.limit_backward) + { + break lab0; + } + if (!base.slice_from("\u03B9\u03B5\u03C3\u03C4")) + { + return false; + } + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (!(base.eq_s_b("\u03B5\u03C3\u03C4\u03B5"))) + { + return false; + } + base.bra = base.cursor; + if 
(!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_46) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B9\u03B5\u03C3\u03C4")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5g() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (base.find_among_b(a_47) == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (base.find_among_b(a_50) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_48) == 0) + { + break lab2; + } + if (!base.slice_from("\u03B7\u03BA")) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_49) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B7\u03BA")) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_step5h() { + base.ket = base.cursor; + if (base.find_among_b(a_53) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_51) == 0) + { + break lab1; + } + if (!base.slice_from("\u03BF\u03C5\u03C3")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + base.bra = base.cursor; 
+ if (base.find_among_b(a_52) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03BF\u03C5\u03C3")) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_step5i() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_56) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + base.bra = base.cursor; + if (!(base.eq_s_b("\u03BA\u03BF\u03BB\u03BB"))) + { + break lab1; + } + if (!base.slice_from("\u03B1\u03B3")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + lab2: { + var /** number */ v_2 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + base.bra = base.cursor; + among_var = base.find_among_b(a_54); + if (among_var == 0) + { + break lab3; + } + switch (among_var) { + case 1: + if (!base.slice_from("\u03B1\u03B3")) + { + return false; + } + break; + } + break lab2; + } + base.cursor = base.limit - v_2; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_55) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B1\u03B3")) + { + return false; + } + } + } + return true; + }; + + /** @return {boolean} */ + function r_step5j() { + base.ket = base.cursor; + if (base.find_among_b(a_57) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_58) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B7\u03C3")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5k() { + base.ket = 
base.cursor; + if (base.find_among_b(a_59) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_60) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03B7\u03C3\u03C4")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5l() { + base.ket = base.cursor; + if (base.find_among_b(a_61) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_62) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03BF\u03C5\u03BD")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step5m() { + base.ket = base.cursor; + if (base.find_among_b(a_63) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_test1 = false; + base.ket = base.cursor; + base.bra = base.cursor; + if (base.find_among_b(a_64) == 0) + { + return false; + } + if (base.cursor > base.limit_backward) + { + return false; + } + if (!base.slice_from("\u03BF\u03C5\u03BC")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step6() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (base.find_among_b(a_65) == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_from("\u03BC\u03B1")) + { + return false; + } + } + base.cursor = base.limit - v_1; + if (!B_test1) + { + return false; + } + base.ket = base.cursor; + if (base.find_among_b(a_66) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return 
{boolean} */ + function r_step7() { + base.ket = base.cursor; + if (base.find_among_b(a_67) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_1 = base.limit - base.cursor; + r_tolower(); + base.cursor = base.limit - v_1; + if (!r_has_min_length()) + { + return false; + } + B_test1 = true; + var /** number */ v_2 = base.limit - base.cursor; + r_step1(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_steps1(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_steps2(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_steps3(); + base.cursor = base.limit - v_5; + var /** number */ v_6 = base.limit - base.cursor; + r_steps4(); + base.cursor = base.limit - v_6; + var /** number */ v_7 = base.limit - base.cursor; + r_steps5(); + base.cursor = base.limit - v_7; + var /** number */ v_8 = base.limit - base.cursor; + r_steps6(); + base.cursor = base.limit - v_8; + var /** number */ v_9 = base.limit - base.cursor; + r_steps7(); + base.cursor = base.limit - v_9; + var /** number */ v_10 = base.limit - base.cursor; + r_steps8(); + base.cursor = base.limit - v_10; + var /** number */ v_11 = base.limit - base.cursor; + r_steps9(); + base.cursor = base.limit - v_11; + var /** number */ v_12 = base.limit - base.cursor; + r_steps10(); + base.cursor = base.limit - v_12; + var /** number */ v_13 = base.limit - base.cursor; + r_step2a(); + base.cursor = base.limit - v_13; + var /** number */ v_14 = base.limit - base.cursor; + r_step2b(); + base.cursor = base.limit - v_14; + var /** number */ v_15 = base.limit - base.cursor; + r_step2c(); + base.cursor = base.limit - v_15; + var /** number */ v_16 = base.limit - base.cursor; + r_step2d(); + base.cursor = 
base.limit - v_16; + var /** number */ v_17 = base.limit - base.cursor; + r_step3(); + base.cursor = base.limit - v_17; + var /** number */ v_18 = base.limit - base.cursor; + r_step4(); + base.cursor = base.limit - v_18; + var /** number */ v_19 = base.limit - base.cursor; + r_step5a(); + base.cursor = base.limit - v_19; + var /** number */ v_20 = base.limit - base.cursor; + r_step5b(); + base.cursor = base.limit - v_20; + var /** number */ v_21 = base.limit - base.cursor; + r_step5c(); + base.cursor = base.limit - v_21; + var /** number */ v_22 = base.limit - base.cursor; + r_step5d(); + base.cursor = base.limit - v_22; + var /** number */ v_23 = base.limit - base.cursor; + r_step5e(); + base.cursor = base.limit - v_23; + var /** number */ v_24 = base.limit - base.cursor; + r_step5f(); + base.cursor = base.limit - v_24; + var /** number */ v_25 = base.limit - base.cursor; + r_step5g(); + base.cursor = base.limit - v_25; + var /** number */ v_26 = base.limit - base.cursor; + r_step5h(); + base.cursor = base.limit - v_26; + var /** number */ v_27 = base.limit - base.cursor; + r_step5j(); + base.cursor = base.limit - v_27; + var /** number */ v_28 = base.limit - base.cursor; + r_step5i(); + base.cursor = base.limit - v_28; + var /** number */ v_29 = base.limit - base.cursor; + r_step5k(); + base.cursor = base.limit - v_29; + var /** number */ v_30 = base.limit - base.cursor; + r_step5l(); + base.cursor = base.limit - v_30; + var /** number */ v_31 = base.limit - base.cursor; + r_step5m(); + base.cursor = base.limit - v_31; + var /** number */ v_32 = base.limit - base.cursor; + r_step6(); + base.cursor = base.limit - v_32; + var /** number */ v_33 = base.limit - base.cursor; + r_step7(); + base.cursor = base.limit - v_33; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['GreekStemmer'] = GreekStemmer; 
diff --git a/js/hindi-stemmer.js b/js/hindi-stemmer.js new file mode 100644 index 0000000..20bf6f6 --- /dev/null +++ b/js/hindi-stemmer.js @@ -0,0 +1,183 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var HindiStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0906\u0901", -1, -1], + ["\u093E\u0901", -1, -1], + ["\u0907\u092F\u093E\u0901", 1, -1], + ["\u0906\u0907\u092F\u093E\u0901", 2, -1], + ["\u093E\u0907\u092F\u093E\u0901", 2, -1], + ["\u093F\u092F\u093E\u0901", 1, -1], + ["\u0906\u0902", -1, -1], + ["\u0909\u0906\u0902", 6, -1], + ["\u0941\u0906\u0902", 6, -1], + ["\u0908\u0902", -1, -1], + ["\u0906\u0908\u0902", 9, -1], + ["\u093E\u0908\u0902", 9, -1], + ["\u090F\u0902", -1, -1], + ["\u0906\u090F\u0902", 12, -1], + ["\u0909\u090F\u0902", 12, -1], + ["\u093E\u090F\u0902", 12, -1], + ["\u0924\u093E\u090F\u0902", 15, -1, r_CONSONANT], + ["\u0905\u0924\u093E\u090F\u0902", 16, -1], + ["\u0928\u093E\u090F\u0902", 15, -1, r_CONSONANT], + ["\u0905\u0928\u093E\u090F\u0902", 18, -1], + ["\u0941\u090F\u0902", 12, -1], + ["\u0913\u0902", -1, -1], + ["\u0906\u0913\u0902", 21, -1], + ["\u0909\u0913\u0902", 21, -1], + ["\u093E\u0913\u0902", 21, -1], + ["\u0924\u093E\u0913\u0902", 24, -1, r_CONSONANT], + ["\u0905\u0924\u093E\u0913\u0902", 25, -1], + ["\u0928\u093E\u0913\u0902", 24, -1, r_CONSONANT], + ["\u0905\u0928\u093E\u0913\u0902", 27, -1], + ["\u0941\u0913\u0902", 21, -1], + ["\u093E\u0902", -1, -1], + ["\u0907\u092F\u093E\u0902", 30, -1], + ["\u0906\u0907\u092F\u093E\u0902", 31, -1], + ["\u093E\u0907\u092F\u093E\u0902", 31, -1], + ["\u093F\u092F\u093E\u0902", 30, -1], + ["\u0940\u0902", -1, -1], + ["\u0924\u0940\u0902", 35, -1, r_CONSONANT], + ["\u0905\u0924\u0940\u0902", 36, -1], + ["\u0906\u0924\u0940\u0902", 36, -1], + ["\u093E\u0924\u0940\u0902", 36, -1], + ["\u0947\u0902", -1, -1], + ["\u094B\u0902", -1, -1], + ["\u0907\u092F\u094B\u0902", 41, -1], + ["\u0906\u0907\u092F\u094B\u0902", 42, 
-1], + ["\u093E\u0907\u092F\u094B\u0902", 42, -1], + ["\u093F\u092F\u094B\u0902", 41, -1], + ["\u0905", -1, -1], + ["\u0906", -1, -1], + ["\u0907", -1, -1], + ["\u0908", -1, -1], + ["\u0906\u0908", 49, -1], + ["\u093E\u0908", 49, -1], + ["\u0909", -1, -1], + ["\u090A", -1, -1], + ["\u090F", -1, -1], + ["\u0906\u090F", 54, -1], + ["\u0907\u090F", 54, -1], + ["\u0906\u0907\u090F", 56, -1], + ["\u093E\u0907\u090F", 56, -1], + ["\u093E\u090F", 54, -1], + ["\u093F\u090F", 54, -1], + ["\u0913", -1, -1], + ["\u0906\u0913", 61, -1], + ["\u093E\u0913", 61, -1], + ["\u0915\u0930", -1, -1, r_CONSONANT], + ["\u0905\u0915\u0930", 64, -1], + ["\u0906\u0915\u0930", 64, -1], + ["\u093E\u0915\u0930", 64, -1], + ["\u093E", -1, -1], + ["\u090A\u0902\u0917\u093E", 68, -1], + ["\u0906\u090A\u0902\u0917\u093E", 69, -1], + ["\u093E\u090A\u0902\u0917\u093E", 69, -1], + ["\u0942\u0902\u0917\u093E", 68, -1], + ["\u090F\u0917\u093E", 68, -1], + ["\u0906\u090F\u0917\u093E", 73, -1], + ["\u093E\u090F\u0917\u093E", 73, -1], + ["\u0947\u0917\u093E", 68, -1], + ["\u0924\u093E", 68, -1, r_CONSONANT], + ["\u0905\u0924\u093E", 77, -1], + ["\u0906\u0924\u093E", 77, -1], + ["\u093E\u0924\u093E", 77, -1], + ["\u0928\u093E", 68, -1, r_CONSONANT], + ["\u0905\u0928\u093E", 81, -1], + ["\u0906\u0928\u093E", 81, -1], + ["\u093E\u0928\u093E", 81, -1], + ["\u0906\u092F\u093E", 68, -1], + ["\u093E\u092F\u093E", 68, -1], + ["\u093F", -1, -1], + ["\u0940", -1, -1], + ["\u090A\u0902\u0917\u0940", 88, -1], + ["\u0906\u090A\u0902\u0917\u0940", 89, -1], + ["\u093E\u090A\u0902\u0917\u0940", 89, -1], + ["\u090F\u0902\u0917\u0940", 88, -1], + ["\u0906\u090F\u0902\u0917\u0940", 92, -1], + ["\u093E\u090F\u0902\u0917\u0940", 92, -1], + ["\u0942\u0902\u0917\u0940", 88, -1], + ["\u0947\u0902\u0917\u0940", 88, -1], + ["\u090F\u0917\u0940", 88, -1], + ["\u0906\u090F\u0917\u0940", 97, -1], + ["\u093E\u090F\u0917\u0940", 97, -1], + ["\u0913\u0917\u0940", 88, -1], + ["\u0906\u0913\u0917\u0940", 100, -1], + 
["\u093E\u0913\u0917\u0940", 100, -1], + ["\u0947\u0917\u0940", 88, -1], + ["\u094B\u0917\u0940", 88, -1], + ["\u0924\u0940", 88, -1, r_CONSONANT], + ["\u0905\u0924\u0940", 105, -1], + ["\u0906\u0924\u0940", 105, -1], + ["\u093E\u0924\u0940", 105, -1], + ["\u0928\u0940", 88, -1, r_CONSONANT], + ["\u0905\u0928\u0940", 109, -1], + ["\u0941", -1, -1], + ["\u0942", -1, -1], + ["\u0947", -1, -1], + ["\u090F\u0902\u0917\u0947", 113, -1], + ["\u0906\u090F\u0902\u0917\u0947", 114, -1], + ["\u093E\u090F\u0902\u0917\u0947", 114, -1], + ["\u0947\u0902\u0917\u0947", 113, -1], + ["\u0913\u0917\u0947", 113, -1], + ["\u0906\u0913\u0917\u0947", 118, -1], + ["\u093E\u0913\u0917\u0947", 118, -1], + ["\u094B\u0917\u0947", 113, -1], + ["\u0924\u0947", 113, -1, r_CONSONANT], + ["\u0905\u0924\u0947", 122, -1], + ["\u0906\u0924\u0947", 122, -1], + ["\u093E\u0924\u0947", 122, -1], + ["\u0928\u0947", 113, -1, r_CONSONANT], + ["\u0905\u0928\u0947", 126, -1], + ["\u0906\u0928\u0947", 126, -1], + ["\u093E\u0928\u0947", 126, -1], + ["\u094B", -1, -1], + ["\u094D", -1, -1] + ]; + + /** @const */ var /** Array */ g_consonant = [255, 255, 255, 255, 159, 0, 0, 0, 248, 7]; + + + + /** @return {boolean} */ + function r_CONSONANT() { + if (!(base.in_grouping_b(g_consonant, 2325, 2399))) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + base.limit_backward = base.cursor; base.cursor = base.limit; + base.ket = base.cursor; + if (base.find_among_b(a_0) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['HindiStemmer'] = HindiStemmer; diff --git a/js/hungarian-stemmer.js b/js/hungarian-stemmer.js new file mode 
100644 index 0000000..d1c8c56 --- /dev/null +++ b/js/hungarian-stemmer.js @@ -0,0 +1,709 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var HungarianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["cs", -1, -1], + ["dzs", -1, -1], + ["gy", -1, -1], + ["ly", -1, -1], + ["ny", -1, -1], + ["sz", -1, -1], + ["ty", -1, -1], + ["zs", -1, -1] + ]; + + /** @const */ var a_1 = [ + ["\u00E1", -1, 1], + ["\u00E9", -1, 2] + ]; + + /** @const */ var a_2 = [ + ["bb", -1, -1], + ["cc", -1, -1], + ["dd", -1, -1], + ["ff", -1, -1], + ["gg", -1, -1], + ["jj", -1, -1], + ["kk", -1, -1], + ["ll", -1, -1], + ["mm", -1, -1], + ["nn", -1, -1], + ["pp", -1, -1], + ["rr", -1, -1], + ["ccs", -1, -1], + ["ss", -1, -1], + ["zzs", -1, -1], + ["tt", -1, -1], + ["vv", -1, -1], + ["ggy", -1, -1], + ["lly", -1, -1], + ["nny", -1, -1], + ["tty", -1, -1], + ["ssz", -1, -1], + ["zz", -1, -1] + ]; + + /** @const */ var a_3 = [ + ["al", -1, 1], + ["el", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["ba", -1, -1], + ["ra", -1, -1], + ["be", -1, -1], + ["re", -1, -1], + ["ig", -1, -1], + ["nak", -1, -1], + ["nek", -1, -1], + ["val", -1, -1], + ["vel", -1, -1], + ["ul", -1, -1], + ["n\u00E1l", -1, -1], + ["n\u00E9l", -1, -1], + ["b\u00F3l", -1, -1], + ["r\u00F3l", -1, -1], + ["t\u00F3l", -1, -1], + ["\u00FCl", -1, -1], + ["b\u0151l", -1, -1], + ["r\u0151l", -1, -1], + ["t\u0151l", -1, -1], + ["n", -1, -1], + ["an", 19, -1], + ["ban", 20, -1], + ["en", 19, -1], + ["ben", 22, -1], + ["k\u00E9ppen", 22, -1], + ["on", 19, -1], + ["\u00F6n", 19, -1], + ["k\u00E9pp", -1, -1], + ["kor", -1, -1], + ["t", -1, -1], + ["at", 29, -1], + ["et", 29, -1], + ["k\u00E9nt", 29, -1], + ["ank\u00E9nt", 32, -1], + ["enk\u00E9nt", 32, -1], + ["onk\u00E9nt", 32, -1], + ["ot", 29, -1], + ["\u00E9rt", 29, -1], + ["\u00F6t", 29, -1], + ["hez", -1, -1], + ["hoz", -1, -1], + ["h\u00F6z", -1, -1], + ["v\u00E1", -1, -1], + ["v\u00E9", -1, -1] + ]; + + /** 
@const */ var a_5 = [ + ["\u00E1n", -1, 2], + ["\u00E9n", -1, 1], + ["\u00E1nk\u00E9nt", -1, 2] + ]; + + /** @const */ var a_6 = [ + ["stul", -1, 1], + ["astul", 0, 1], + ["\u00E1stul", 0, 2], + ["st\u00FCl", -1, 1], + ["est\u00FCl", 3, 1], + ["\u00E9st\u00FCl", 3, 3] + ]; + + /** @const */ var a_7 = [ + ["\u00E1", -1, 1], + ["\u00E9", -1, 1] + ]; + + /** @const */ var a_8 = [ + ["k", -1, 3], + ["ak", 0, 3], + ["ek", 0, 3], + ["ok", 0, 3], + ["\u00E1k", 0, 1], + ["\u00E9k", 0, 2], + ["\u00F6k", 0, 3] + ]; + + /** @const */ var a_9 = [ + ["\u00E9i", -1, 1], + ["\u00E1\u00E9i", 0, 3], + ["\u00E9\u00E9i", 0, 2], + ["\u00E9", -1, 1], + ["k\u00E9", 3, 1], + ["ak\u00E9", 4, 1], + ["ek\u00E9", 4, 1], + ["ok\u00E9", 4, 1], + ["\u00E1k\u00E9", 4, 3], + ["\u00E9k\u00E9", 4, 2], + ["\u00F6k\u00E9", 4, 1], + ["\u00E9\u00E9", 3, 2] + ]; + + /** @const */ var a_10 = [ + ["a", -1, 1], + ["ja", 0, 1], + ["d", -1, 1], + ["ad", 2, 1], + ["ed", 2, 1], + ["od", 2, 1], + ["\u00E1d", 2, 2], + ["\u00E9d", 2, 3], + ["\u00F6d", 2, 1], + ["e", -1, 1], + ["je", 9, 1], + ["nk", -1, 1], + ["unk", 11, 1], + ["\u00E1nk", 11, 2], + ["\u00E9nk", 11, 3], + ["\u00FCnk", 11, 1], + ["uk", -1, 1], + ["juk", 16, 1], + ["\u00E1juk", 17, 2], + ["\u00FCk", -1, 1], + ["j\u00FCk", 19, 1], + ["\u00E9j\u00FCk", 20, 3], + ["m", -1, 1], + ["am", 22, 1], + ["em", 22, 1], + ["om", 22, 1], + ["\u00E1m", 22, 2], + ["\u00E9m", 22, 3], + ["o", -1, 1], + ["\u00E1", -1, 2], + ["\u00E9", -1, 3] + ]; + + /** @const */ var a_11 = [ + ["id", -1, 1], + ["aid", 0, 1], + ["jaid", 1, 1], + ["eid", 0, 1], + ["jeid", 3, 1], + ["\u00E1id", 0, 2], + ["\u00E9id", 0, 3], + ["i", -1, 1], + ["ai", 7, 1], + ["jai", 8, 1], + ["ei", 7, 1], + ["jei", 10, 1], + ["\u00E1i", 7, 2], + ["\u00E9i", 7, 3], + ["itek", -1, 1], + ["eitek", 14, 1], + ["jeitek", 15, 1], + ["\u00E9itek", 14, 3], + ["ik", -1, 1], + ["aik", 18, 1], + ["jaik", 19, 1], + ["eik", 18, 1], + ["jeik", 21, 1], + ["\u00E1ik", 18, 2], + ["\u00E9ik", 18, 3], + ["ink", -1, 1], + 
["aink", 25, 1], + ["jaink", 26, 1], + ["eink", 25, 1], + ["jeink", 28, 1], + ["\u00E1ink", 25, 2], + ["\u00E9ink", 25, 3], + ["aitok", -1, 1], + ["jaitok", 32, 1], + ["\u00E1itok", -1, 2], + ["im", -1, 1], + ["aim", 35, 1], + ["jaim", 36, 1], + ["eim", 35, 1], + ["jeim", 38, 1], + ["\u00E1im", 35, 2], + ["\u00E9im", 35, 3] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 36, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]; + + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + lab0: { + var /** number */ v_1 = base.cursor; + lab1: { + if (!(base.in_grouping(g_v, 97, 369))) + { + break lab1; + } + golab2: while(true) + { + var /** number */ v_2 = base.cursor; + lab3: { + if (!(base.out_grouping(g_v, 97, 369))) + { + break lab3; + } + base.cursor = v_2; + break golab2; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + lab4: { + var /** number */ v_3 = base.cursor; + lab5: { + if (base.find_among(a_0) == 0) + { + break lab5; + } + break lab4; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + I_p1 = base.cursor; + break lab0; + } + base.cursor = v_1; + if (!(base.out_grouping(g_v, 97, 369))) + { + return false; + } + golab6: while(true) + { + lab7: { + if (!(base.in_grouping(g_v, 97, 369))) + { + break lab7; + } + break golab6; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_v_ending() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_1); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if 
(!base.slice_from("a")) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_double() { + var /** number */ v_1 = base.limit - base.cursor; + if (base.find_among_b(a_2) == 0) + { + return false; + } + base.cursor = base.limit - v_1; + return true; + }; + + /** @return {boolean} */ + function r_undouble() { + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.ket = base.cursor; + { + var /** number */ c1 = base.cursor - 1; + if (c1 < base.limit_backward) + { + return false; + } + base.cursor = c1; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_instrum() { + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + if (!r_double()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + if (!r_undouble()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_case() { + base.ket = base.cursor; + if (base.find_among_b(a_4) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + if (!r_v_ending()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_case_special() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("e")) + { + return false; + } + break; + case 2: + if (!base.slice_from("a")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_case_other() { + var /** number */ among_var; + base.ket = 
base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("a")) + { + return false; + } + break; + case 3: + if (!base.slice_from("e")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_factive() { + base.ket = base.cursor; + if (base.find_among_b(a_7) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + if (!r_double()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + if (!r_undouble()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_plural() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_8); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("a")) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_owned() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_9); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + case 3: + if (!base.slice_from("a")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_sing_owner() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_10); + if (among_var == 0) + { + 
return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("a")) + { + return false; + } + break; + case 3: + if (!base.slice_from("e")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_plur_owner() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_11); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("a")) + { + return false; + } + break; + case 3: + if (!base.slice_from("e")) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_mark_regions(); + base.cursor = v_1; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_instrum(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_case(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_case_special(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_case_other(); + base.cursor = base.limit - v_5; + var /** number */ v_6 = base.limit - base.cursor; + r_factive(); + base.cursor = base.limit - v_6; + var /** number */ v_7 = base.limit - base.cursor; + r_owned(); + base.cursor = base.limit - v_7; + var /** number */ v_8 = base.limit - base.cursor; + r_sing_owner(); + base.cursor = base.limit - v_8; + var /** number */ v_9 = base.limit - base.cursor; + r_plur_owner(); + base.cursor = base.limit - v_9; + var /** number */ v_10 = base.limit - base.cursor; + r_plural(); + base.cursor = base.limit - v_10; + base.cursor = 
base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['HungarianStemmer'] = HungarianStemmer; diff --git a/js/indonesian-stemmer.js b/js/indonesian-stemmer.js new file mode 100644 index 0000000..eb9663b --- /dev/null +++ b/js/indonesian-stemmer.js @@ -0,0 +1,422 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var IndonesianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["kah", -1, 1], + ["lah", -1, 1], + ["pun", -1, 1] + ]; + + /** @const */ var a_1 = [ + ["nya", -1, 1], + ["ku", -1, 1], + ["mu", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["i", -1, 1, r_SUFFIX_I_OK], + ["an", -1, 1, r_SUFFIX_AN_OK], + ["kan", 1, 1, r_SUFFIX_KAN_OK] + ]; + + /** @const */ var a_3 = [ + ["di", -1, 1], + ["ke", -1, 2], + ["me", -1, 1], + ["mem", 2, 5], + ["men", 2, 1], + ["meng", 4, 1], + ["meny", 4, 3, r_VOWEL], + ["pem", -1, 6], + ["pen", -1, 2], + ["peng", 8, 2], + ["peny", 8, 4, r_VOWEL], + ["ter", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["be", -1, 3, r_KER], + ["belajar", 0, 4], + ["ber", 0, 3], + ["pe", -1, 1], + ["pelajar", 3, 2], + ["per", 3, 1] + ]; + + /** @const */ var /** Array */ g_vowel = [17, 65, 16]; + + var /** number */ I_prefix = 0; + var /** number */ I_measure = 0; + + + /** @return {boolean} */ + function r_remove_particle() { + base.ket = base.cursor; + if (base.find_among_b(a_0) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + I_measure -= 1; + return true; + }; + + /** @return {boolean} */ + function r_remove_possessive_pronoun() { + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + I_measure -= 1; + return true; + }; + + /** @return {boolean} */ + function 
r_SUFFIX_KAN_OK() { + if (I_prefix == 3) + { + return false; + } + if (I_prefix == 2) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_SUFFIX_AN_OK() { + return I_prefix != 1; + }; + + /** @return {boolean} */ + function r_SUFFIX_I_OK() { + if (I_prefix > 2) + { + return false; + } + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("s"))) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_1; + } + return true; + }; + + /** @return {boolean} */ + function r_remove_suffix() { + base.ket = base.cursor; + if (base.find_among_b(a_2) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + I_measure -= 1; + return true; + }; + + /** @return {boolean} */ + function r_VOWEL() { + if (!(base.in_grouping(g_vowel, 97, 117))) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_KER() { + if (!(base.out_grouping(g_vowel, 97, 117))) + { + return false; + } + if (!(base.eq_s("er"))) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_remove_first_order_prefix() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_3); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + I_prefix = 1; + I_measure -= 1; + break; + case 2: + if (!base.slice_del()) + { + return false; + } + I_prefix = 3; + I_measure -= 1; + break; + case 3: + I_prefix = 1; + if (!base.slice_from("s")) + { + return false; + } + I_measure -= 1; + break; + case 4: + I_prefix = 3; + if (!base.slice_from("s")) + { + return false; + } + I_measure -= 1; + break; + case 5: + I_prefix = 1; + I_measure -= 1; + lab0: { + var /** number */ v_1 = base.cursor; + lab1: { + var /** number */ v_2 = base.cursor; + if (!(base.in_grouping(g_vowel, 97, 117))) + { + break lab1; + } + 
base.cursor = v_2; + if (!base.slice_from("p")) + { + return false; + } + break lab0; + } + base.cursor = v_1; + if (!base.slice_del()) + { + return false; + } + } + break; + case 6: + I_prefix = 3; + I_measure -= 1; + lab2: { + var /** number */ v_3 = base.cursor; + lab3: { + var /** number */ v_4 = base.cursor; + if (!(base.in_grouping(g_vowel, 97, 117))) + { + break lab3; + } + base.cursor = v_4; + if (!base.slice_from("p")) + { + return false; + } + break lab2; + } + base.cursor = v_3; + if (!base.slice_del()) + { + return false; + } + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_remove_second_order_prefix() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_4); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + I_prefix = 2; + I_measure -= 1; + break; + case 2: + if (!base.slice_from("ajar")) + { + return false; + } + I_measure -= 1; + break; + case 3: + if (!base.slice_del()) + { + return false; + } + I_prefix = 4; + I_measure -= 1; + break; + case 4: + if (!base.slice_from("ajar")) + { + return false; + } + I_prefix = 4; + I_measure -= 1; + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + I_measure = 0; + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + golab2: while(true) + { + lab3: { + if (!(base.in_grouping(g_vowel, 97, 117))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + I_measure += 1; + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + if (I_measure <= 2) + { + return false; + } + I_prefix = 0; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_4 = base.limit - base.cursor; + r_remove_particle(); + base.cursor = base.limit - v_4; + if 
(I_measure <= 2) + { + return false; + } + var /** number */ v_5 = base.limit - base.cursor; + r_remove_possessive_pronoun(); + base.cursor = base.limit - v_5; + base.cursor = base.limit_backward; + if (I_measure <= 2) + { + return false; + } + lab4: { + var /** number */ v_6 = base.cursor; + lab5: { + var /** number */ v_7 = base.cursor; + if (!r_remove_first_order_prefix()) + { + break lab5; + } + var /** number */ v_8 = base.cursor; + lab6: { + var /** number */ v_9 = base.cursor; + if (I_measure <= 2) + { + break lab6; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + if (!r_remove_suffix()) + { + break lab6; + } + base.cursor = base.limit_backward; + base.cursor = v_9; + if (I_measure <= 2) + { + break lab6; + } + if (!r_remove_second_order_prefix()) + { + break lab6; + } + } + base.cursor = v_8; + base.cursor = v_7; + break lab4; + } + base.cursor = v_6; + var /** number */ v_10 = base.cursor; + r_remove_second_order_prefix(); + base.cursor = v_10; + var /** number */ v_11 = base.cursor; + lab7: { + if (I_measure <= 2) + { + break lab7; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + if (!r_remove_suffix()) + { + break lab7; + } + base.cursor = base.limit_backward; + } + base.cursor = v_11; + } + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['IndonesianStemmer'] = IndonesianStemmer; diff --git a/js/irish-stemmer.js b/js/irish-stemmer.js new file mode 100644 index 0000000..9051a97 --- /dev/null +++ b/js/irish-stemmer.js @@ -0,0 +1,419 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var IrishStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["b'", -1, 1], + ["bh", -1, 4], + ["bhf", 1, 2], + ["bp", -1, 8], + ["ch", -1, 5], + ["d'", -1, 1], + ["d'fh", 5, 2], + ["dh", -1, 6], + ["dt", -1, 9], + ["fh", -1, 2], + ["gc", -1, 
5], + ["gh", -1, 7], + ["h-", -1, 1], + ["m'", -1, 1], + ["mb", -1, 4], + ["mh", -1, 10], + ["n-", -1, 1], + ["nd", -1, 6], + ["ng", -1, 7], + ["ph", -1, 8], + ["sh", -1, 3], + ["t-", -1, 1], + ["th", -1, 9], + ["ts", -1, 3] + ]; + + /** @const */ var a_1 = [ + ["\u00EDochta", -1, 1], + ["a\u00EDochta", 0, 1], + ["ire", -1, 2], + ["aire", 2, 2], + ["abh", -1, 1], + ["eabh", 4, 1], + ["ibh", -1, 1], + ["aibh", 6, 1], + ["amh", -1, 1], + ["eamh", 8, 1], + ["imh", -1, 1], + ["aimh", 10, 1], + ["\u00EDocht", -1, 1], + ["a\u00EDocht", 12, 1], + ["ir\u00ED", -1, 2], + ["air\u00ED", 14, 2] + ]; + + /** @const */ var a_2 = [ + ["\u00F3ideacha", -1, 6], + ["patacha", -1, 5], + ["achta", -1, 1], + ["arcachta", 2, 2], + ["eachta", 2, 1], + ["grafa\u00EDochta", -1, 4], + ["paite", -1, 5], + ["ach", -1, 1], + ["each", 7, 1], + ["\u00F3ideach", 8, 6], + ["gineach", 8, 3], + ["patach", 7, 5], + ["grafa\u00EDoch", -1, 4], + ["pataigh", -1, 5], + ["\u00F3idigh", -1, 6], + ["acht\u00FAil", -1, 1], + ["eacht\u00FAil", 15, 1], + ["gineas", -1, 3], + ["ginis", -1, 3], + ["acht", -1, 1], + ["arcacht", 19, 2], + ["eacht", 19, 1], + ["grafa\u00EDocht", -1, 4], + ["arcachta\u00ED", -1, 2], + ["grafa\u00EDochta\u00ED", -1, 4] + ]; + + /** @const */ var a_3 = [ + ["imid", -1, 1], + ["aimid", 0, 1], + ["\u00EDmid", -1, 1], + ["a\u00EDmid", 2, 1], + ["adh", -1, 2], + ["eadh", 4, 2], + ["faidh", -1, 1], + ["fidh", -1, 1], + ["\u00E1il", -1, 2], + ["ain", -1, 2], + ["tear", -1, 2], + ["tar", -1, 2] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 2]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + lab2: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab2; + } + break golab1; + } + if 
(base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_pV = base.cursor; + golab3: while(true) + { + lab4: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab4; + } + break golab3; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p1 = base.cursor; + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_initial_morph() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_0); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("f")) + { + return false; + } + break; + case 3: + if (!base.slice_from("s")) + { + return false; + } + break; + case 4: + if (!base.slice_from("b")) + { + return false; + } + break; + case 5: + if (!base.slice_from("c")) + { + return false; + } + break; + case 6: + if (!base.slice_from("d")) + { + return false; + } + break; + case 7: + if (!base.slice_from("g")) + { + return false; + } + break; + case 8: + if (!base.slice_from("p")) + { + return false; + } + break; + case 9: + if (!base.slice_from("t")) + { + return false; + } + break; + case 10: + if (!base.slice_from("m")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= 
base.cursor; + }; + + /** @return {boolean} */ + function r_noun_sfx() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_1); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_deriv() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("arc")) + { + return false; + } + break; + case 3: + if (!base.slice_from("gin")) + { + return false; + } + break; + case 4: + if (!base.slice_from("graf")) + { + return false; + } + break; + case 5: + if (!base.slice_from("paite")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u00F3id")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_verb_sfx() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_initial_morph(); + base.cursor = v_1; + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_3 = base.limit - 
base.cursor; + r_noun_sfx(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_deriv(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_verb_sfx(); + base.cursor = base.limit - v_5; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['IrishStemmer'] = IrishStemmer; diff --git a/js/italian-stemmer.js b/js/italian-stemmer.js new file mode 100644 index 0000000..193cbf8 --- /dev/null +++ b/js/italian-stemmer.js @@ -0,0 +1,1008 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var ItalianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["", -1, 7], + ["qu", 0, 6], + ["\u00E1", 0, 1], + ["\u00E9", 0, 2], + ["\u00ED", 0, 3], + ["\u00F3", 0, 4], + ["\u00FA", 0, 5] + ]; + + /** @const */ var a_1 = [ + ["", -1, 3], + ["I", 0, 1], + ["U", 0, 2] + ]; + + /** @const */ var a_2 = [ + ["la", -1, -1], + ["cela", 0, -1], + ["gliela", 0, -1], + ["mela", 0, -1], + ["tela", 0, -1], + ["vela", 0, -1], + ["le", -1, -1], + ["cele", 6, -1], + ["gliele", 6, -1], + ["mele", 6, -1], + ["tele", 6, -1], + ["vele", 6, -1], + ["ne", -1, -1], + ["cene", 12, -1], + ["gliene", 12, -1], + ["mene", 12, -1], + ["sene", 12, -1], + ["tene", 12, -1], + ["vene", 12, -1], + ["ci", -1, -1], + ["li", -1, -1], + ["celi", 20, -1], + ["glieli", 20, -1], + ["meli", 20, -1], + ["teli", 20, -1], + ["veli", 20, -1], + ["gli", 20, -1], + ["mi", -1, -1], + ["si", -1, -1], + ["ti", -1, -1], + ["vi", -1, -1], + ["lo", -1, -1], + ["celo", 31, -1], + ["glielo", 31, -1], + ["melo", 31, -1], + ["telo", 31, -1], + ["velo", 31, -1] + ]; + + /** @const */ var a_3 = [ + ["ando", -1, 1], + ["endo", -1, 1], + ["ar", -1, 2], + ["er", -1, 2], + ["ir", -1, 2] + ]; + + /** @const */ var a_4 = [ + ["ic", -1, -1], + 
["abil", -1, -1], + ["os", -1, -1], + ["iv", -1, 1] + ]; + + /** @const */ var a_5 = [ + ["ic", -1, 1], + ["abil", -1, 1], + ["iv", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["ica", -1, 1], + ["logia", -1, 3], + ["osa", -1, 1], + ["ista", -1, 1], + ["iva", -1, 9], + ["anza", -1, 1], + ["enza", -1, 5], + ["ice", -1, 1], + ["atrice", 7, 1], + ["iche", -1, 1], + ["logie", -1, 3], + ["abile", -1, 1], + ["ibile", -1, 1], + ["usione", -1, 4], + ["azione", -1, 2], + ["uzione", -1, 4], + ["atore", -1, 2], + ["ose", -1, 1], + ["ante", -1, 1], + ["mente", -1, 1], + ["amente", 19, 7], + ["iste", -1, 1], + ["ive", -1, 9], + ["anze", -1, 1], + ["enze", -1, 5], + ["ici", -1, 1], + ["atrici", 25, 1], + ["ichi", -1, 1], + ["abili", -1, 1], + ["ibili", -1, 1], + ["ismi", -1, 1], + ["usioni", -1, 4], + ["azioni", -1, 2], + ["uzioni", -1, 4], + ["atori", -1, 2], + ["osi", -1, 1], + ["anti", -1, 1], + ["amenti", -1, 6], + ["imenti", -1, 6], + ["isti", -1, 1], + ["ivi", -1, 9], + ["ico", -1, 1], + ["ismo", -1, 1], + ["oso", -1, 1], + ["amento", -1, 6], + ["imento", -1, 6], + ["ivo", -1, 9], + ["it\u00E0", -1, 8], + ["ist\u00E0", -1, 1], + ["ist\u00E8", -1, 1], + ["ist\u00EC", -1, 1] + ]; + + /** @const */ var a_7 = [ + ["isca", -1, 1], + ["enda", -1, 1], + ["ata", -1, 1], + ["ita", -1, 1], + ["uta", -1, 1], + ["ava", -1, 1], + ["eva", -1, 1], + ["iva", -1, 1], + ["erebbe", -1, 1], + ["irebbe", -1, 1], + ["isce", -1, 1], + ["ende", -1, 1], + ["are", -1, 1], + ["ere", -1, 1], + ["ire", -1, 1], + ["asse", -1, 1], + ["ate", -1, 1], + ["avate", 16, 1], + ["evate", 16, 1], + ["ivate", 16, 1], + ["ete", -1, 1], + ["erete", 20, 1], + ["irete", 20, 1], + ["ite", -1, 1], + ["ereste", -1, 1], + ["ireste", -1, 1], + ["ute", -1, 1], + ["erai", -1, 1], + ["irai", -1, 1], + ["isci", -1, 1], + ["endi", -1, 1], + ["erei", -1, 1], + ["irei", -1, 1], + ["assi", -1, 1], + ["ati", -1, 1], + ["iti", -1, 1], + ["eresti", -1, 1], + ["iresti", -1, 1], + ["uti", -1, 1], + ["avi", -1, 1], + ["evi", -1, 1], + 
["ivi", -1, 1], + ["isco", -1, 1], + ["ando", -1, 1], + ["endo", -1, 1], + ["Yamo", -1, 1], + ["iamo", -1, 1], + ["avamo", -1, 1], + ["evamo", -1, 1], + ["ivamo", -1, 1], + ["eremo", -1, 1], + ["iremo", -1, 1], + ["assimo", -1, 1], + ["ammo", -1, 1], + ["emmo", -1, 1], + ["eremmo", 54, 1], + ["iremmo", 54, 1], + ["immo", -1, 1], + ["ano", -1, 1], + ["iscano", 58, 1], + ["avano", 58, 1], + ["evano", 58, 1], + ["ivano", 58, 1], + ["eranno", -1, 1], + ["iranno", -1, 1], + ["ono", -1, 1], + ["iscono", 65, 1], + ["arono", 65, 1], + ["erono", 65, 1], + ["irono", 65, 1], + ["erebbero", -1, 1], + ["irebbero", -1, 1], + ["assero", -1, 1], + ["essero", -1, 1], + ["issero", -1, 1], + ["ato", -1, 1], + ["ito", -1, 1], + ["uto", -1, 1], + ["avo", -1, 1], + ["evo", -1, 1], + ["ivo", -1, 1], + ["ar", -1, 1], + ["ir", -1, 1], + ["er\u00E0", -1, 1], + ["ir\u00E0", -1, 1], + ["er\u00F2", -1, 1], + ["ir\u00F2", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2, 1]; + + /** @const */ var /** Array */ g_AEIO = [17, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2]; + + /** @const */ var /** Array */ g_CG = [17]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_prelude() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + while(true) + { + var /** number */ v_2 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u00E0")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u00E8")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u00EC")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u00F2")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u00F9")) + { + return false; + } + break; + case 6: + if 
(!base.slice_from("qU")) + { + return false; + } + break; + case 7: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_2; + break; + } + base.cursor = v_1; + while(true) + { + var /** number */ v_3 = base.cursor; + lab1: { + golab2: while(true) + { + var /** number */ v_4 = base.cursor; + lab3: { + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab3; + } + base.bra = base.cursor; + lab4: { + var /** number */ v_5 = base.cursor; + lab5: { + if (!(base.eq_s("u"))) + { + break lab5; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab5; + } + if (!base.slice_from("U")) + { + return false; + } + break lab4; + } + base.cursor = v_5; + if (!(base.eq_s("i"))) + { + break lab3; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab3; + } + if (!base.slice_from("I")) + { + return false; + } + } + base.cursor = v_4; + break golab2; + } + base.cursor = v_4; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + continue; + } + base.cursor = v_3; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab2; + } + lab3: { + var /** number */ v_3 = base.cursor; + lab4: { + if (!(base.out_grouping(g_v, 97, 249))) + { + break lab4; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab4; + } + base.cursor++; + } + break lab3; + } + base.cursor = v_3; + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab2; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 249))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= 
base.limit) + { + break lab2; + } + base.cursor++; + } + } + break lab1; + } + base.cursor = v_2; + if (!(base.out_grouping(g_v, 97, 249))) + { + break lab0; + } + lab9: { + var /** number */ v_6 = base.cursor; + lab10: { + if (!(base.out_grouping(g_v, 97, 249))) + { + break lab10; + } + golab11: while(true) + { + lab12: { + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab12; + } + break golab11; + } + if (base.cursor >= base.limit) + { + break lab10; + } + base.cursor++; + } + break lab9; + } + base.cursor = v_6; + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab0; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_pV = base.cursor; + } + base.cursor = v_1; + var /** number */ v_8 = base.cursor; + lab13: { + golab14: while(true) + { + lab15: { + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab15; + } + break golab14; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab16: while(true) + { + lab17: { + if (!(base.out_grouping(g_v, 97, 249))) + { + break lab17; + } + break golab16; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p1 = base.cursor; + golab18: while(true) + { + lab19: { + if (!(base.in_grouping(g_v, 97, 249))) + { + break lab19; + } + break golab18; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab20: while(true) + { + lab21: { + if (!(base.out_grouping(g_v, 97, 249))) + { + break lab21; + } + break golab20; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_8; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_1); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("i")) + { + return false; + } + break; + case 
2: + if (!base.slice_from("u")) + { + return false; + } + break; + case 3: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_attached_pronoun() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_2) == 0) + { + return false; + } + base.bra = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + return false; + } + if (!r_RV()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("ic"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 3: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("log")) + { + return false; + } + break; + case 4: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("u")) + { + return false; + } 
+ break; + case 5: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("ente")) + { + return false; + } + break; + case 6: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 7: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + switch (among_var) { + case 1: + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + break; + case 8: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + if (base.find_among_b(a_5) == 0) + { + base.cursor = base.limit - v_3; + break lab2; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_3; + break lab2; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 9: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_4; + break lab3; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_4; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("ic"))) + { + base.cursor = base.limit - v_4; + break lab3; + } + base.bra = base.cursor; + if (!r_R2()) + { 
+ base.cursor = base.limit - v_4; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_verb_suffix() { + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + if (base.find_among_b(a_7) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.limit_backward = v_2; + return true; + }; + + /** @return {boolean} */ + function r_vowel_suffix() { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.in_grouping_b(g_AEIO, 97, 242))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!r_RV()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("i"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!r_RV()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + } + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (!(base.eq_s_b("h"))) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra = base.cursor; + if (!(base.in_grouping_b(g_CG, 99, 103))) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!r_RV()) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ + function r_exceptions() { + base.bra = base.cursor; + if (!(base.eq_s("divano"))) + { + return false; + } + if (base.cursor < base.limit) + { + return false; + } + base.ket = base.cursor; + if (!base.slice_from("divan")) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ 
function() { + lab0: { + var /** number */ v_1 = base.cursor; + lab1: { + if (!r_exceptions()) + { + break lab1; + } + break lab0; + } + base.cursor = v_1; + var /** number */ v_2 = base.cursor; + r_prelude(); + base.cursor = v_2; + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_4 = base.limit - base.cursor; + r_attached_pronoun(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + lab2: { + lab3: { + var /** number */ v_6 = base.limit - base.cursor; + lab4: { + if (!r_standard_suffix()) + { + break lab4; + } + break lab3; + } + base.cursor = base.limit - v_6; + if (!r_verb_suffix()) + { + break lab2; + } + } + } + base.cursor = base.limit - v_5; + var /** number */ v_7 = base.limit - base.cursor; + r_vowel_suffix(); + base.cursor = base.limit - v_7; + base.cursor = base.limit_backward; + var /** number */ v_8 = base.cursor; + r_postlude(); + base.cursor = v_8; + } + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['ItalianStemmer'] = ItalianStemmer; diff --git a/js/lithuanian-stemmer.js b/js/lithuanian-stemmer.js new file mode 100644 index 0000000..4d1d886 --- /dev/null +++ b/js/lithuanian-stemmer.js @@ -0,0 +1,558 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var LithuanianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["a", -1, -1], + ["ia", 0, -1], + ["eria", 1, -1], + ["osna", 0, -1], + ["iosna", 3, -1], + ["uosna", 3, -1], + ["iuosna", 5, -1], + ["ysna", 0, -1], + ["\u0117sna", 0, -1], + ["e", -1, -1], + ["ie", 9, -1], + ["enie", 10, -1], + ["erie", 10, -1], + ["oje", 9, -1], + ["ioje", 13, -1], + ["uje", 9, -1], + ["iuje", 15, -1], + ["yje", 9, -1], + ["enyje", 17, -1], + ["eryje", 17, -1], + ["\u0117je", 9, -1], + ["ame", 9, -1], + ["iame", 21, -1], + ["sime", 9, 
-1], + ["ome", 9, -1], + ["\u0117me", 9, -1], + ["tum\u0117me", 25, -1], + ["ose", 9, -1], + ["iose", 27, -1], + ["uose", 27, -1], + ["iuose", 29, -1], + ["yse", 9, -1], + ["enyse", 31, -1], + ["eryse", 31, -1], + ["\u0117se", 9, -1], + ["ate", 9, -1], + ["iate", 35, -1], + ["ite", 9, -1], + ["kite", 37, -1], + ["site", 37, -1], + ["ote", 9, -1], + ["tute", 9, -1], + ["\u0117te", 9, -1], + ["tum\u0117te", 42, -1], + ["i", -1, -1], + ["ai", 44, -1], + ["iai", 45, -1], + ["eriai", 46, -1], + ["ei", 44, -1], + ["tumei", 48, -1], + ["ki", 44, -1], + ["imi", 44, -1], + ["erimi", 51, -1], + ["umi", 44, -1], + ["iumi", 53, -1], + ["si", 44, -1], + ["asi", 55, -1], + ["iasi", 56, -1], + ["esi", 55, -1], + ["iesi", 58, -1], + ["siesi", 59, -1], + ["isi", 55, -1], + ["aisi", 61, -1], + ["eisi", 61, -1], + ["tumeisi", 63, -1], + ["uisi", 61, -1], + ["osi", 55, -1], + ["\u0117josi", 66, -1], + ["uosi", 66, -1], + ["iuosi", 68, -1], + ["siuosi", 69, -1], + ["usi", 55, -1], + ["ausi", 71, -1], + ["\u010Diausi", 72, -1], + ["\u0105si", 55, -1], + ["\u0117si", 55, -1], + ["\u0173si", 55, -1], + ["t\u0173si", 76, -1], + ["ti", 44, -1], + ["enti", 78, -1], + ["inti", 78, -1], + ["oti", 78, -1], + ["ioti", 81, -1], + ["uoti", 81, -1], + ["iuoti", 83, -1], + ["auti", 78, -1], + ["iauti", 85, -1], + ["yti", 78, -1], + ["\u0117ti", 78, -1], + ["tel\u0117ti", 88, -1], + ["in\u0117ti", 88, -1], + ["ter\u0117ti", 88, -1], + ["ui", 44, -1], + ["iui", 92, -1], + ["eniui", 93, -1], + ["oj", -1, -1], + ["\u0117j", -1, -1], + ["k", -1, -1], + ["am", -1, -1], + ["iam", 98, -1], + ["iem", -1, -1], + ["im", -1, -1], + ["sim", 101, -1], + ["om", -1, -1], + ["tum", -1, -1], + ["\u0117m", -1, -1], + ["tum\u0117m", 105, -1], + ["an", -1, -1], + ["on", -1, -1], + ["ion", 108, -1], + ["un", -1, -1], + ["iun", 110, -1], + ["\u0117n", -1, -1], + ["o", -1, -1], + ["io", 113, -1], + ["enio", 114, -1], + ["\u0117jo", 113, -1], + ["uo", 113, -1], + ["s", -1, -1], + ["as", 118, -1], + ["ias", 119, -1], + 
["es", 118, -1], + ["ies", 121, -1], + ["is", 118, -1], + ["ais", 123, -1], + ["iais", 124, -1], + ["tumeis", 123, -1], + ["imis", 123, -1], + ["enimis", 127, -1], + ["omis", 123, -1], + ["iomis", 129, -1], + ["umis", 123, -1], + ["\u0117mis", 123, -1], + ["enis", 123, -1], + ["asis", 123, -1], + ["ysis", 123, -1], + ["ams", 118, -1], + ["iams", 136, -1], + ["iems", 118, -1], + ["ims", 118, -1], + ["enims", 139, -1], + ["erims", 139, -1], + ["oms", 118, -1], + ["ioms", 142, -1], + ["ums", 118, -1], + ["\u0117ms", 118, -1], + ["ens", 118, -1], + ["os", 118, -1], + ["ios", 147, -1], + ["uos", 147, -1], + ["iuos", 149, -1], + ["ers", 118, -1], + ["us", 118, -1], + ["aus", 152, -1], + ["iaus", 153, -1], + ["ius", 152, -1], + ["ys", 118, -1], + ["enys", 156, -1], + ["erys", 156, -1], + ["\u0105s", 118, -1], + ["i\u0105s", 159, -1], + ["\u0117s", 118, -1], + ["am\u0117s", 161, -1], + ["iam\u0117s", 162, -1], + ["im\u0117s", 161, -1], + ["kim\u0117s", 164, -1], + ["sim\u0117s", 164, -1], + ["om\u0117s", 161, -1], + ["\u0117m\u0117s", 161, -1], + ["tum\u0117m\u0117s", 168, -1], + ["at\u0117s", 161, -1], + ["iat\u0117s", 170, -1], + ["sit\u0117s", 161, -1], + ["ot\u0117s", 161, -1], + ["\u0117t\u0117s", 161, -1], + ["tum\u0117t\u0117s", 174, -1], + ["\u012Fs", 118, -1], + ["\u016Bs", 118, -1], + ["t\u0173s", 118, -1], + ["at", -1, -1], + ["iat", 179, -1], + ["it", -1, -1], + ["sit", 181, -1], + ["ot", -1, -1], + ["\u0117t", -1, -1], + ["tum\u0117t", 184, -1], + ["u", -1, -1], + ["au", 186, -1], + ["iau", 187, -1], + ["\u010Diau", 188, -1], + ["iu", 186, -1], + ["eniu", 190, -1], + ["siu", 190, -1], + ["y", -1, -1], + ["\u0105", -1, -1], + ["i\u0105", 194, -1], + ["\u0117", -1, -1], + ["\u0119", -1, -1], + ["\u012F", -1, -1], + ["en\u012F", 198, -1], + ["er\u012F", 198, -1], + ["\u0173", -1, -1], + ["i\u0173", 201, -1], + ["er\u0173", 201, -1] + ]; + + /** @const */ var a_1 = [ + ["ing", -1, -1], + ["aj", -1, -1], + ["iaj", 1, -1], + ["iej", -1, -1], + ["oj", -1, -1], + 
["ioj", 4, -1], + ["uoj", 4, -1], + ["iuoj", 6, -1], + ["auj", -1, -1], + ["\u0105j", -1, -1], + ["i\u0105j", 9, -1], + ["\u0117j", -1, -1], + ["\u0173j", -1, -1], + ["i\u0173j", 12, -1], + ["ok", -1, -1], + ["iok", 14, -1], + ["iuk", -1, -1], + ["uliuk", 16, -1], + ["u\u010Diuk", 16, -1], + ["i\u0161k", -1, -1], + ["iul", -1, -1], + ["yl", -1, -1], + ["\u0117l", -1, -1], + ["am", -1, -1], + ["dam", 23, -1], + ["jam", 23, -1], + ["zgan", -1, -1], + ["ain", -1, -1], + ["esn", -1, -1], + ["op", -1, -1], + ["iop", 29, -1], + ["ias", -1, -1], + ["ies", -1, -1], + ["ais", -1, -1], + ["iais", 33, -1], + ["os", -1, -1], + ["ios", 35, -1], + ["uos", 35, -1], + ["iuos", 37, -1], + ["aus", -1, -1], + ["iaus", 39, -1], + ["\u0105s", -1, -1], + ["i\u0105s", 41, -1], + ["\u0119s", -1, -1], + ["ut\u0117ait", -1, -1], + ["ant", -1, -1], + ["iant", 45, -1], + ["siant", 46, -1], + ["int", -1, -1], + ["ot", -1, -1], + ["uot", 49, -1], + ["iuot", 50, -1], + ["yt", -1, -1], + ["\u0117t", -1, -1], + ["yk\u0161t", -1, -1], + ["iau", -1, -1], + ["dav", -1, -1], + ["sv", -1, -1], + ["\u0161v", -1, -1], + ["yk\u0161\u010D", -1, -1], + ["\u0119", -1, -1], + ["\u0117j\u0119", 60, -1] + ]; + + /** @const */ var a_2 = [ + ["ojime", -1, 7], + ["\u0117jime", -1, 3], + ["avime", -1, 6], + ["okate", -1, 8], + ["aite", -1, 1], + ["uote", -1, 2], + ["asius", -1, 5], + ["okat\u0117s", -1, 8], + ["ait\u0117s", -1, 1], + ["uot\u0117s", -1, 2], + ["esiu", -1, 4] + ]; + + /** @const */ var a_3 = [ + ["\u010D", -1, 1], + ["d\u017E", -1, 2] + ]; + + /** @const */ var a_4 = [ + ["gd", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 64, 1, 0, 64, 0, 0, 0, 0, 0, 0, 0, 4, 4]; + + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_step1() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if 
(base.find_among_b(a_0) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_step2() { + while(true) + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (base.cursor < I_p1) + { + break lab0; + } + var /** number */ v_3 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + base.limit_backward = v_3; + break lab0; + } + base.bra = base.cursor; + base.limit_backward = v_3; + if (!base.slice_del()) + { + return false; + } + continue; + } + base.cursor = base.limit - v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_fix_conflicts() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("ait\u0117")) + { + return false; + } + break; + case 2: + if (!base.slice_from("uot\u0117")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u0117jimas")) + { + return false; + } + break; + case 4: + if (!base.slice_from("esys")) + { + return false; + } + break; + case 5: + if (!base.slice_from("asys")) + { + return false; + } + break; + case 6: + if (!base.slice_from("avimas")) + { + return false; + } + break; + case 7: + if (!base.slice_from("ojimas")) + { + return false; + } + break; + case 8: + if (!base.slice_from("okat\u0117")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_fix_chdz() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("t")) + { + return false; + } + break; + case 2: + if 
(!base.slice_from("d")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_fix_gd() { + base.ket = base.cursor; + if (base.find_among_b(a_4) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_from("g")) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + I_p1 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + var /** number */ v_2 = base.cursor; + lab1: { + var /** number */ v_3 = base.cursor; + if (!(base.eq_s("a"))) + { + base.cursor = v_2; + break lab1; + } + base.cursor = v_3; + if (base.current.length <= 6) + { + base.cursor = v_2; + break lab1; + } + { + var /** number */ c1 = base.cursor + 1; + if (c1 > base.limit) + { + base.cursor = v_2; + break lab1; + } + base.cursor = c1; + } + } + golab2: while(true) + { + lab3: { + if (!(base.in_grouping(g_v, 97, 371))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab4: while(true) + { + lab5: { + if (!(base.out_grouping(g_v, 97, 371))) + { + break lab5; + } + break golab4; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p1 = base.cursor; + } + base.cursor = v_1; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_6 = base.limit - base.cursor; + r_fix_conflicts(); + base.cursor = base.limit - v_6; + var /** number */ v_7 = base.limit - base.cursor; + r_step1(); + base.cursor = base.limit - v_7; + var /** number */ v_8 = base.limit - base.cursor; + r_fix_chdz(); + base.cursor = base.limit - v_8; + var /** number */ v_9 = base.limit - base.cursor; + r_step2(); + base.cursor = base.limit - v_9; + var /** number */ v_10 = base.limit - base.cursor; + r_fix_chdz(); + base.cursor = base.limit - v_10; + var /** number */ v_11 = base.limit - base.cursor; + r_fix_gd(); + base.cursor = base.limit - v_11; + base.cursor = base.limit_backward; + return true; 
+ }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['LithuanianStemmer'] = LithuanianStemmer; diff --git a/js/nepali-stemmer.js b/js/nepali-stemmer.js new file mode 100644 index 0000000..500c7bd --- /dev/null +++ b/js/nepali-stemmer.js @@ -0,0 +1,319 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var NepaliStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0932\u093E\u0907", -1, 1], + ["\u0932\u093E\u0908", -1, 1], + ["\u0938\u0901\u0917", -1, 1], + ["\u0938\u0902\u0917", -1, 1], + ["\u092E\u093E\u0930\u094D\u092B\u0924", -1, 1], + ["\u0930\u0924", -1, 1], + ["\u0915\u093E", -1, 2], + ["\u092E\u093E", -1, 1], + ["\u0926\u094D\u0935\u093E\u0930\u093E", -1, 1], + ["\u0915\u093F", -1, 2], + ["\u092A\u091B\u093F", -1, 1], + ["\u0915\u0940", -1, 2], + ["\u0932\u0947", -1, 1], + ["\u0915\u0948", -1, 2], + ["\u0938\u0901\u0917\u0948", -1, 1], + ["\u092E\u0948", -1, 1], + ["\u0915\u094B", -1, 2] + ]; + + /** @const */ var a_1 = [ + ["\u0901", -1, -1], + ["\u0902", -1, -1], + ["\u0948", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["\u0901", -1, 1], + ["\u0902", -1, 1], + ["\u0948", -1, 2] + ]; + + /** @const */ var a_3 = [ + ["\u0925\u093F\u090F", -1, 1], + ["\u091B", -1, 1], + ["\u0907\u091B", 1, 1], + ["\u090F\u091B", 1, 1], + ["\u093F\u091B", 1, 1], + ["\u0947\u091B", 1, 1], + ["\u0928\u0947\u091B", 5, 1], + ["\u0939\u0941\u0928\u0947\u091B", 6, 1], + ["\u0907\u0928\u094D\u091B", 1, 1], + ["\u093F\u0928\u094D\u091B", 1, 1], + ["\u0939\u0941\u0928\u094D\u091B", 1, 1], + ["\u090F\u0915\u093E", -1, 1], + ["\u0907\u090F\u0915\u093E", 11, 1], + ["\u093F\u090F\u0915\u093E", 11, 1], + ["\u0947\u0915\u093E", -1, 1], + ["\u0928\u0947\u0915\u093E", 14, 1], + ["\u0926\u093E", -1, 1], + ["\u0907\u0926\u093E", 16, 1], + ["\u093F\u0926\u093E", 16, 1], + ["\u0926\u0947\u0916\u093F", -1, 1], + 
["\u092E\u093E\u0925\u093F", -1, 1], + ["\u090F\u0915\u0940", -1, 1], + ["\u0907\u090F\u0915\u0940", 21, 1], + ["\u093F\u090F\u0915\u0940", 21, 1], + ["\u0947\u0915\u0940", -1, 1], + ["\u0926\u0947\u0916\u0940", -1, 1], + ["\u0925\u0940", -1, 1], + ["\u0926\u0940", -1, 1], + ["\u091B\u0941", -1, 1], + ["\u090F\u091B\u0941", 28, 1], + ["\u0947\u091B\u0941", 28, 1], + ["\u0928\u0947\u091B\u0941", 30, 1], + ["\u0928\u0941", -1, 1], + ["\u0939\u0930\u0941", -1, 1], + ["\u0939\u0930\u0942", -1, 1], + ["\u091B\u0947", -1, 1], + ["\u0925\u0947", -1, 1], + ["\u0928\u0947", -1, 1], + ["\u090F\u0915\u0948", -1, 1], + ["\u0947\u0915\u0948", -1, 1], + ["\u0928\u0947\u0915\u0948", 39, 1], + ["\u0926\u0948", -1, 1], + ["\u0907\u0926\u0948", 41, 1], + ["\u093F\u0926\u0948", 41, 1], + ["\u090F\u0915\u094B", -1, 1], + ["\u0907\u090F\u0915\u094B", 44, 1], + ["\u093F\u090F\u0915\u094B", 44, 1], + ["\u0947\u0915\u094B", -1, 1], + ["\u0928\u0947\u0915\u094B", 47, 1], + ["\u0926\u094B", -1, 1], + ["\u0907\u0926\u094B", 49, 1], + ["\u093F\u0926\u094B", 49, 1], + ["\u092F\u094B", -1, 1], + ["\u0907\u092F\u094B", 52, 1], + ["\u092D\u092F\u094B", 52, 1], + ["\u093F\u092F\u094B", 52, 1], + ["\u0925\u093F\u092F\u094B", 55, 1], + ["\u0926\u093F\u092F\u094B", 55, 1], + ["\u0925\u094D\u092F\u094B", 52, 1], + ["\u091B\u094C", -1, 1], + ["\u0907\u091B\u094C", 59, 1], + ["\u090F\u091B\u094C", 59, 1], + ["\u093F\u091B\u094C", 59, 1], + ["\u0947\u091B\u094C", 59, 1], + ["\u0928\u0947\u091B\u094C", 63, 1], + ["\u092F\u094C", -1, 1], + ["\u0925\u093F\u092F\u094C", 65, 1], + ["\u091B\u094D\u092F\u094C", 65, 1], + ["\u0925\u094D\u092F\u094C", 65, 1], + ["\u091B\u0928\u094D", -1, 1], + ["\u0907\u091B\u0928\u094D", 69, 1], + ["\u090F\u091B\u0928\u094D", 69, 1], + ["\u093F\u091B\u0928\u094D", 69, 1], + ["\u0947\u091B\u0928\u094D", 69, 1], + ["\u0928\u0947\u091B\u0928\u094D", 73, 1], + ["\u0932\u093E\u0928\u094D", -1, 1], + ["\u091B\u093F\u0928\u094D", -1, 1], + ["\u0925\u093F\u0928\u094D", -1, 1], + 
["\u092A\u0930\u094D", -1, 1], + ["\u0907\u0938\u094D", -1, 1], + ["\u0925\u093F\u0907\u0938\u094D", 79, 1], + ["\u091B\u0938\u094D", -1, 1], + ["\u0907\u091B\u0938\u094D", 81, 1], + ["\u090F\u091B\u0938\u094D", 81, 1], + ["\u093F\u091B\u0938\u094D", 81, 1], + ["\u0947\u091B\u0938\u094D", 81, 1], + ["\u0928\u0947\u091B\u0938\u094D", 85, 1], + ["\u093F\u0938\u094D", -1, 1], + ["\u0925\u093F\u0938\u094D", 87, 1], + ["\u091B\u0947\u0938\u094D", -1, 1], + ["\u0939\u094B\u0938\u094D", -1, 1] + ]; + + + + /** @return {boolean} */ + function r_remove_category_1() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + lab2: { + var /** number */ v_2 = base.limit - base.cursor; + lab3: { + if (!(base.eq_s_b("\u090F"))) + { + break lab3; + } + break lab2; + } + base.cursor = base.limit - v_2; + if (!(base.eq_s_b("\u0947"))) + { + break lab1; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!base.slice_del()) + { + return false; + } + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_check_category_2() { + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + return false; + } + base.bra = base.cursor; + return true; + }; + + /** @return {boolean} */ + function r_remove_category_2() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("\u092F\u094C"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + lab2: { + if (!(base.eq_s_b("\u091B\u094C"))) + { + break lab2; + } + break 
lab0; + } + base.cursor = base.limit - v_1; + lab3: { + if (!(base.eq_s_b("\u0928\u094C"))) + { + break lab3; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("\u0925\u0947"))) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.eq_s_b("\u0924\u094D\u0930"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_remove_category_3() { + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_1 = base.limit - base.cursor; + r_remove_category_1(); + base.cursor = base.limit - v_1; + var /** number */ v_2 = base.limit - base.cursor; + lab0: { + while(true) + { + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + var /** number */ v_5 = base.limit - base.cursor; + if (!r_check_category_2()) + { + break lab2; + } + base.cursor = base.limit - v_5; + if (!r_remove_category_2()) + { + break lab2; + } + } + base.cursor = base.limit - v_4; + if (!r_remove_category_3()) + { + break lab1; + } + continue; + } + base.cursor = base.limit - v_3; + break; + } + } + base.cursor = base.limit - v_2; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['NepaliStemmer'] = NepaliStemmer; diff --git a/js/norwegian-stemmer.js b/js/norwegian-stemmer.js new file mode 100644 index 0000000..a037992 --- /dev/null +++ b/js/norwegian-stemmer.js @@ -0,0 +1,263 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + 
+/**@constructor*/ +var NorwegianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["a", -1, 1], + ["e", -1, 1], + ["ede", 1, 1], + ["ande", 1, 1], + ["ende", 1, 1], + ["ane", 1, 1], + ["ene", 1, 1], + ["hetene", 6, 1], + ["erte", 1, 3], + ["en", -1, 1], + ["heten", 9, 1], + ["ar", -1, 1], + ["er", -1, 1], + ["heter", 12, 1], + ["s", -1, 2], + ["as", 14, 1], + ["es", 14, 1], + ["edes", 16, 1], + ["endes", 16, 1], + ["enes", 16, 1], + ["hetenes", 19, 1], + ["ens", 14, 1], + ["hetens", 21, 1], + ["ers", 14, 1], + ["ets", 14, 1], + ["et", -1, 1], + ["het", 25, 1], + ["ert", -1, 3], + ["ast", -1, 1] + ]; + + /** @const */ var a_1 = [ + ["dt", -1, -1], + ["vt", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["leg", -1, 1], + ["eleg", 0, 1], + ["ig", -1, 1], + ["eig", 2, 1], + ["lig", 2, 1], + ["elig", 4, 1], + ["els", -1, 1], + ["lov", -1, 1], + ["elov", 7, 1], + ["slov", 7, 1], + ["hetslov", 9, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128]; + + /** @const */ var /** Array */ g_s_ending = [119, 125, 149, 1]; + + var /** number */ I_x = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + var /** number */ v_1 = base.cursor; + { + var /** number */ c1 = base.cursor + 3; + if (c1 > base.limit) + { + return false; + } + base.cursor = c1; + } + I_x = base.cursor; + base.cursor = v_1; + golab0: while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + if (!(base.in_grouping(g_v, 97, 248))) + { + break lab1; + } + base.cursor = v_2; + break golab0; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_v, 97, 248))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + lab4: { + if (I_p1 >= I_x) + { + break 
lab4; + } + I_p1 = I_x; + } + return true; + }; + + /** @return {boolean} */ + function r_main_suffix() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + lab0: { + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + if (!(base.in_grouping_b(g_s_ending, 98, 122))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_3; + if (!(base.eq_s_b("k"))) + { + return false; + } + if (!(base.out_grouping_b(g_v, 97, 248))) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_from("er")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_consonant_pair() { + var /** number */ v_1 = base.limit - base.cursor; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_3 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + base.limit_backward = v_3; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_3; + base.cursor = base.limit - v_1; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_other_suffix() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + if (base.find_among_b(a_2) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + 
base.limit_backward = v_2; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_mark_regions(); + base.cursor = v_1; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_main_suffix(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_consonant_pair(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_other_suffix(); + base.cursor = base.limit - v_4; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['NorwegianStemmer'] = NorwegianStemmer; diff --git a/js/porter-stemmer.js b/js/porter-stemmer.js new file mode 100644 index 0000000..8a1fcc8 --- /dev/null +++ b/js/porter-stemmer.js @@ -0,0 +1,739 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var PorterStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["s", -1, 3], + ["ies", 0, 2], + ["sses", 0, 1], + ["ss", 0, -1] + ]; + + /** @const */ var a_1 = [ + ["", -1, 3], + ["bb", 0, 2], + ["dd", 0, 2], + ["ff", 0, 2], + ["gg", 0, 2], + ["bl", 0, 1], + ["mm", 0, 2], + ["nn", 0, 2], + ["pp", 0, 2], + ["rr", 0, 2], + ["at", 0, 1], + ["tt", 0, 2], + ["iz", 0, 1] + ]; + + /** @const */ var a_2 = [ + ["ed", -1, 2], + ["eed", 0, 1], + ["ing", -1, 2] + ]; + + /** @const */ var a_3 = [ + ["anci", -1, 3], + ["enci", -1, 2], + ["abli", -1, 4], + ["eli", -1, 6], + ["alli", -1, 9], + ["ousli", -1, 11], + ["entli", -1, 5], + ["aliti", -1, 9], + ["biliti", -1, 13], + ["iviti", -1, 12], + ["tional", -1, 1], + ["ational", 10, 8], + ["alism", -1, 9], + ["ation", -1, 8], + ["ization", 13, 7], + ["izer", -1, 7], + ["ator", -1, 8], + ["iveness", -1, 12], + 
["fulness", -1, 10], + ["ousness", -1, 11] + ]; + + /** @const */ var a_4 = [ + ["icate", -1, 2], + ["ative", -1, 3], + ["alize", -1, 1], + ["iciti", -1, 2], + ["ical", -1, 2], + ["ful", -1, 3], + ["ness", -1, 3] + ]; + + /** @const */ var a_5 = [ + ["ic", -1, 1], + ["ance", -1, 1], + ["ence", -1, 1], + ["able", -1, 1], + ["ible", -1, 1], + ["ate", -1, 1], + ["ive", -1, 1], + ["ize", -1, 1], + ["iti", -1, 1], + ["al", -1, 1], + ["ism", -1, 1], + ["ion", -1, 2], + ["er", -1, 1], + ["ous", -1, 1], + ["ant", -1, 1], + ["ent", -1, 1], + ["ment", 15, 1], + ["ement", 16, 1], + ["ou", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1]; + + /** @const */ var /** Array */ g_v_WXY = [1, 17, 65, 208, 1]; + + var /** boolean */ B_Y_found = false; + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_shortv() { + if (!(base.out_grouping_b(g_v_WXY, 89, 121))) + { + return false; + } + if (!(base.in_grouping_b(g_v, 97, 121))) + { + return false; + } + if (!(base.out_grouping_b(g_v, 97, 121))) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_Step_1a() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("ss")) + { + return false; + } + break; + case 2: + if (!base.slice_from("i")) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_1b() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + 
case 1: + if (!r_R1()) + { + return false; + } + if (!base.slice_from("ee")) + { + return false; + } + break; + case 2: + var /** number */ v_1 = base.limit - base.cursor; + golab0: while(true) + { + lab1: { + if (!(base.in_grouping_b(g_v, 97, 121))) + { + break lab1; + } + break golab0; + } + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + base.cursor = base.limit - v_1; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + among_var = base.find_among_b(a_1); + base.cursor = base.limit - v_3; + switch (among_var) { + case 1: + { + var /** number */ c1 = base.cursor; + base.insert(base.cursor, base.cursor, "e"); + base.cursor = c1; + } + break; + case 2: + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (base.cursor != I_p1) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + if (!r_shortv()) + { + return false; + } + base.cursor = base.limit - v_4; + { + var /** number */ c2 = base.cursor; + base.insert(base.cursor, base.cursor, "e"); + base.cursor = c2; + } + break; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_1c() { + base.ket = base.cursor; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("y"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("Y"))) + { + return false; + } + } + base.bra = base.cursor; + golab2: while(true) + { + lab3: { + if (!(base.in_grouping_b(g_v, 97, 121))) + { + break lab3; + } + break golab2; + } + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + if (!base.slice_from("i")) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_2() { + var /** number */ among_var; + 
base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("tion")) + { + return false; + } + break; + case 2: + if (!base.slice_from("ence")) + { + return false; + } + break; + case 3: + if (!base.slice_from("ance")) + { + return false; + } + break; + case 4: + if (!base.slice_from("able")) + { + return false; + } + break; + case 5: + if (!base.slice_from("ent")) + { + return false; + } + break; + case 6: + if (!base.slice_from("e")) + { + return false; + } + break; + case 7: + if (!base.slice_from("ize")) + { + return false; + } + break; + case 8: + if (!base.slice_from("ate")) + { + return false; + } + break; + case 9: + if (!base.slice_from("al")) + { + return false; + } + break; + case 10: + if (!base.slice_from("ful")) + { + return false; + } + break; + case 11: + if (!base.slice_from("ous")) + { + return false; + } + break; + case 12: + if (!base.slice_from("ive")) + { + return false; + } + break; + case 13: + if (!base.slice_from("ble")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_3() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("al")) + { + return false; + } + break; + case 2: + if (!base.slice_from("ic")) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_4() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R2()) + { + return false; + } + switch (among_var) { + case 1: 
+ if (!base.slice_del()) + { + return false; + } + break; + case 2: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("s"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("t"))) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_5a() { + base.ket = base.cursor; + if (!(base.eq_s_b("e"))) + { + return false; + } + base.bra = base.cursor; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!r_R2()) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!r_R1()) + { + return false; + } + { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + if (!r_shortv()) + { + break lab2; + } + return false; + } + base.cursor = base.limit - v_2; + } + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_5b() { + base.ket = base.cursor; + if (!(base.eq_s_b("l"))) + { + return false; + } + base.bra = base.cursor; + if (!r_R2()) + { + return false; + } + if (!(base.eq_s_b("l"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + B_Y_found = false; + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + if (!(base.eq_s("y"))) + { + break lab0; + } + base.ket = base.cursor; + if (!base.slice_from("Y")) + { + return false; + } + B_Y_found = true; + } + base.cursor = v_1; + var /** number */ v_2 = base.cursor; + lab1: { + while(true) + { + var /** number */ v_3 = base.cursor; + lab2: { + golab3: while(true) + { + var /** number */ v_4 = base.cursor; + lab4: { + if (!(base.in_grouping(g_v, 97, 121))) + { + break lab4; + } + base.bra = base.cursor; + if (!(base.eq_s("y"))) + { + break lab4; + } + base.ket = base.cursor; + base.cursor = v_4; + break golab3; 
+ } + base.cursor = v_4; + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + if (!base.slice_from("Y")) + { + return false; + } + B_Y_found = true; + continue; + } + base.cursor = v_3; + break; + } + } + base.cursor = v_2; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_5 = base.cursor; + lab5: { + golab6: while(true) + { + lab7: { + if (!(base.in_grouping(g_v, 97, 121))) + { + break lab7; + } + break golab6; + } + if (base.cursor >= base.limit) + { + break lab5; + } + base.cursor++; + } + golab8: while(true) + { + lab9: { + if (!(base.out_grouping(g_v, 97, 121))) + { + break lab9; + } + break golab8; + } + if (base.cursor >= base.limit) + { + break lab5; + } + base.cursor++; + } + I_p1 = base.cursor; + golab10: while(true) + { + lab11: { + if (!(base.in_grouping(g_v, 97, 121))) + { + break lab11; + } + break golab10; + } + if (base.cursor >= base.limit) + { + break lab5; + } + base.cursor++; + } + golab12: while(true) + { + lab13: { + if (!(base.out_grouping(g_v, 97, 121))) + { + break lab13; + } + break golab12; + } + if (base.cursor >= base.limit) + { + break lab5; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_5; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_10 = base.limit - base.cursor; + r_Step_1a(); + base.cursor = base.limit - v_10; + var /** number */ v_11 = base.limit - base.cursor; + r_Step_1b(); + base.cursor = base.limit - v_11; + var /** number */ v_12 = base.limit - base.cursor; + r_Step_1c(); + base.cursor = base.limit - v_12; + var /** number */ v_13 = base.limit - base.cursor; + r_Step_2(); + base.cursor = base.limit - v_13; + var /** number */ v_14 = base.limit - base.cursor; + r_Step_3(); + base.cursor = base.limit - v_14; + var /** number */ v_15 = base.limit - base.cursor; + r_Step_4(); + base.cursor = base.limit - v_15; + var /** number */ v_16 = base.limit - base.cursor; + r_Step_5a(); + base.cursor = base.limit - v_16; + var /** number */ 
v_17 = base.limit - base.cursor; + r_Step_5b(); + base.cursor = base.limit - v_17; + base.cursor = base.limit_backward; + var /** number */ v_18 = base.cursor; + lab14: { + if (!B_Y_found) + { + break lab14; + } + while(true) + { + var /** number */ v_19 = base.cursor; + lab15: { + golab16: while(true) + { + var /** number */ v_20 = base.cursor; + lab17: { + base.bra = base.cursor; + if (!(base.eq_s("Y"))) + { + break lab17; + } + base.ket = base.cursor; + base.cursor = v_20; + break golab16; + } + base.cursor = v_20; + if (base.cursor >= base.limit) + { + break lab15; + } + base.cursor++; + } + if (!base.slice_from("y")) + { + return false; + } + continue; + } + base.cursor = v_19; + break; + } + } + base.cursor = v_18; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['PorterStemmer'] = PorterStemmer; diff --git a/js/portuguese-stemmer.js b/js/portuguese-stemmer.js new file mode 100644 index 0000000..ef5bcfc --- /dev/null +++ b/js/portuguese-stemmer.js @@ -0,0 +1,896 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var PortugueseStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["", -1, 3], + ["\u00E3", 0, 1], + ["\u00F5", 0, 2] + ]; + + /** @const */ var a_1 = [ + ["", -1, 3], + ["a~", 0, 1], + ["o~", 0, 2] + ]; + + /** @const */ var a_2 = [ + ["ic", -1, -1], + ["ad", -1, -1], + ["os", -1, -1], + ["iv", -1, 1] + ]; + + /** @const */ var a_3 = [ + ["ante", -1, 1], + ["avel", -1, 1], + ["\u00EDvel", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["ic", -1, 1], + ["abil", -1, 1], + ["iv", -1, 1] + ]; + + /** @const */ var a_5 = [ + ["ica", -1, 1], + ["\u00E2ncia", -1, 1], + ["\u00EAncia", -1, 4], + ["logia", -1, 2], + ["ira", -1, 9], + ["adora", -1, 1], + ["osa", -1, 1], + ["ista", -1, 1], + ["iva", -1, 8], + ["eza", -1, 1], + ["idade", -1, 7], + ["ante", -1, 1], + 
["mente", -1, 6], + ["amente", 12, 5], + ["\u00E1vel", -1, 1], + ["\u00EDvel", -1, 1], + ["ico", -1, 1], + ["ismo", -1, 1], + ["oso", -1, 1], + ["amento", -1, 1], + ["imento", -1, 1], + ["ivo", -1, 8], + ["a\u00E7a~o", -1, 1], + ["u\u00E7a~o", -1, 3], + ["ador", -1, 1], + ["icas", -1, 1], + ["\u00EAncias", -1, 4], + ["logias", -1, 2], + ["iras", -1, 9], + ["adoras", -1, 1], + ["osas", -1, 1], + ["istas", -1, 1], + ["ivas", -1, 8], + ["ezas", -1, 1], + ["idades", -1, 7], + ["adores", -1, 1], + ["antes", -1, 1], + ["a\u00E7o~es", -1, 1], + ["u\u00E7o~es", -1, 3], + ["icos", -1, 1], + ["ismos", -1, 1], + ["osos", -1, 1], + ["amentos", -1, 1], + ["imentos", -1, 1], + ["ivos", -1, 8] + ]; + + /** @const */ var a_6 = [ + ["ada", -1, 1], + ["ida", -1, 1], + ["ia", -1, 1], + ["aria", 2, 1], + ["eria", 2, 1], + ["iria", 2, 1], + ["ara", -1, 1], + ["era", -1, 1], + ["ira", -1, 1], + ["ava", -1, 1], + ["asse", -1, 1], + ["esse", -1, 1], + ["isse", -1, 1], + ["aste", -1, 1], + ["este", -1, 1], + ["iste", -1, 1], + ["ei", -1, 1], + ["arei", 16, 1], + ["erei", 16, 1], + ["irei", 16, 1], + ["am", -1, 1], + ["iam", 20, 1], + ["ariam", 21, 1], + ["eriam", 21, 1], + ["iriam", 21, 1], + ["aram", 20, 1], + ["eram", 20, 1], + ["iram", 20, 1], + ["avam", 20, 1], + ["em", -1, 1], + ["arem", 29, 1], + ["erem", 29, 1], + ["irem", 29, 1], + ["assem", 29, 1], + ["essem", 29, 1], + ["issem", 29, 1], + ["ado", -1, 1], + ["ido", -1, 1], + ["ando", -1, 1], + ["endo", -1, 1], + ["indo", -1, 1], + ["ara~o", -1, 1], + ["era~o", -1, 1], + ["ira~o", -1, 1], + ["ar", -1, 1], + ["er", -1, 1], + ["ir", -1, 1], + ["as", -1, 1], + ["adas", 47, 1], + ["idas", 47, 1], + ["ias", 47, 1], + ["arias", 50, 1], + ["erias", 50, 1], + ["irias", 50, 1], + ["aras", 47, 1], + ["eras", 47, 1], + ["iras", 47, 1], + ["avas", 47, 1], + ["es", -1, 1], + ["ardes", 58, 1], + ["erdes", 58, 1], + ["irdes", 58, 1], + ["ares", 58, 1], + ["eres", 58, 1], + ["ires", 58, 1], + ["asses", 58, 1], + ["esses", 58, 1], + ["isses", 58, 
1], + ["astes", 58, 1], + ["estes", 58, 1], + ["istes", 58, 1], + ["is", -1, 1], + ["ais", 71, 1], + ["eis", 71, 1], + ["areis", 73, 1], + ["ereis", 73, 1], + ["ireis", 73, 1], + ["\u00E1reis", 73, 1], + ["\u00E9reis", 73, 1], + ["\u00EDreis", 73, 1], + ["\u00E1sseis", 73, 1], + ["\u00E9sseis", 73, 1], + ["\u00EDsseis", 73, 1], + ["\u00E1veis", 73, 1], + ["\u00EDeis", 73, 1], + ["ar\u00EDeis", 84, 1], + ["er\u00EDeis", 84, 1], + ["ir\u00EDeis", 84, 1], + ["ados", -1, 1], + ["idos", -1, 1], + ["amos", -1, 1], + ["\u00E1ramos", 90, 1], + ["\u00E9ramos", 90, 1], + ["\u00EDramos", 90, 1], + ["\u00E1vamos", 90, 1], + ["\u00EDamos", 90, 1], + ["ar\u00EDamos", 95, 1], + ["er\u00EDamos", 95, 1], + ["ir\u00EDamos", 95, 1], + ["emos", -1, 1], + ["aremos", 99, 1], + ["eremos", 99, 1], + ["iremos", 99, 1], + ["\u00E1ssemos", 99, 1], + ["\u00EAssemos", 99, 1], + ["\u00EDssemos", 99, 1], + ["imos", -1, 1], + ["armos", -1, 1], + ["ermos", -1, 1], + ["irmos", -1, 1], + ["\u00E1mos", -1, 1], + ["ar\u00E1s", -1, 1], + ["er\u00E1s", -1, 1], + ["ir\u00E1s", -1, 1], + ["eu", -1, 1], + ["iu", -1, 1], + ["ou", -1, 1], + ["ar\u00E1", -1, 1], + ["er\u00E1", -1, 1], + ["ir\u00E1", -1, 1] + ]; + + /** @const */ var a_7 = [ + ["a", -1, 1], + ["i", -1, 1], + ["o", -1, 1], + ["os", -1, 1], + ["\u00E1", -1, 1], + ["\u00ED", -1, 1], + ["\u00F3", -1, 1] + ]; + + /** @const */ var a_8 = [ + ["e", -1, 1], + ["\u00E7", -1, 2], + ["\u00E9", -1, 1], + ["\u00EA", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 19, 12, 2]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_prelude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("a~")) + { + return false; + } + break; 
+ case 2: + if (!base.slice_from("o~")) + { + return false; + } + break; + case 3: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab2; + } + lab3: { + var /** number */ v_3 = base.cursor; + lab4: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab4; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab4; + } + base.cursor++; + } + break lab3; + } + base.cursor = v_3; + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab2; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + } + break lab1; + } + base.cursor = v_2; + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab0; + } + lab9: { + var /** number */ v_6 = base.cursor; + lab10: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab10; + } + golab11: while(true) + { + lab12: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab12; + } + break golab11; + } + if (base.cursor >= base.limit) + { + break lab10; + } + base.cursor++; + } + break lab9; + } + base.cursor = v_6; + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab0; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_pV = base.cursor; + } + base.cursor = v_1; + var /** number */ v_8 = base.cursor; + lab13: { + golab14: while(true) + { + lab15: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab15; + } + break golab14; + } + if (base.cursor 
>= base.limit) + { + break lab13; + } + base.cursor++; + } + golab16: while(true) + { + lab17: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab17; + } + break golab16; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p1 = base.cursor; + golab18: while(true) + { + lab19: { + if (!(base.in_grouping(g_v, 97, 250))) + { + break lab19; + } + break golab18; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab20: while(true) + { + lab21: { + if (!(base.out_grouping(g_v, 97, 250))) + { + break lab21; + } + break golab20; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_8; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_1); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u00E3")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u00F5")) + { + return false; + } + break; + case 3: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if 
(!base.slice_from("log")) + { + return false; + } + break; + case 3: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("u")) + { + return false; + } + break; + case 4: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("ente")) + { + return false; + } + break; + case 5: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + switch (among_var) { + case 1: + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + break; + case 6: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 7: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + if (base.find_among_b(a_4) == 0) + { + base.cursor = base.limit - v_3; + break lab2; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_3; + break lab2; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 8: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + 
return false; + } + var /** number */ v_4 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_4; + break lab3; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_4; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 9: + if (!r_RV()) + { + return false; + } + if (!(base.eq_s_b("e"))) + { + return false; + } + if (!base.slice_from("ir")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_verb_suffix() { + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + if (base.find_among_b(a_6) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.limit_backward = v_2; + return true; + }; + + /** @return {boolean} */ + function r_residual_suffix() { + base.ket = base.cursor; + if (base.find_among_b(a_7) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_residual_form() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_8); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("u"))) + { + break lab1; + } + base.bra = base.cursor; + var /** number */ v_2 = base.limit - base.cursor; + if (!(base.eq_s_b("g"))) + { + break lab1; + } + base.cursor = base.limit - v_2; + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("i"))) + { + return false; + } 
+ base.bra = base.cursor; + var /** number */ v_3 = base.limit - base.cursor; + if (!(base.eq_s_b("c"))) + { + return false; + } + base.cursor = base.limit - v_3; + } + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("c")) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_prelude(); + base.cursor = v_1; + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + var /** number */ v_5 = base.limit - base.cursor; + lab3: { + var /** number */ v_6 = base.limit - base.cursor; + lab4: { + if (!r_standard_suffix()) + { + break lab4; + } + break lab3; + } + base.cursor = base.limit - v_6; + if (!r_verb_suffix()) + { + break lab2; + } + } + base.cursor = base.limit - v_5; + var /** number */ v_7 = base.limit - base.cursor; + lab5: { + base.ket = base.cursor; + if (!(base.eq_s_b("i"))) + { + break lab5; + } + base.bra = base.cursor; + var /** number */ v_8 = base.limit - base.cursor; + if (!(base.eq_s_b("c"))) + { + break lab5; + } + base.cursor = base.limit - v_8; + if (!r_RV()) + { + break lab5; + } + if (!base.slice_del()) + { + return false; + } + } + base.cursor = base.limit - v_7; + break lab1; + } + base.cursor = base.limit - v_4; + if (!r_residual_suffix()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_3; + var /** number */ v_9 = base.limit - base.cursor; + r_residual_form(); + base.cursor = base.limit - v_9; + base.cursor = base.limit_backward; + var /** number */ v_10 = base.cursor; + r_postlude(); + base.cursor = v_10; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + 
+window['PortugueseStemmer'] = PortugueseStemmer; diff --git a/js/romanian-stemmer.js b/js/romanian-stemmer.js new file mode 100644 index 0000000..995e3b6 --- /dev/null +++ b/js/romanian-stemmer.js @@ -0,0 +1,918 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var RomanianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u015F", -1, 1], + ["\u0163", -1, 2] + ]; + + /** @const */ var a_1 = [ + ["", -1, 3], + ["I", 0, 1], + ["U", 0, 2] + ]; + + /** @const */ var a_2 = [ + ["ea", -1, 3], + ["a\u021Bia", -1, 7], + ["aua", -1, 2], + ["iua", -1, 4], + ["a\u021Bie", -1, 7], + ["ele", -1, 3], + ["ile", -1, 5], + ["iile", 6, 4], + ["iei", -1, 4], + ["atei", -1, 6], + ["ii", -1, 4], + ["ului", -1, 1], + ["ul", -1, 1], + ["elor", -1, 3], + ["ilor", -1, 4], + ["iilor", 14, 4] + ]; + + /** @const */ var a_3 = [ + ["icala", -1, 4], + ["iciva", -1, 4], + ["ativa", -1, 5], + ["itiva", -1, 6], + ["icale", -1, 4], + ["a\u021Biune", -1, 5], + ["i\u021Biune", -1, 6], + ["atoare", -1, 5], + ["itoare", -1, 6], + ["\u0103toare", -1, 5], + ["icitate", -1, 4], + ["abilitate", -1, 1], + ["ibilitate", -1, 2], + ["ivitate", -1, 3], + ["icive", -1, 4], + ["ative", -1, 5], + ["itive", -1, 6], + ["icali", -1, 4], + ["atori", -1, 5], + ["icatori", 18, 4], + ["itori", -1, 6], + ["\u0103tori", -1, 5], + ["icitati", -1, 4], + ["abilitati", -1, 1], + ["ivitati", -1, 3], + ["icivi", -1, 4], + ["ativi", -1, 5], + ["itivi", -1, 6], + ["icit\u0103i", -1, 4], + ["abilit\u0103i", -1, 1], + ["ivit\u0103i", -1, 3], + ["icit\u0103\u021Bi", -1, 4], + ["abilit\u0103\u021Bi", -1, 1], + ["ivit\u0103\u021Bi", -1, 3], + ["ical", -1, 4], + ["ator", -1, 5], + ["icator", 35, 4], + ["itor", -1, 6], + ["\u0103tor", -1, 5], + ["iciv", -1, 4], + ["ativ", -1, 5], + ["itiv", -1, 6], + ["ical\u0103", -1, 4], + ["iciv\u0103", -1, 4], + ["ativ\u0103", -1, 5], + ["itiv\u0103", -1, 6] + ]; + + /** @const */ var a_4 = [ + ["ica", -1, 1], + ["abila", -1, 
1], + ["ibila", -1, 1], + ["oasa", -1, 1], + ["ata", -1, 1], + ["ita", -1, 1], + ["anta", -1, 1], + ["ista", -1, 3], + ["uta", -1, 1], + ["iva", -1, 1], + ["ic", -1, 1], + ["ice", -1, 1], + ["abile", -1, 1], + ["ibile", -1, 1], + ["isme", -1, 3], + ["iune", -1, 2], + ["oase", -1, 1], + ["ate", -1, 1], + ["itate", 17, 1], + ["ite", -1, 1], + ["ante", -1, 1], + ["iste", -1, 3], + ["ute", -1, 1], + ["ive", -1, 1], + ["ici", -1, 1], + ["abili", -1, 1], + ["ibili", -1, 1], + ["iuni", -1, 2], + ["atori", -1, 1], + ["osi", -1, 1], + ["ati", -1, 1], + ["itati", 30, 1], + ["iti", -1, 1], + ["anti", -1, 1], + ["isti", -1, 3], + ["uti", -1, 1], + ["i\u0219ti", -1, 3], + ["ivi", -1, 1], + ["it\u0103i", -1, 1], + ["o\u0219i", -1, 1], + ["it\u0103\u021Bi", -1, 1], + ["abil", -1, 1], + ["ibil", -1, 1], + ["ism", -1, 3], + ["ator", -1, 1], + ["os", -1, 1], + ["at", -1, 1], + ["it", -1, 1], + ["ant", -1, 1], + ["ist", -1, 3], + ["ut", -1, 1], + ["iv", -1, 1], + ["ic\u0103", -1, 1], + ["abil\u0103", -1, 1], + ["ibil\u0103", -1, 1], + ["oas\u0103", -1, 1], + ["at\u0103", -1, 1], + ["it\u0103", -1, 1], + ["ant\u0103", -1, 1], + ["ist\u0103", -1, 3], + ["ut\u0103", -1, 1], + ["iv\u0103", -1, 1] + ]; + + /** @const */ var a_5 = [ + ["ea", -1, 1], + ["ia", -1, 1], + ["esc", -1, 1], + ["\u0103sc", -1, 1], + ["ind", -1, 1], + ["\u00E2nd", -1, 1], + ["are", -1, 1], + ["ere", -1, 1], + ["ire", -1, 1], + ["\u00E2re", -1, 1], + ["se", -1, 2], + ["ase", 10, 1], + ["sese", 10, 2], + ["ise", 10, 1], + ["use", 10, 1], + ["\u00E2se", 10, 1], + ["e\u0219te", -1, 1], + ["\u0103\u0219te", -1, 1], + ["eze", -1, 1], + ["ai", -1, 1], + ["eai", 19, 1], + ["iai", 19, 1], + ["sei", -1, 2], + ["e\u0219ti", -1, 1], + ["\u0103\u0219ti", -1, 1], + ["ui", -1, 1], + ["ezi", -1, 1], + ["\u00E2i", -1, 1], + ["a\u0219i", -1, 1], + ["se\u0219i", -1, 2], + ["ase\u0219i", 29, 1], + ["sese\u0219i", 29, 2], + ["ise\u0219i", 29, 1], + ["use\u0219i", 29, 1], + ["\u00E2se\u0219i", 29, 1], + ["i\u0219i", -1, 1], + 
["u\u0219i", -1, 1], + ["\u00E2\u0219i", -1, 1], + ["a\u021Bi", -1, 2], + ["ea\u021Bi", 38, 1], + ["ia\u021Bi", 38, 1], + ["e\u021Bi", -1, 2], + ["i\u021Bi", -1, 2], + ["\u00E2\u021Bi", -1, 2], + ["ar\u0103\u021Bi", -1, 1], + ["ser\u0103\u021Bi", -1, 2], + ["aser\u0103\u021Bi", 45, 1], + ["seser\u0103\u021Bi", 45, 2], + ["iser\u0103\u021Bi", 45, 1], + ["user\u0103\u021Bi", 45, 1], + ["\u00E2ser\u0103\u021Bi", 45, 1], + ["ir\u0103\u021Bi", -1, 1], + ["ur\u0103\u021Bi", -1, 1], + ["\u00E2r\u0103\u021Bi", -1, 1], + ["am", -1, 1], + ["eam", 54, 1], + ["iam", 54, 1], + ["em", -1, 2], + ["asem", 57, 1], + ["sesem", 57, 2], + ["isem", 57, 1], + ["usem", 57, 1], + ["\u00E2sem", 57, 1], + ["im", -1, 2], + ["\u00E2m", -1, 2], + ["\u0103m", -1, 2], + ["ar\u0103m", 65, 1], + ["ser\u0103m", 65, 2], + ["aser\u0103m", 67, 1], + ["seser\u0103m", 67, 2], + ["iser\u0103m", 67, 1], + ["user\u0103m", 67, 1], + ["\u00E2ser\u0103m", 67, 1], + ["ir\u0103m", 65, 1], + ["ur\u0103m", 65, 1], + ["\u00E2r\u0103m", 65, 1], + ["au", -1, 1], + ["eau", 76, 1], + ["iau", 76, 1], + ["indu", -1, 1], + ["\u00E2ndu", -1, 1], + ["ez", -1, 1], + ["easc\u0103", -1, 1], + ["ar\u0103", -1, 1], + ["ser\u0103", -1, 2], + ["aser\u0103", 84, 1], + ["seser\u0103", 84, 2], + ["iser\u0103", 84, 1], + ["user\u0103", 84, 1], + ["\u00E2ser\u0103", 84, 1], + ["ir\u0103", -1, 1], + ["ur\u0103", -1, 1], + ["\u00E2r\u0103", -1, 1], + ["eaz\u0103", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["a", -1, 1], + ["e", -1, 1], + ["ie", 1, 1], + ["i", -1, 1], + ["\u0103", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 32, 0, 0, 4]; + + var /** boolean */ B_standard_suffix_removed = false; + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_norm() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + 
lab1: { + golab2: while(true) + { + var /** number */ v_3 = base.cursor; + lab3: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + if (among_var == 0) + { + break lab3; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u0219")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u021B")) + { + return false; + } + break; + } + base.cursor = v_3; + break golab2; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_prelude() { + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab2; + } + base.bra = base.cursor; + lab3: { + var /** number */ v_3 = base.cursor; + lab4: { + if (!(base.eq_s("u"))) + { + break lab4; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab4; + } + if (!base.slice_from("U")) + { + return false; + } + break lab3; + } + base.cursor = v_3; + if (!(base.eq_s("i"))) + { + break lab2; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab2; + } + if (!base.slice_from("I")) + { + return false; + } + } + base.cursor = v_2; + break golab1; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab2; + } + lab3: { + var /** number */ v_3 = base.cursor; + lab4: { + if (!(base.out_grouping(g_v, 97, 259))) + { 
+ break lab4; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab4; + } + base.cursor++; + } + break lab3; + } + base.cursor = v_3; + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab2; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 259))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + } + break lab1; + } + base.cursor = v_2; + if (!(base.out_grouping(g_v, 97, 259))) + { + break lab0; + } + lab9: { + var /** number */ v_6 = base.cursor; + lab10: { + if (!(base.out_grouping(g_v, 97, 259))) + { + break lab10; + } + golab11: while(true) + { + lab12: { + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab12; + } + break golab11; + } + if (base.cursor >= base.limit) + { + break lab10; + } + base.cursor++; + } + break lab9; + } + base.cursor = v_6; + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab0; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_pV = base.cursor; + } + base.cursor = v_1; + var /** number */ v_8 = base.cursor; + lab13: { + golab14: while(true) + { + lab15: { + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab15; + } + break golab14; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab16: while(true) + { + lab17: { + if (!(base.out_grouping(g_v, 97, 259))) + { + break lab17; + } + break golab16; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p1 = base.cursor; + golab18: while(true) + { + lab19: { + if (!(base.in_grouping(g_v, 97, 259))) + { + break lab19; + } + break golab18; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab20: while(true) + { + lab21: { + if (!(base.out_grouping(g_v, 97, 259))) + { + break lab21; + } + break golab20; + } + if (base.cursor >= 
base.limit) + { + break lab13; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_8; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_1); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("i")) + { + return false; + } + break; + case 2: + if (!base.slice_from("u")) + { + return false; + } + break; + case 3: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_step_0() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("a")) + { + return false; + } + break; + case 3: + if (!base.slice_from("e")) + { + return false; + } + break; + case 4: + if (!base.slice_from("i")) + { + return false; + } + break; + case 5: + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("ab"))) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_1; + } + if (!base.slice_from("i")) + { + return false; + } + break; + case 6: + if (!base.slice_from("at")) + { + return false; + } + break; + case 7: + if (!base.slice_from("a\u021Bi")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_combo_suffix() { + 
var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("abil")) + { + return false; + } + break; + case 2: + if (!base.slice_from("ibil")) + { + return false; + } + break; + case 3: + if (!base.slice_from("iv")) + { + return false; + } + break; + case 4: + if (!base.slice_from("ic")) + { + return false; + } + break; + case 5: + if (!base.slice_from("at")) + { + return false; + } + break; + case 6: + if (!base.slice_from("it")) + { + return false; + } + break; + } + B_standard_suffix_removed = true; + base.cursor = base.limit - v_1; + return true; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + B_standard_suffix_removed = false; + while(true) + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (!r_combo_suffix()) + { + break lab0; + } + continue; + } + base.cursor = base.limit - v_1; + break; + } + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R2()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.eq_s_b("\u021B"))) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_from("t")) + { + return false; + } + break; + case 3: + if (!base.slice_from("ist")) + { + return false; + } + break; + } + B_standard_suffix_removed = true; + return true; + }; + + /** @return {boolean} */ + function r_verb_suffix() { + var /** number */ among_var; + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + 
base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab0: { + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + if (!(base.out_grouping_b(g_v, 97, 259))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_3; + if (!(base.eq_s_b("u"))) + { + base.limit_backward = v_2; + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + base.limit_backward = v_2; + return true; + }; + + /** @return {boolean} */ + function r_vowel_suffix() { + base.ket = base.cursor; + if (base.find_among_b(a_6) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_norm(); + var /** number */ v_2 = base.cursor; + r_prelude(); + base.cursor = v_2; + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_4 = base.limit - base.cursor; + r_step_0(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + r_standard_suffix(); + base.cursor = base.limit - v_5; + var /** number */ v_6 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_7 = base.limit - base.cursor; + lab2: { + if (!B_standard_suffix_removed) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_7; + if (!r_verb_suffix()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_6; + var /** number */ v_8 = base.limit - base.cursor; + r_vowel_suffix(); + base.cursor = base.limit - v_8; + base.cursor = base.limit_backward; + var /** number */ v_9 = base.cursor; + r_postlude(); + base.cursor = v_9; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + 
+window['RomanianStemmer'] = RomanianStemmer; diff --git a/js/russian-stemmer.js b/js/russian-stemmer.js new file mode 100644 index 0000000..1f2aafb --- /dev/null +++ b/js/russian-stemmer.js @@ -0,0 +1,622 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var RussianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0432", -1, 1], + ["\u0438\u0432", 0, 2], + ["\u044B\u0432", 0, 2], + ["\u0432\u0448\u0438", -1, 1], + ["\u0438\u0432\u0448\u0438", 3, 2], + ["\u044B\u0432\u0448\u0438", 3, 2], + ["\u0432\u0448\u0438\u0441\u044C", -1, 1], + ["\u0438\u0432\u0448\u0438\u0441\u044C", 6, 2], + ["\u044B\u0432\u0448\u0438\u0441\u044C", 6, 2] + ]; + + /** @const */ var a_1 = [ + ["\u0435\u0435", -1, 1], + ["\u0438\u0435", -1, 1], + ["\u043E\u0435", -1, 1], + ["\u044B\u0435", -1, 1], + ["\u0438\u043C\u0438", -1, 1], + ["\u044B\u043C\u0438", -1, 1], + ["\u0435\u0439", -1, 1], + ["\u0438\u0439", -1, 1], + ["\u043E\u0439", -1, 1], + ["\u044B\u0439", -1, 1], + ["\u0435\u043C", -1, 1], + ["\u0438\u043C", -1, 1], + ["\u043E\u043C", -1, 1], + ["\u044B\u043C", -1, 1], + ["\u0435\u0433\u043E", -1, 1], + ["\u043E\u0433\u043E", -1, 1], + ["\u0435\u043C\u0443", -1, 1], + ["\u043E\u043C\u0443", -1, 1], + ["\u0438\u0445", -1, 1], + ["\u044B\u0445", -1, 1], + ["\u0435\u044E", -1, 1], + ["\u043E\u044E", -1, 1], + ["\u0443\u044E", -1, 1], + ["\u044E\u044E", -1, 1], + ["\u0430\u044F", -1, 1], + ["\u044F\u044F", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["\u0435\u043C", -1, 1], + ["\u043D\u043D", -1, 1], + ["\u0432\u0448", -1, 1], + ["\u0438\u0432\u0448", 2, 2], + ["\u044B\u0432\u0448", 2, 2], + ["\u0449", -1, 1], + ["\u044E\u0449", 5, 1], + ["\u0443\u044E\u0449", 6, 2] + ]; + + /** @const */ var a_3 = [ + ["\u0441\u044C", -1, 1], + ["\u0441\u044F", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["\u043B\u0430", -1, 1], + ["\u0438\u043B\u0430", 0, 2], + ["\u044B\u043B\u0430", 0, 2], + ["\u043D\u0430", -1, 1], + 
["\u0435\u043D\u0430", 3, 2], + ["\u0435\u0442\u0435", -1, 1], + ["\u0438\u0442\u0435", -1, 2], + ["\u0439\u0442\u0435", -1, 1], + ["\u0435\u0439\u0442\u0435", 7, 2], + ["\u0443\u0439\u0442\u0435", 7, 2], + ["\u043B\u0438", -1, 1], + ["\u0438\u043B\u0438", 10, 2], + ["\u044B\u043B\u0438", 10, 2], + ["\u0439", -1, 1], + ["\u0435\u0439", 13, 2], + ["\u0443\u0439", 13, 2], + ["\u043B", -1, 1], + ["\u0438\u043B", 16, 2], + ["\u044B\u043B", 16, 2], + ["\u0435\u043C", -1, 1], + ["\u0438\u043C", -1, 2], + ["\u044B\u043C", -1, 2], + ["\u043D", -1, 1], + ["\u0435\u043D", 22, 2], + ["\u043B\u043E", -1, 1], + ["\u0438\u043B\u043E", 24, 2], + ["\u044B\u043B\u043E", 24, 2], + ["\u043D\u043E", -1, 1], + ["\u0435\u043D\u043E", 27, 2], + ["\u043D\u043D\u043E", 27, 1], + ["\u0435\u0442", -1, 1], + ["\u0443\u0435\u0442", 30, 2], + ["\u0438\u0442", -1, 2], + ["\u044B\u0442", -1, 2], + ["\u044E\u0442", -1, 1], + ["\u0443\u044E\u0442", 34, 2], + ["\u044F\u0442", -1, 2], + ["\u043D\u044B", -1, 1], + ["\u0435\u043D\u044B", 37, 2], + ["\u0442\u044C", -1, 1], + ["\u0438\u0442\u044C", 39, 2], + ["\u044B\u0442\u044C", 39, 2], + ["\u0435\u0448\u044C", -1, 1], + ["\u0438\u0448\u044C", -1, 2], + ["\u044E", -1, 2], + ["\u0443\u044E", 44, 2] + ]; + + /** @const */ var a_5 = [ + ["\u0430", -1, 1], + ["\u0435\u0432", -1, 1], + ["\u043E\u0432", -1, 1], + ["\u0435", -1, 1], + ["\u0438\u0435", 3, 1], + ["\u044C\u0435", 3, 1], + ["\u0438", -1, 1], + ["\u0435\u0438", 6, 1], + ["\u0438\u0438", 6, 1], + ["\u0430\u043C\u0438", 6, 1], + ["\u044F\u043C\u0438", 6, 1], + ["\u0438\u044F\u043C\u0438", 10, 1], + ["\u0439", -1, 1], + ["\u0435\u0439", 12, 1], + ["\u0438\u0435\u0439", 13, 1], + ["\u0438\u0439", 12, 1], + ["\u043E\u0439", 12, 1], + ["\u0430\u043C", -1, 1], + ["\u0435\u043C", -1, 1], + ["\u0438\u0435\u043C", 18, 1], + ["\u043E\u043C", -1, 1], + ["\u044F\u043C", -1, 1], + ["\u0438\u044F\u043C", 21, 1], + ["\u043E", -1, 1], + ["\u0443", -1, 1], + ["\u0430\u0445", -1, 1], + ["\u044F\u0445", -1, 1], + 
["\u0438\u044F\u0445", 26, 1], + ["\u044B", -1, 1], + ["\u044C", -1, 1], + ["\u044E", -1, 1], + ["\u0438\u044E", 30, 1], + ["\u044C\u044E", 30, 1], + ["\u044F", -1, 1], + ["\u0438\u044F", 33, 1], + ["\u044C\u044F", 33, 1] + ]; + + /** @const */ var a_6 = [ + ["\u043E\u0441\u0442", -1, 1], + ["\u043E\u0441\u0442\u044C", -1, 1] + ]; + + /** @const */ var a_7 = [ + ["\u0435\u0439\u0448\u0435", -1, 1], + ["\u043D", -1, 2], + ["\u0435\u0439\u0448", -1, 1], + ["\u044C", -1, 3] + ]; + + /** @const */ var /** Array */ g_v = [33, 65, 8, 232]; + + var /** number */ I_p2 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + lab2: { + if (!(base.in_grouping(g_v, 1072, 1103))) + { + break lab2; + } + break golab1; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_pV = base.cursor; + golab3: while(true) + { + lab4: { + if (!(base.out_grouping(g_v, 1072, 1103))) + { + break lab4; + } + break golab3; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 1072, 1103))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 1072, 1103))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_perfective_gerund() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab0: { + 
var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("\u0430"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("\u044F"))) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_adjective() { + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_adjectival() { + var /** number */ among_var; + if (!r_adjective()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + if (!(base.eq_s_b("\u0430"))) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_2; + if (!(base.eq_s_b("\u044F"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + } + return true; + }; + + /** @return {boolean} */ + function r_reflexive() { + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_verb() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if 
(!(base.eq_s_b("\u0430"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("\u044F"))) + { + return false; + } + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_noun() { + base.ket = base.cursor; + if (base.find_among_b(a_5) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_derivational() { + base.ket = base.cursor; + if (base.find_among_b(a_6) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_tidy_up() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_7); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!(base.eq_s_b("\u043D"))) + { + return false; + } + base.bra = base.cursor; + if (!(base.eq_s_b("\u043D"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.eq_s_b("\u043D"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + golab2: while(true) + { + var /** number */ v_3 = base.cursor; + lab3: { + base.bra = base.cursor; + if (!(base.eq_s("\u0451"))) + { + break lab3; + } + base.ket = base.cursor; + base.cursor = v_3; + break golab2; + } + base.cursor = v_3; + if (base.cursor >= 
base.limit) + { + break lab1; + } + base.cursor++; + } + if (!base.slice_from("\u0435")) + { + return false; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_6 = base.limit_backward; + base.limit_backward = I_pV; + var /** number */ v_7 = base.limit - base.cursor; + lab4: { + lab5: { + var /** number */ v_8 = base.limit - base.cursor; + lab6: { + if (!r_perfective_gerund()) + { + break lab6; + } + break lab5; + } + base.cursor = base.limit - v_8; + var /** number */ v_9 = base.limit - base.cursor; + lab7: { + if (!r_reflexive()) + { + base.cursor = base.limit - v_9; + break lab7; + } + } + lab8: { + var /** number */ v_10 = base.limit - base.cursor; + lab9: { + if (!r_adjectival()) + { + break lab9; + } + break lab8; + } + base.cursor = base.limit - v_10; + lab10: { + if (!r_verb()) + { + break lab10; + } + break lab8; + } + base.cursor = base.limit - v_10; + if (!r_noun()) + { + break lab4; + } + } + } + } + base.cursor = base.limit - v_7; + var /** number */ v_11 = base.limit - base.cursor; + lab11: { + base.ket = base.cursor; + if (!(base.eq_s_b("\u0438"))) + { + base.cursor = base.limit - v_11; + break lab11; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + var /** number */ v_12 = base.limit - base.cursor; + r_derivational(); + base.cursor = base.limit - v_12; + var /** number */ v_13 = base.limit - base.cursor; + r_tidy_up(); + base.cursor = base.limit - v_13; + base.limit_backward = v_6; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['RussianStemmer'] = RussianStemmer; diff --git a/js/serbian-stemmer.js b/js/serbian-stemmer.js new file mode 100644 index 0000000..f7c8c56 --- 
/dev/null +++ b/js/serbian-stemmer.js @@ -0,0 +1,4557 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var SerbianStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0430", -1, 1], + ["\u0431", -1, 2], + ["\u0432", -1, 3], + ["\u0433", -1, 4], + ["\u0434", -1, 5], + ["\u0435", -1, 7], + ["\u0436", -1, 8], + ["\u0437", -1, 9], + ["\u0438", -1, 10], + ["\u043A", -1, 12], + ["\u043B", -1, 13], + ["\u043C", -1, 15], + ["\u043D", -1, 16], + ["\u043E", -1, 18], + ["\u043F", -1, 19], + ["\u0440", -1, 20], + ["\u0441", -1, 21], + ["\u0442", -1, 22], + ["\u0443", -1, 24], + ["\u0444", -1, 25], + ["\u0445", -1, 26], + ["\u0446", -1, 27], + ["\u0447", -1, 28], + ["\u0448", -1, 30], + ["\u0452", -1, 6], + ["\u0458", -1, 11], + ["\u0459", -1, 14], + ["\u045A", -1, 17], + ["\u045B", -1, 23], + ["\u045F", -1, 29] + ]; + + /** @const */ var a_1 = [ + ["daba", -1, 73], + ["ajaca", -1, 12], + ["ejaca", -1, 14], + ["ljaca", -1, 13], + ["njaca", -1, 85], + ["ojaca", -1, 15], + ["alaca", -1, 82], + ["elaca", -1, 83], + ["olaca", -1, 84], + ["maca", -1, 75], + ["naca", -1, 76], + ["raca", -1, 81], + ["saca", -1, 80], + ["vaca", -1, 79], + ["\u0161aca", -1, 18], + ["aoca", -1, 82], + ["acaka", -1, 55], + ["ajaka", -1, 16], + ["ojaka", -1, 17], + ["anaka", -1, 78], + ["ataka", -1, 58], + ["etaka", -1, 59], + ["itaka", -1, 60], + ["otaka", -1, 61], + ["utaka", -1, 62], + ["a\u010Daka", -1, 54], + ["esama", -1, 67], + ["izama", -1, 87], + ["jacima", -1, 5], + ["nicima", -1, 23], + ["ticima", -1, 24], + ["teticima", 30, 21], + ["zicima", -1, 25], + ["atcima", -1, 58], + ["utcima", -1, 62], + ["\u010Dcima", -1, 74], + ["pesima", -1, 2], + ["inzima", -1, 19], + ["lozima", -1, 1], + ["metara", -1, 68], + ["centara", -1, 69], + ["istara", -1, 70], + ["ekata", -1, 86], + ["anata", -1, 53], + ["nstava", -1, 22], + ["kustava", -1, 29], + ["ajac", -1, 12], + ["ejac", -1, 14], + ["ljac", -1, 13], + ["njac", -1, 85], + ["anjac", 
49, 11], + ["ojac", -1, 15], + ["alac", -1, 82], + ["elac", -1, 83], + ["olac", -1, 84], + ["mac", -1, 75], + ["nac", -1, 76], + ["rac", -1, 81], + ["sac", -1, 80], + ["vac", -1, 79], + ["\u0161ac", -1, 18], + ["jebe", -1, 88], + ["olce", -1, 84], + ["kuse", -1, 27], + ["rave", -1, 42], + ["save", -1, 52], + ["\u0161ave", -1, 51], + ["baci", -1, 89], + ["jaci", -1, 5], + ["tvenici", -1, 20], + ["snici", -1, 26], + ["tetici", -1, 21], + ["bojci", -1, 4], + ["vojci", -1, 3], + ["ojsci", -1, 66], + ["atci", -1, 58], + ["itci", -1, 60], + ["utci", -1, 62], + ["\u010Dci", -1, 74], + ["pesi", -1, 2], + ["inzi", -1, 19], + ["lozi", -1, 1], + ["acak", -1, 55], + ["usak", -1, 57], + ["atak", -1, 58], + ["etak", -1, 59], + ["itak", -1, 60], + ["otak", -1, 61], + ["utak", -1, 62], + ["a\u010Dak", -1, 54], + ["u\u0161ak", -1, 56], + ["izam", -1, 87], + ["tican", -1, 65], + ["cajan", -1, 7], + ["\u010Dajan", -1, 6], + ["voljan", -1, 77], + ["eskan", -1, 63], + ["alan", -1, 40], + ["bilan", -1, 33], + ["gilan", -1, 37], + ["nilan", -1, 39], + ["rilan", -1, 38], + ["silan", -1, 36], + ["tilan", -1, 34], + ["avilan", -1, 35], + ["laran", -1, 9], + ["eran", -1, 8], + ["asan", -1, 91], + ["esan", -1, 10], + ["dusan", -1, 31], + ["kusan", -1, 28], + ["atan", -1, 47], + ["pletan", -1, 50], + ["tetan", -1, 49], + ["antan", -1, 32], + ["pravan", -1, 44], + ["stavan", -1, 43], + ["sivan", -1, 46], + ["tivan", -1, 45], + ["ozan", -1, 41], + ["ti\u010Dan", -1, 64], + ["a\u0161an", -1, 90], + ["du\u0161an", -1, 30], + ["metar", -1, 68], + ["centar", -1, 69], + ["istar", -1, 70], + ["ekat", -1, 86], + ["enat", -1, 48], + ["oscu", -1, 72], + ["o\u0161\u0107u", -1, 71] + ]; + + /** @const */ var a_2 = [ + ["aca", -1, 124], + ["eca", -1, 125], + ["uca", -1, 126], + ["ga", -1, 20], + ["acega", 3, 124], + ["ecega", 3, 125], + ["ucega", 3, 126], + ["anjijega", 3, 84], + ["enjijega", 3, 85], + ["snjijega", 3, 122], + ["\u0161njijega", 3, 86], + ["kijega", 3, 95], + ["skijega", 11, 1], + 
["\u0161kijega", 11, 2], + ["elijega", 3, 83], + ["nijega", 3, 13], + ["osijega", 3, 123], + ["atijega", 3, 120], + ["evitijega", 3, 92], + ["ovitijega", 3, 93], + ["astijega", 3, 94], + ["avijega", 3, 77], + ["evijega", 3, 78], + ["ivijega", 3, 79], + ["ovijega", 3, 80], + ["o\u0161ijega", 3, 91], + ["anjega", 3, 84], + ["enjega", 3, 85], + ["snjega", 3, 122], + ["\u0161njega", 3, 86], + ["kega", 3, 95], + ["skega", 30, 1], + ["\u0161kega", 30, 2], + ["elega", 3, 83], + ["nega", 3, 13], + ["anega", 34, 10], + ["enega", 34, 87], + ["snega", 34, 159], + ["\u0161nega", 34, 88], + ["osega", 3, 123], + ["atega", 3, 120], + ["evitega", 3, 92], + ["ovitega", 3, 93], + ["astega", 3, 94], + ["avega", 3, 77], + ["evega", 3, 78], + ["ivega", 3, 79], + ["ovega", 3, 80], + ["a\u0107ega", 3, 14], + ["e\u0107ega", 3, 15], + ["u\u0107ega", 3, 16], + ["o\u0161ega", 3, 91], + ["acoga", 3, 124], + ["ecoga", 3, 125], + ["ucoga", 3, 126], + ["anjoga", 3, 84], + ["enjoga", 3, 85], + ["snjoga", 3, 122], + ["\u0161njoga", 3, 86], + ["koga", 3, 95], + ["skoga", 59, 1], + ["\u0161koga", 59, 2], + ["loga", 3, 19], + ["eloga", 62, 83], + ["noga", 3, 13], + ["cinoga", 64, 137], + ["\u010Dinoga", 64, 89], + ["osoga", 3, 123], + ["atoga", 3, 120], + ["evitoga", 3, 92], + ["ovitoga", 3, 93], + ["astoga", 3, 94], + ["avoga", 3, 77], + ["evoga", 3, 78], + ["ivoga", 3, 79], + ["ovoga", 3, 80], + ["a\u0107oga", 3, 14], + ["e\u0107oga", 3, 15], + ["u\u0107oga", 3, 16], + ["o\u0161oga", 3, 91], + ["uga", 3, 18], + ["aja", -1, 109], + ["caja", 81, 26], + ["laja", 81, 30], + ["raja", 81, 31], + ["\u0107aja", 81, 28], + ["\u010Daja", 81, 27], + ["\u0111aja", 81, 29], + ["bija", -1, 32], + ["cija", -1, 33], + ["dija", -1, 34], + ["fija", -1, 40], + ["gija", -1, 39], + ["anjija", -1, 84], + ["enjija", -1, 85], + ["snjija", -1, 122], + ["\u0161njija", -1, 86], + ["kija", -1, 95], + ["skija", 97, 1], + ["\u0161kija", 97, 2], + ["lija", -1, 24], + ["elija", 100, 83], + ["mija", -1, 37], + ["nija", -1, 13], + 
["ganija", 103, 9], + ["manija", 103, 6], + ["panija", 103, 7], + ["ranija", 103, 8], + ["tanija", 103, 5], + ["pija", -1, 41], + ["rija", -1, 42], + ["rarija", 110, 21], + ["sija", -1, 23], + ["osija", 112, 123], + ["tija", -1, 44], + ["atija", 114, 120], + ["evitija", 114, 92], + ["ovitija", 114, 93], + ["otija", 114, 22], + ["astija", 114, 94], + ["avija", -1, 77], + ["evija", -1, 78], + ["ivija", -1, 79], + ["ovija", -1, 80], + ["zija", -1, 45], + ["o\u0161ija", -1, 91], + ["\u017Eija", -1, 38], + ["anja", -1, 84], + ["enja", -1, 85], + ["snja", -1, 122], + ["\u0161nja", -1, 86], + ["ka", -1, 95], + ["ska", 131, 1], + ["\u0161ka", 131, 2], + ["ala", -1, 104], + ["acala", 134, 128], + ["astajala", 134, 106], + ["istajala", 134, 107], + ["ostajala", 134, 108], + ["ijala", 134, 47], + ["injala", 134, 114], + ["nala", 134, 46], + ["irala", 134, 100], + ["urala", 134, 105], + ["tala", 134, 113], + ["astala", 144, 110], + ["istala", 144, 111], + ["ostala", 144, 112], + ["avala", 134, 97], + ["evala", 134, 96], + ["ivala", 134, 98], + ["ovala", 134, 76], + ["uvala", 134, 99], + ["a\u010Dala", 134, 102], + ["ela", -1, 83], + ["ila", -1, 116], + ["acila", 155, 124], + ["lucila", 155, 121], + ["nila", 155, 103], + ["astanila", 158, 110], + ["istanila", 158, 111], + ["ostanila", 158, 112], + ["rosila", 155, 127], + ["jetila", 155, 118], + ["ozila", 155, 48], + ["a\u010Dila", 155, 101], + ["lu\u010Dila", 155, 117], + ["ro\u0161ila", 155, 90], + ["ola", -1, 50], + ["asla", -1, 115], + ["nula", -1, 13], + ["gama", -1, 20], + ["logama", 171, 19], + ["ugama", 171, 18], + ["ajama", -1, 109], + ["cajama", 174, 26], + ["lajama", 174, 30], + ["rajama", 174, 31], + ["\u0107ajama", 174, 28], + ["\u010Dajama", 174, 27], + ["\u0111ajama", 174, 29], + ["bijama", -1, 32], + ["cijama", -1, 33], + ["dijama", -1, 34], + ["fijama", -1, 40], + ["gijama", -1, 39], + ["lijama", -1, 35], + ["mijama", -1, 37], + ["nijama", -1, 36], + ["ganijama", 188, 9], + ["manijama", 188, 6], + ["panijama", 
188, 7], + ["ranijama", 188, 8], + ["tanijama", 188, 5], + ["pijama", -1, 41], + ["rijama", -1, 42], + ["sijama", -1, 43], + ["tijama", -1, 44], + ["zijama", -1, 45], + ["\u017Eijama", -1, 38], + ["alama", -1, 104], + ["ijalama", 200, 47], + ["nalama", 200, 46], + ["elama", -1, 119], + ["ilama", -1, 116], + ["ramama", -1, 52], + ["lemama", -1, 51], + ["inama", -1, 11], + ["cinama", 207, 137], + ["\u010Dinama", 207, 89], + ["rama", -1, 52], + ["arama", 210, 53], + ["drama", 210, 54], + ["erama", 210, 55], + ["orama", 210, 56], + ["basama", -1, 135], + ["gasama", -1, 131], + ["jasama", -1, 129], + ["kasama", -1, 133], + ["nasama", -1, 132], + ["tasama", -1, 130], + ["vasama", -1, 134], + ["esama", -1, 152], + ["isama", -1, 154], + ["etama", -1, 70], + ["estama", -1, 71], + ["istama", -1, 72], + ["kstama", -1, 73], + ["ostama", -1, 74], + ["avama", -1, 77], + ["evama", -1, 78], + ["ivama", -1, 79], + ["ba\u0161ama", -1, 63], + ["ga\u0161ama", -1, 64], + ["ja\u0161ama", -1, 61], + ["ka\u0161ama", -1, 62], + ["na\u0161ama", -1, 60], + ["ta\u0161ama", -1, 59], + ["va\u0161ama", -1, 65], + ["e\u0161ama", -1, 66], + ["i\u0161ama", -1, 67], + ["lema", -1, 51], + ["acima", -1, 124], + ["ecima", -1, 125], + ["ucima", -1, 126], + ["ajima", -1, 109], + ["cajima", 245, 26], + ["lajima", 245, 30], + ["rajima", 245, 31], + ["\u0107ajima", 245, 28], + ["\u010Dajima", 245, 27], + ["\u0111ajima", 245, 29], + ["bijima", -1, 32], + ["cijima", -1, 33], + ["dijima", -1, 34], + ["fijima", -1, 40], + ["gijima", -1, 39], + ["anjijima", -1, 84], + ["enjijima", -1, 85], + ["snjijima", -1, 122], + ["\u0161njijima", -1, 86], + ["kijima", -1, 95], + ["skijima", 261, 1], + ["\u0161kijima", 261, 2], + ["lijima", -1, 35], + ["elijima", 264, 83], + ["mijima", -1, 37], + ["nijima", -1, 13], + ["ganijima", 267, 9], + ["manijima", 267, 6], + ["panijima", 267, 7], + ["ranijima", 267, 8], + ["tanijima", 267, 5], + ["pijima", -1, 41], + ["rijima", -1, 42], + ["sijima", -1, 43], + ["osijima", 275, 123], + 
["tijima", -1, 44], + ["atijima", 277, 120], + ["evitijima", 277, 92], + ["ovitijima", 277, 93], + ["astijima", 277, 94], + ["avijima", -1, 77], + ["evijima", -1, 78], + ["ivijima", -1, 79], + ["ovijima", -1, 80], + ["zijima", -1, 45], + ["o\u0161ijima", -1, 91], + ["\u017Eijima", -1, 38], + ["anjima", -1, 84], + ["enjima", -1, 85], + ["snjima", -1, 122], + ["\u0161njima", -1, 86], + ["kima", -1, 95], + ["skima", 293, 1], + ["\u0161kima", 293, 2], + ["alima", -1, 104], + ["ijalima", 296, 47], + ["nalima", 296, 46], + ["elima", -1, 83], + ["ilima", -1, 116], + ["ozilima", 300, 48], + ["olima", -1, 50], + ["lemima", -1, 51], + ["nima", -1, 13], + ["anima", 304, 10], + ["inima", 304, 11], + ["cinima", 306, 137], + ["\u010Dinima", 306, 89], + ["onima", 304, 12], + ["arima", -1, 53], + ["drima", -1, 54], + ["erima", -1, 55], + ["orima", -1, 56], + ["basima", -1, 135], + ["gasima", -1, 131], + ["jasima", -1, 129], + ["kasima", -1, 133], + ["nasima", -1, 132], + ["tasima", -1, 130], + ["vasima", -1, 134], + ["esima", -1, 57], + ["isima", -1, 58], + ["osima", -1, 123], + ["atima", -1, 120], + ["ikatima", 324, 68], + ["latima", 324, 69], + ["etima", -1, 70], + ["evitima", -1, 92], + ["ovitima", -1, 93], + ["astima", -1, 94], + ["estima", -1, 71], + ["istima", -1, 72], + ["kstima", -1, 73], + ["ostima", -1, 74], + ["i\u0161tima", -1, 75], + ["avima", -1, 77], + ["evima", -1, 78], + ["ajevima", 337, 109], + ["cajevima", 338, 26], + ["lajevima", 338, 30], + ["rajevima", 338, 31], + ["\u0107ajevima", 338, 28], + ["\u010Dajevima", 338, 27], + ["\u0111ajevima", 338, 29], + ["ivima", -1, 79], + ["ovima", -1, 80], + ["govima", 346, 20], + ["ugovima", 347, 17], + ["lovima", 346, 82], + ["olovima", 349, 49], + ["movima", 346, 81], + ["onovima", 346, 12], + ["stvima", -1, 3], + ["\u0161tvima", -1, 4], + ["a\u0107ima", -1, 14], + ["e\u0107ima", -1, 15], + ["u\u0107ima", -1, 16], + ["ba\u0161ima", -1, 63], + ["ga\u0161ima", -1, 64], + ["ja\u0161ima", -1, 61], + ["ka\u0161ima", -1, 62], 
+ ["na\u0161ima", -1, 60], + ["ta\u0161ima", -1, 59], + ["va\u0161ima", -1, 65], + ["e\u0161ima", -1, 66], + ["i\u0161ima", -1, 67], + ["o\u0161ima", -1, 91], + ["na", -1, 13], + ["ana", 368, 10], + ["acana", 369, 128], + ["urana", 369, 105], + ["tana", 369, 113], + ["avana", 369, 97], + ["evana", 369, 96], + ["ivana", 369, 98], + ["uvana", 369, 99], + ["a\u010Dana", 369, 102], + ["acena", 368, 124], + ["lucena", 368, 121], + ["a\u010Dena", 368, 101], + ["lu\u010Dena", 368, 117], + ["ina", 368, 11], + ["cina", 382, 137], + ["anina", 382, 10], + ["\u010Dina", 382, 89], + ["ona", 368, 12], + ["ara", -1, 53], + ["dra", -1, 54], + ["era", -1, 55], + ["ora", -1, 56], + ["basa", -1, 135], + ["gasa", -1, 131], + ["jasa", -1, 129], + ["kasa", -1, 133], + ["nasa", -1, 132], + ["tasa", -1, 130], + ["vasa", -1, 134], + ["esa", -1, 57], + ["isa", -1, 58], + ["osa", -1, 123], + ["ata", -1, 120], + ["ikata", 401, 68], + ["lata", 401, 69], + ["eta", -1, 70], + ["evita", -1, 92], + ["ovita", -1, 93], + ["asta", -1, 94], + ["esta", -1, 71], + ["ista", -1, 72], + ["ksta", -1, 73], + ["osta", -1, 74], + ["nuta", -1, 13], + ["i\u0161ta", -1, 75], + ["ava", -1, 77], + ["eva", -1, 78], + ["ajeva", 415, 109], + ["cajeva", 416, 26], + ["lajeva", 416, 30], + ["rajeva", 416, 31], + ["\u0107ajeva", 416, 28], + ["\u010Dajeva", 416, 27], + ["\u0111ajeva", 416, 29], + ["iva", -1, 79], + ["ova", -1, 80], + ["gova", 424, 20], + ["ugova", 425, 17], + ["lova", 424, 82], + ["olova", 427, 49], + ["mova", 424, 81], + ["onova", 424, 12], + ["stva", -1, 3], + ["\u0161tva", -1, 4], + ["a\u0107a", -1, 14], + ["e\u0107a", -1, 15], + ["u\u0107a", -1, 16], + ["ba\u0161a", -1, 63], + ["ga\u0161a", -1, 64], + ["ja\u0161a", -1, 61], + ["ka\u0161a", -1, 62], + ["na\u0161a", -1, 60], + ["ta\u0161a", -1, 59], + ["va\u0161a", -1, 65], + ["e\u0161a", -1, 66], + ["i\u0161a", -1, 67], + ["o\u0161a", -1, 91], + ["ace", -1, 124], + ["ece", -1, 125], + ["uce", -1, 126], + ["luce", 448, 121], + ["astade", -1, 110], + 
["istade", -1, 111], + ["ostade", -1, 112], + ["ge", -1, 20], + ["loge", 453, 19], + ["uge", 453, 18], + ["aje", -1, 104], + ["caje", 456, 26], + ["laje", 456, 30], + ["raje", 456, 31], + ["astaje", 456, 106], + ["istaje", 456, 107], + ["ostaje", 456, 108], + ["\u0107aje", 456, 28], + ["\u010Daje", 456, 27], + ["\u0111aje", 456, 29], + ["ije", -1, 116], + ["bije", 466, 32], + ["cije", 466, 33], + ["dije", 466, 34], + ["fije", 466, 40], + ["gije", 466, 39], + ["anjije", 466, 84], + ["enjije", 466, 85], + ["snjije", 466, 122], + ["\u0161njije", 466, 86], + ["kije", 466, 95], + ["skije", 476, 1], + ["\u0161kije", 476, 2], + ["lije", 466, 35], + ["elije", 479, 83], + ["mije", 466, 37], + ["nije", 466, 13], + ["ganije", 482, 9], + ["manije", 482, 6], + ["panije", 482, 7], + ["ranije", 482, 8], + ["tanije", 482, 5], + ["pije", 466, 41], + ["rije", 466, 42], + ["sije", 466, 43], + ["osije", 490, 123], + ["tije", 466, 44], + ["atije", 492, 120], + ["evitije", 492, 92], + ["ovitije", 492, 93], + ["astije", 492, 94], + ["avije", 466, 77], + ["evije", 466, 78], + ["ivije", 466, 79], + ["ovije", 466, 80], + ["zije", 466, 45], + ["o\u0161ije", 466, 91], + ["\u017Eije", 466, 38], + ["anje", -1, 84], + ["enje", -1, 85], + ["snje", -1, 122], + ["\u0161nje", -1, 86], + ["uje", -1, 25], + ["lucuje", 508, 121], + ["iruje", 508, 100], + ["lu\u010Duje", 508, 117], + ["ke", -1, 95], + ["ske", 512, 1], + ["\u0161ke", 512, 2], + ["ale", -1, 104], + ["acale", 515, 128], + ["astajale", 515, 106], + ["istajale", 515, 107], + ["ostajale", 515, 108], + ["ijale", 515, 47], + ["injale", 515, 114], + ["nale", 515, 46], + ["irale", 515, 100], + ["urale", 515, 105], + ["tale", 515, 113], + ["astale", 525, 110], + ["istale", 525, 111], + ["ostale", 525, 112], + ["avale", 515, 97], + ["evale", 515, 96], + ["ivale", 515, 98], + ["ovale", 515, 76], + ["uvale", 515, 99], + ["a\u010Dale", 515, 102], + ["ele", -1, 83], + ["ile", -1, 116], + ["acile", 536, 124], + ["lucile", 536, 121], + ["nile", 536, 
103], + ["rosile", 536, 127], + ["jetile", 536, 118], + ["ozile", 536, 48], + ["a\u010Dile", 536, 101], + ["lu\u010Dile", 536, 117], + ["ro\u0161ile", 536, 90], + ["ole", -1, 50], + ["asle", -1, 115], + ["nule", -1, 13], + ["rame", -1, 52], + ["leme", -1, 51], + ["acome", -1, 124], + ["ecome", -1, 125], + ["ucome", -1, 126], + ["anjome", -1, 84], + ["enjome", -1, 85], + ["snjome", -1, 122], + ["\u0161njome", -1, 86], + ["kome", -1, 95], + ["skome", 558, 1], + ["\u0161kome", 558, 2], + ["elome", -1, 83], + ["nome", -1, 13], + ["cinome", 562, 137], + ["\u010Dinome", 562, 89], + ["osome", -1, 123], + ["atome", -1, 120], + ["evitome", -1, 92], + ["ovitome", -1, 93], + ["astome", -1, 94], + ["avome", -1, 77], + ["evome", -1, 78], + ["ivome", -1, 79], + ["ovome", -1, 80], + ["a\u0107ome", -1, 14], + ["e\u0107ome", -1, 15], + ["u\u0107ome", -1, 16], + ["o\u0161ome", -1, 91], + ["ne", -1, 13], + ["ane", 578, 10], + ["acane", 579, 128], + ["urane", 579, 105], + ["tane", 579, 113], + ["astane", 582, 110], + ["istane", 582, 111], + ["ostane", 582, 112], + ["avane", 579, 97], + ["evane", 579, 96], + ["ivane", 579, 98], + ["uvane", 579, 99], + ["a\u010Dane", 579, 102], + ["acene", 578, 124], + ["lucene", 578, 121], + ["a\u010Dene", 578, 101], + ["lu\u010Dene", 578, 117], + ["ine", 578, 11], + ["cine", 595, 137], + ["anine", 595, 10], + ["\u010Dine", 595, 89], + ["one", 578, 12], + ["are", -1, 53], + ["dre", -1, 54], + ["ere", -1, 55], + ["ore", -1, 56], + ["ase", -1, 161], + ["base", 604, 135], + ["acase", 604, 128], + ["gase", 604, 131], + ["jase", 604, 129], + ["astajase", 608, 138], + ["istajase", 608, 139], + ["ostajase", 608, 140], + ["injase", 608, 150], + ["kase", 604, 133], + ["nase", 604, 132], + ["irase", 604, 155], + ["urase", 604, 156], + ["tase", 604, 130], + ["vase", 604, 134], + ["avase", 618, 144], + ["evase", 618, 145], + ["ivase", 618, 146], + ["ovase", 618, 148], + ["uvase", 618, 147], + ["ese", -1, 57], + ["ise", -1, 58], + ["acise", 625, 124], + ["lucise", 
625, 121], + ["rosise", 625, 127], + ["jetise", 625, 149], + ["ose", -1, 123], + ["astadose", 630, 141], + ["istadose", 630, 142], + ["ostadose", 630, 143], + ["ate", -1, 104], + ["acate", 634, 128], + ["ikate", 634, 68], + ["late", 634, 69], + ["irate", 634, 100], + ["urate", 634, 105], + ["tate", 634, 113], + ["avate", 634, 97], + ["evate", 634, 96], + ["ivate", 634, 98], + ["uvate", 634, 99], + ["a\u010Date", 634, 102], + ["ete", -1, 70], + ["astadete", 646, 110], + ["istadete", 646, 111], + ["ostadete", 646, 112], + ["astajete", 646, 106], + ["istajete", 646, 107], + ["ostajete", 646, 108], + ["ijete", 646, 116], + ["injete", 646, 114], + ["ujete", 646, 25], + ["lucujete", 655, 121], + ["irujete", 655, 100], + ["lu\u010Dujete", 655, 117], + ["nete", 646, 13], + ["astanete", 659, 110], + ["istanete", 659, 111], + ["ostanete", 659, 112], + ["astete", 646, 115], + ["ite", -1, 116], + ["acite", 664, 124], + ["lucite", 664, 121], + ["nite", 664, 13], + ["astanite", 667, 110], + ["istanite", 667, 111], + ["ostanite", 667, 112], + ["rosite", 664, 127], + ["jetite", 664, 118], + ["astite", 664, 115], + ["evite", 664, 92], + ["ovite", 664, 93], + ["a\u010Dite", 664, 101], + ["lu\u010Dite", 664, 117], + ["ro\u0161ite", 664, 90], + ["ajte", -1, 104], + ["urajte", 679, 105], + ["tajte", 679, 113], + ["astajte", 681, 106], + ["istajte", 681, 107], + ["ostajte", 681, 108], + ["avajte", 679, 97], + ["evajte", 679, 96], + ["ivajte", 679, 98], + ["uvajte", 679, 99], + ["ijte", -1, 116], + ["lucujte", -1, 121], + ["irujte", -1, 100], + ["lu\u010Dujte", -1, 117], + ["aste", -1, 94], + ["acaste", 693, 128], + ["astajaste", 693, 106], + ["istajaste", 693, 107], + ["ostajaste", 693, 108], + ["injaste", 693, 114], + ["iraste", 693, 100], + ["uraste", 693, 105], + ["taste", 693, 113], + ["avaste", 693, 97], + ["evaste", 693, 96], + ["ivaste", 693, 98], + ["ovaste", 693, 76], + ["uvaste", 693, 99], + ["a\u010Daste", 693, 102], + ["este", -1, 71], + ["iste", -1, 72], + ["aciste", 709, 
124], + ["luciste", 709, 121], + ["niste", 709, 103], + ["rosiste", 709, 127], + ["jetiste", 709, 118], + ["a\u010Diste", 709, 101], + ["lu\u010Diste", 709, 117], + ["ro\u0161iste", 709, 90], + ["kste", -1, 73], + ["oste", -1, 74], + ["astadoste", 719, 110], + ["istadoste", 719, 111], + ["ostadoste", 719, 112], + ["nuste", -1, 13], + ["i\u0161te", -1, 75], + ["ave", -1, 77], + ["eve", -1, 78], + ["ajeve", 726, 109], + ["cajeve", 727, 26], + ["lajeve", 727, 30], + ["rajeve", 727, 31], + ["\u0107ajeve", 727, 28], + ["\u010Dajeve", 727, 27], + ["\u0111ajeve", 727, 29], + ["ive", -1, 79], + ["ove", -1, 80], + ["gove", 735, 20], + ["ugove", 736, 17], + ["love", 735, 82], + ["olove", 738, 49], + ["move", 735, 81], + ["onove", 735, 12], + ["a\u0107e", -1, 14], + ["e\u0107e", -1, 15], + ["u\u0107e", -1, 16], + ["a\u010De", -1, 101], + ["lu\u010De", -1, 117], + ["a\u0161e", -1, 104], + ["ba\u0161e", 747, 63], + ["ga\u0161e", 747, 64], + ["ja\u0161e", 747, 61], + ["astaja\u0161e", 750, 106], + ["istaja\u0161e", 750, 107], + ["ostaja\u0161e", 750, 108], + ["inja\u0161e", 750, 114], + ["ka\u0161e", 747, 62], + ["na\u0161e", 747, 60], + ["ira\u0161e", 747, 100], + ["ura\u0161e", 747, 105], + ["ta\u0161e", 747, 59], + ["va\u0161e", 747, 65], + ["ava\u0161e", 760, 97], + ["eva\u0161e", 760, 96], + ["iva\u0161e", 760, 98], + ["ova\u0161e", 760, 76], + ["uva\u0161e", 760, 99], + ["a\u010Da\u0161e", 747, 102], + ["e\u0161e", -1, 66], + ["i\u0161e", -1, 67], + ["jeti\u0161e", 768, 118], + ["a\u010Di\u0161e", 768, 101], + ["lu\u010Di\u0161e", 768, 117], + ["ro\u0161i\u0161e", 768, 90], + ["o\u0161e", -1, 91], + ["astado\u0161e", 773, 110], + ["istado\u0161e", 773, 111], + ["ostado\u0161e", 773, 112], + ["aceg", -1, 124], + ["eceg", -1, 125], + ["uceg", -1, 126], + ["anjijeg", -1, 84], + ["enjijeg", -1, 85], + ["snjijeg", -1, 122], + ["\u0161njijeg", -1, 86], + ["kijeg", -1, 95], + ["skijeg", 784, 1], + ["\u0161kijeg", 784, 2], + ["elijeg", -1, 83], + ["nijeg", -1, 13], + ["osijeg", 
-1, 123], + ["atijeg", -1, 120], + ["evitijeg", -1, 92], + ["ovitijeg", -1, 93], + ["astijeg", -1, 94], + ["avijeg", -1, 77], + ["evijeg", -1, 78], + ["ivijeg", -1, 79], + ["ovijeg", -1, 80], + ["o\u0161ijeg", -1, 91], + ["anjeg", -1, 84], + ["enjeg", -1, 85], + ["snjeg", -1, 122], + ["\u0161njeg", -1, 86], + ["keg", -1, 95], + ["eleg", -1, 83], + ["neg", -1, 13], + ["aneg", 805, 10], + ["eneg", 805, 87], + ["sneg", 805, 159], + ["\u0161neg", 805, 88], + ["oseg", -1, 123], + ["ateg", -1, 120], + ["aveg", -1, 77], + ["eveg", -1, 78], + ["iveg", -1, 79], + ["oveg", -1, 80], + ["a\u0107eg", -1, 14], + ["e\u0107eg", -1, 15], + ["u\u0107eg", -1, 16], + ["o\u0161eg", -1, 91], + ["acog", -1, 124], + ["ecog", -1, 125], + ["ucog", -1, 126], + ["anjog", -1, 84], + ["enjog", -1, 85], + ["snjog", -1, 122], + ["\u0161njog", -1, 86], + ["kog", -1, 95], + ["skog", 827, 1], + ["\u0161kog", 827, 2], + ["elog", -1, 83], + ["nog", -1, 13], + ["cinog", 831, 137], + ["\u010Dinog", 831, 89], + ["osog", -1, 123], + ["atog", -1, 120], + ["evitog", -1, 92], + ["ovitog", -1, 93], + ["astog", -1, 94], + ["avog", -1, 77], + ["evog", -1, 78], + ["ivog", -1, 79], + ["ovog", -1, 80], + ["a\u0107og", -1, 14], + ["e\u0107og", -1, 15], + ["u\u0107og", -1, 16], + ["o\u0161og", -1, 91], + ["ah", -1, 104], + ["acah", 847, 128], + ["astajah", 847, 106], + ["istajah", 847, 107], + ["ostajah", 847, 108], + ["injah", 847, 114], + ["irah", 847, 100], + ["urah", 847, 105], + ["tah", 847, 113], + ["avah", 847, 97], + ["evah", 847, 96], + ["ivah", 847, 98], + ["ovah", 847, 76], + ["uvah", 847, 99], + ["a\u010Dah", 847, 102], + ["ih", -1, 116], + ["acih", 862, 124], + ["ecih", 862, 125], + ["ucih", 862, 126], + ["lucih", 865, 121], + ["anjijih", 862, 84], + ["enjijih", 862, 85], + ["snjijih", 862, 122], + ["\u0161njijih", 862, 86], + ["kijih", 862, 95], + ["skijih", 871, 1], + ["\u0161kijih", 871, 2], + ["elijih", 862, 83], + ["nijih", 862, 13], + ["osijih", 862, 123], + ["atijih", 862, 120], + ["evitijih", 
862, 92], + ["ovitijih", 862, 93], + ["astijih", 862, 94], + ["avijih", 862, 77], + ["evijih", 862, 78], + ["ivijih", 862, 79], + ["ovijih", 862, 80], + ["o\u0161ijih", 862, 91], + ["anjih", 862, 84], + ["enjih", 862, 85], + ["snjih", 862, 122], + ["\u0161njih", 862, 86], + ["kih", 862, 95], + ["skih", 890, 1], + ["\u0161kih", 890, 2], + ["elih", 862, 83], + ["nih", 862, 13], + ["cinih", 894, 137], + ["\u010Dinih", 894, 89], + ["osih", 862, 123], + ["rosih", 897, 127], + ["atih", 862, 120], + ["jetih", 862, 118], + ["evitih", 862, 92], + ["ovitih", 862, 93], + ["astih", 862, 94], + ["avih", 862, 77], + ["evih", 862, 78], + ["ivih", 862, 79], + ["ovih", 862, 80], + ["a\u0107ih", 862, 14], + ["e\u0107ih", 862, 15], + ["u\u0107ih", 862, 16], + ["a\u010Dih", 862, 101], + ["lu\u010Dih", 862, 117], + ["o\u0161ih", 862, 91], + ["ro\u0161ih", 913, 90], + ["astadoh", -1, 110], + ["istadoh", -1, 111], + ["ostadoh", -1, 112], + ["acuh", -1, 124], + ["ecuh", -1, 125], + ["ucuh", -1, 126], + ["a\u0107uh", -1, 14], + ["e\u0107uh", -1, 15], + ["u\u0107uh", -1, 16], + ["aci", -1, 124], + ["aceci", -1, 124], + ["ieci", -1, 162], + ["ajuci", -1, 161], + ["irajuci", 927, 155], + ["urajuci", 927, 156], + ["astajuci", 927, 138], + ["istajuci", 927, 139], + ["ostajuci", 927, 140], + ["avajuci", 927, 144], + ["evajuci", 927, 145], + ["ivajuci", 927, 146], + ["uvajuci", 927, 147], + ["ujuci", -1, 157], + ["lucujuci", 937, 121], + ["irujuci", 937, 155], + ["luci", -1, 121], + ["nuci", -1, 164], + ["etuci", -1, 153], + ["astuci", -1, 136], + ["gi", -1, 20], + ["ugi", 944, 18], + ["aji", -1, 109], + ["caji", 946, 26], + ["laji", 946, 30], + ["raji", 946, 31], + ["\u0107aji", 946, 28], + ["\u010Daji", 946, 27], + ["\u0111aji", 946, 29], + ["biji", -1, 32], + ["ciji", -1, 33], + ["diji", -1, 34], + ["fiji", -1, 40], + ["giji", -1, 39], + ["anjiji", -1, 84], + ["enjiji", -1, 85], + ["snjiji", -1, 122], + ["\u0161njiji", -1, 86], + ["kiji", -1, 95], + ["skiji", 962, 1], + ["\u0161kiji", 962, 2], 
+ ["liji", -1, 35], + ["eliji", 965, 83], + ["miji", -1, 37], + ["niji", -1, 13], + ["ganiji", 968, 9], + ["maniji", 968, 6], + ["paniji", 968, 7], + ["raniji", 968, 8], + ["taniji", 968, 5], + ["piji", -1, 41], + ["riji", -1, 42], + ["siji", -1, 43], + ["osiji", 976, 123], + ["tiji", -1, 44], + ["atiji", 978, 120], + ["evitiji", 978, 92], + ["ovitiji", 978, 93], + ["astiji", 978, 94], + ["aviji", -1, 77], + ["eviji", -1, 78], + ["iviji", -1, 79], + ["oviji", -1, 80], + ["ziji", -1, 45], + ["o\u0161iji", -1, 91], + ["\u017Eiji", -1, 38], + ["anji", -1, 84], + ["enji", -1, 85], + ["snji", -1, 122], + ["\u0161nji", -1, 86], + ["ki", -1, 95], + ["ski", 994, 1], + ["\u0161ki", 994, 2], + ["ali", -1, 104], + ["acali", 997, 128], + ["astajali", 997, 106], + ["istajali", 997, 107], + ["ostajali", 997, 108], + ["ijali", 997, 47], + ["injali", 997, 114], + ["nali", 997, 46], + ["irali", 997, 100], + ["urali", 997, 105], + ["tali", 997, 113], + ["astali", 1007, 110], + ["istali", 1007, 111], + ["ostali", 1007, 112], + ["avali", 997, 97], + ["evali", 997, 96], + ["ivali", 997, 98], + ["ovali", 997, 76], + ["uvali", 997, 99], + ["a\u010Dali", 997, 102], + ["eli", -1, 83], + ["ili", -1, 116], + ["acili", 1018, 124], + ["lucili", 1018, 121], + ["nili", 1018, 103], + ["rosili", 1018, 127], + ["jetili", 1018, 118], + ["ozili", 1018, 48], + ["a\u010Dili", 1018, 101], + ["lu\u010Dili", 1018, 117], + ["ro\u0161ili", 1018, 90], + ["oli", -1, 50], + ["asli", -1, 115], + ["nuli", -1, 13], + ["rami", -1, 52], + ["lemi", -1, 51], + ["ni", -1, 13], + ["ani", 1033, 10], + ["acani", 1034, 128], + ["urani", 1034, 105], + ["tani", 1034, 113], + ["avani", 1034, 97], + ["evani", 1034, 96], + ["ivani", 1034, 98], + ["uvani", 1034, 99], + ["a\u010Dani", 1034, 102], + ["aceni", 1033, 124], + ["luceni", 1033, 121], + ["a\u010Deni", 1033, 101], + ["lu\u010Deni", 1033, 117], + ["ini", 1033, 11], + ["cini", 1047, 137], + ["\u010Dini", 1047, 89], + ["oni", 1033, 12], + ["ari", -1, 53], + ["dri", -1, 
54], + ["eri", -1, 55], + ["ori", -1, 56], + ["basi", -1, 135], + ["gasi", -1, 131], + ["jasi", -1, 129], + ["kasi", -1, 133], + ["nasi", -1, 132], + ["tasi", -1, 130], + ["vasi", -1, 134], + ["esi", -1, 152], + ["isi", -1, 154], + ["osi", -1, 123], + ["avsi", -1, 161], + ["acavsi", 1065, 128], + ["iravsi", 1065, 155], + ["tavsi", 1065, 160], + ["etavsi", 1068, 153], + ["astavsi", 1068, 141], + ["istavsi", 1068, 142], + ["ostavsi", 1068, 143], + ["ivsi", -1, 162], + ["nivsi", 1073, 158], + ["rosivsi", 1073, 127], + ["nuvsi", -1, 164], + ["ati", -1, 104], + ["acati", 1077, 128], + ["astajati", 1077, 106], + ["istajati", 1077, 107], + ["ostajati", 1077, 108], + ["injati", 1077, 114], + ["ikati", 1077, 68], + ["lati", 1077, 69], + ["irati", 1077, 100], + ["urati", 1077, 105], + ["tati", 1077, 113], + ["astati", 1087, 110], + ["istati", 1087, 111], + ["ostati", 1087, 112], + ["avati", 1077, 97], + ["evati", 1077, 96], + ["ivati", 1077, 98], + ["ovati", 1077, 76], + ["uvati", 1077, 99], + ["a\u010Dati", 1077, 102], + ["eti", -1, 70], + ["iti", -1, 116], + ["aciti", 1098, 124], + ["luciti", 1098, 121], + ["niti", 1098, 103], + ["rositi", 1098, 127], + ["jetiti", 1098, 118], + ["eviti", 1098, 92], + ["oviti", 1098, 93], + ["a\u010Diti", 1098, 101], + ["lu\u010Diti", 1098, 117], + ["ro\u0161iti", 1098, 90], + ["asti", -1, 94], + ["esti", -1, 71], + ["isti", -1, 72], + ["ksti", -1, 73], + ["osti", -1, 74], + ["nuti", -1, 13], + ["avi", -1, 77], + ["evi", -1, 78], + ["ajevi", 1116, 109], + ["cajevi", 1117, 26], + ["lajevi", 1117, 30], + ["rajevi", 1117, 31], + ["\u0107ajevi", 1117, 28], + ["\u010Dajevi", 1117, 27], + ["\u0111ajevi", 1117, 29], + ["ivi", -1, 79], + ["ovi", -1, 80], + ["govi", 1125, 20], + ["ugovi", 1126, 17], + ["lovi", 1125, 82], + ["olovi", 1128, 49], + ["movi", 1125, 81], + ["onovi", 1125, 12], + ["ie\u0107i", -1, 116], + ["a\u010De\u0107i", -1, 101], + ["aju\u0107i", -1, 104], + ["iraju\u0107i", 1134, 100], + ["uraju\u0107i", 1134, 105], + 
["astaju\u0107i", 1134, 106], + ["istaju\u0107i", 1134, 107], + ["ostaju\u0107i", 1134, 108], + ["avaju\u0107i", 1134, 97], + ["evaju\u0107i", 1134, 96], + ["ivaju\u0107i", 1134, 98], + ["uvaju\u0107i", 1134, 99], + ["uju\u0107i", -1, 25], + ["iruju\u0107i", 1144, 100], + ["lu\u010Duju\u0107i", 1144, 117], + ["nu\u0107i", -1, 13], + ["etu\u0107i", -1, 70], + ["astu\u0107i", -1, 115], + ["a\u010Di", -1, 101], + ["lu\u010Di", -1, 117], + ["ba\u0161i", -1, 63], + ["ga\u0161i", -1, 64], + ["ja\u0161i", -1, 61], + ["ka\u0161i", -1, 62], + ["na\u0161i", -1, 60], + ["ta\u0161i", -1, 59], + ["va\u0161i", -1, 65], + ["e\u0161i", -1, 66], + ["i\u0161i", -1, 67], + ["o\u0161i", -1, 91], + ["av\u0161i", -1, 104], + ["irav\u0161i", 1162, 100], + ["tav\u0161i", 1162, 113], + ["etav\u0161i", 1164, 70], + ["astav\u0161i", 1164, 110], + ["istav\u0161i", 1164, 111], + ["ostav\u0161i", 1164, 112], + ["a\u010Dav\u0161i", 1162, 102], + ["iv\u0161i", -1, 116], + ["niv\u0161i", 1170, 103], + ["ro\u0161iv\u0161i", 1170, 90], + ["nuv\u0161i", -1, 13], + ["aj", -1, 104], + ["uraj", 1174, 105], + ["taj", 1174, 113], + ["avaj", 1174, 97], + ["evaj", 1174, 96], + ["ivaj", 1174, 98], + ["uvaj", 1174, 99], + ["ij", -1, 116], + ["acoj", -1, 124], + ["ecoj", -1, 125], + ["ucoj", -1, 126], + ["anjijoj", -1, 84], + ["enjijoj", -1, 85], + ["snjijoj", -1, 122], + ["\u0161njijoj", -1, 86], + ["kijoj", -1, 95], + ["skijoj", 1189, 1], + ["\u0161kijoj", 1189, 2], + ["elijoj", -1, 83], + ["nijoj", -1, 13], + ["osijoj", -1, 123], + ["evitijoj", -1, 92], + ["ovitijoj", -1, 93], + ["astijoj", -1, 94], + ["avijoj", -1, 77], + ["evijoj", -1, 78], + ["ivijoj", -1, 79], + ["ovijoj", -1, 80], + ["o\u0161ijoj", -1, 91], + ["anjoj", -1, 84], + ["enjoj", -1, 85], + ["snjoj", -1, 122], + ["\u0161njoj", -1, 86], + ["koj", -1, 95], + ["skoj", 1207, 1], + ["\u0161koj", 1207, 2], + ["aloj", -1, 104], + ["eloj", -1, 83], + ["noj", -1, 13], + ["cinoj", 1212, 137], + ["\u010Dinoj", 1212, 89], + ["osoj", -1, 123], + ["atoj", 
-1, 120], + ["evitoj", -1, 92], + ["ovitoj", -1, 93], + ["astoj", -1, 94], + ["avoj", -1, 77], + ["evoj", -1, 78], + ["ivoj", -1, 79], + ["ovoj", -1, 80], + ["a\u0107oj", -1, 14], + ["e\u0107oj", -1, 15], + ["u\u0107oj", -1, 16], + ["o\u0161oj", -1, 91], + ["lucuj", -1, 121], + ["iruj", -1, 100], + ["lu\u010Duj", -1, 117], + ["al", -1, 104], + ["iral", 1231, 100], + ["ural", 1231, 105], + ["el", -1, 119], + ["il", -1, 116], + ["am", -1, 104], + ["acam", 1236, 128], + ["iram", 1236, 100], + ["uram", 1236, 105], + ["tam", 1236, 113], + ["avam", 1236, 97], + ["evam", 1236, 96], + ["ivam", 1236, 98], + ["uvam", 1236, 99], + ["a\u010Dam", 1236, 102], + ["em", -1, 119], + ["acem", 1246, 124], + ["ecem", 1246, 125], + ["ucem", 1246, 126], + ["astadem", 1246, 110], + ["istadem", 1246, 111], + ["ostadem", 1246, 112], + ["ajem", 1246, 104], + ["cajem", 1253, 26], + ["lajem", 1253, 30], + ["rajem", 1253, 31], + ["astajem", 1253, 106], + ["istajem", 1253, 107], + ["ostajem", 1253, 108], + ["\u0107ajem", 1253, 28], + ["\u010Dajem", 1253, 27], + ["\u0111ajem", 1253, 29], + ["ijem", 1246, 116], + ["anjijem", 1263, 84], + ["enjijem", 1263, 85], + ["snjijem", 1263, 123], + ["\u0161njijem", 1263, 86], + ["kijem", 1263, 95], + ["skijem", 1268, 1], + ["\u0161kijem", 1268, 2], + ["lijem", 1263, 24], + ["elijem", 1271, 83], + ["nijem", 1263, 13], + ["rarijem", 1263, 21], + ["sijem", 1263, 23], + ["osijem", 1275, 123], + ["atijem", 1263, 120], + ["evitijem", 1263, 92], + ["ovitijem", 1263, 93], + ["otijem", 1263, 22], + ["astijem", 1263, 94], + ["avijem", 1263, 77], + ["evijem", 1263, 78], + ["ivijem", 1263, 79], + ["ovijem", 1263, 80], + ["o\u0161ijem", 1263, 91], + ["anjem", 1246, 84], + ["enjem", 1246, 85], + ["injem", 1246, 114], + ["snjem", 1246, 122], + ["\u0161njem", 1246, 86], + ["ujem", 1246, 25], + ["lucujem", 1292, 121], + ["irujem", 1292, 100], + ["lu\u010Dujem", 1292, 117], + ["kem", 1246, 95], + ["skem", 1296, 1], + ["\u0161kem", 1296, 2], + ["elem", 1246, 83], + ["nem", 
1246, 13], + ["anem", 1300, 10], + ["astanem", 1301, 110], + ["istanem", 1301, 111], + ["ostanem", 1301, 112], + ["enem", 1300, 87], + ["snem", 1300, 159], + ["\u0161nem", 1300, 88], + ["basem", 1246, 135], + ["gasem", 1246, 131], + ["jasem", 1246, 129], + ["kasem", 1246, 133], + ["nasem", 1246, 132], + ["tasem", 1246, 130], + ["vasem", 1246, 134], + ["esem", 1246, 152], + ["isem", 1246, 154], + ["osem", 1246, 123], + ["atem", 1246, 120], + ["etem", 1246, 70], + ["evitem", 1246, 92], + ["ovitem", 1246, 93], + ["astem", 1246, 94], + ["istem", 1246, 151], + ["i\u0161tem", 1246, 75], + ["avem", 1246, 77], + ["evem", 1246, 78], + ["ivem", 1246, 79], + ["a\u0107em", 1246, 14], + ["e\u0107em", 1246, 15], + ["u\u0107em", 1246, 16], + ["ba\u0161em", 1246, 63], + ["ga\u0161em", 1246, 64], + ["ja\u0161em", 1246, 61], + ["ka\u0161em", 1246, 62], + ["na\u0161em", 1246, 60], + ["ta\u0161em", 1246, 59], + ["va\u0161em", 1246, 65], + ["e\u0161em", 1246, 66], + ["i\u0161em", 1246, 67], + ["o\u0161em", 1246, 91], + ["im", -1, 116], + ["acim", 1341, 124], + ["ecim", 1341, 125], + ["ucim", 1341, 126], + ["lucim", 1344, 121], + ["anjijim", 1341, 84], + ["enjijim", 1341, 85], + ["snjijim", 1341, 122], + ["\u0161njijim", 1341, 86], + ["kijim", 1341, 95], + ["skijim", 1350, 1], + ["\u0161kijim", 1350, 2], + ["elijim", 1341, 83], + ["nijim", 1341, 13], + ["osijim", 1341, 123], + ["atijim", 1341, 120], + ["evitijim", 1341, 92], + ["ovitijim", 1341, 93], + ["astijim", 1341, 94], + ["avijim", 1341, 77], + ["evijim", 1341, 78], + ["ivijim", 1341, 79], + ["ovijim", 1341, 80], + ["o\u0161ijim", 1341, 91], + ["anjim", 1341, 84], + ["enjim", 1341, 85], + ["snjim", 1341, 122], + ["\u0161njim", 1341, 86], + ["kim", 1341, 95], + ["skim", 1369, 1], + ["\u0161kim", 1369, 2], + ["elim", 1341, 83], + ["nim", 1341, 13], + ["cinim", 1373, 137], + ["\u010Dinim", 1373, 89], + ["osim", 1341, 123], + ["rosim", 1376, 127], + ["atim", 1341, 120], + ["jetim", 1341, 118], + ["evitim", 1341, 92], + ["ovitim", 
1341, 93], + ["astim", 1341, 94], + ["avim", 1341, 77], + ["evim", 1341, 78], + ["ivim", 1341, 79], + ["ovim", 1341, 80], + ["a\u0107im", 1341, 14], + ["e\u0107im", 1341, 15], + ["u\u0107im", 1341, 16], + ["a\u010Dim", 1341, 101], + ["lu\u010Dim", 1341, 117], + ["o\u0161im", 1341, 91], + ["ro\u0161im", 1392, 90], + ["acom", -1, 124], + ["ecom", -1, 125], + ["ucom", -1, 126], + ["gom", -1, 20], + ["logom", 1397, 19], + ["ugom", 1397, 18], + ["bijom", -1, 32], + ["cijom", -1, 33], + ["dijom", -1, 34], + ["fijom", -1, 40], + ["gijom", -1, 39], + ["lijom", -1, 35], + ["mijom", -1, 37], + ["nijom", -1, 36], + ["ganijom", 1407, 9], + ["manijom", 1407, 6], + ["panijom", 1407, 7], + ["ranijom", 1407, 8], + ["tanijom", 1407, 5], + ["pijom", -1, 41], + ["rijom", -1, 42], + ["sijom", -1, 43], + ["tijom", -1, 44], + ["zijom", -1, 45], + ["\u017Eijom", -1, 38], + ["anjom", -1, 84], + ["enjom", -1, 85], + ["snjom", -1, 122], + ["\u0161njom", -1, 86], + ["kom", -1, 95], + ["skom", 1423, 1], + ["\u0161kom", 1423, 2], + ["alom", -1, 104], + ["ijalom", 1426, 47], + ["nalom", 1426, 46], + ["elom", -1, 83], + ["ilom", -1, 116], + ["ozilom", 1430, 48], + ["olom", -1, 50], + ["ramom", -1, 52], + ["lemom", -1, 51], + ["nom", -1, 13], + ["anom", 1435, 10], + ["inom", 1435, 11], + ["cinom", 1437, 137], + ["aninom", 1437, 10], + ["\u010Dinom", 1437, 89], + ["onom", 1435, 12], + ["arom", -1, 53], + ["drom", -1, 54], + ["erom", -1, 55], + ["orom", -1, 56], + ["basom", -1, 135], + ["gasom", -1, 131], + ["jasom", -1, 129], + ["kasom", -1, 133], + ["nasom", -1, 132], + ["tasom", -1, 130], + ["vasom", -1, 134], + ["esom", -1, 57], + ["isom", -1, 58], + ["osom", -1, 123], + ["atom", -1, 120], + ["ikatom", 1456, 68], + ["latom", 1456, 69], + ["etom", -1, 70], + ["evitom", -1, 92], + ["ovitom", -1, 93], + ["astom", -1, 94], + ["estom", -1, 71], + ["istom", -1, 72], + ["kstom", -1, 73], + ["ostom", -1, 74], + ["avom", -1, 77], + ["evom", -1, 78], + ["ivom", -1, 79], + ["ovom", -1, 80], + ["lovom", 
1470, 82], + ["movom", 1470, 81], + ["stvom", -1, 3], + ["\u0161tvom", -1, 4], + ["a\u0107om", -1, 14], + ["e\u0107om", -1, 15], + ["u\u0107om", -1, 16], + ["ba\u0161om", -1, 63], + ["ga\u0161om", -1, 64], + ["ja\u0161om", -1, 61], + ["ka\u0161om", -1, 62], + ["na\u0161om", -1, 60], + ["ta\u0161om", -1, 59], + ["va\u0161om", -1, 65], + ["e\u0161om", -1, 66], + ["i\u0161om", -1, 67], + ["o\u0161om", -1, 91], + ["an", -1, 104], + ["acan", 1488, 128], + ["iran", 1488, 100], + ["uran", 1488, 105], + ["tan", 1488, 113], + ["avan", 1488, 97], + ["evan", 1488, 96], + ["ivan", 1488, 98], + ["uvan", 1488, 99], + ["a\u010Dan", 1488, 102], + ["acen", -1, 124], + ["lucen", -1, 121], + ["a\u010Den", -1, 101], + ["lu\u010Den", -1, 117], + ["anin", -1, 10], + ["ao", -1, 104], + ["acao", 1503, 128], + ["astajao", 1503, 106], + ["istajao", 1503, 107], + ["ostajao", 1503, 108], + ["injao", 1503, 114], + ["irao", 1503, 100], + ["urao", 1503, 105], + ["tao", 1503, 113], + ["astao", 1511, 110], + ["istao", 1511, 111], + ["ostao", 1511, 112], + ["avao", 1503, 97], + ["evao", 1503, 96], + ["ivao", 1503, 98], + ["ovao", 1503, 76], + ["uvao", 1503, 99], + ["a\u010Dao", 1503, 102], + ["go", -1, 20], + ["ugo", 1521, 18], + ["io", -1, 116], + ["acio", 1523, 124], + ["lucio", 1523, 121], + ["lio", 1523, 24], + ["nio", 1523, 103], + ["rario", 1523, 21], + ["sio", 1523, 23], + ["rosio", 1529, 127], + ["jetio", 1523, 118], + ["otio", 1523, 22], + ["a\u010Dio", 1523, 101], + ["lu\u010Dio", 1523, 117], + ["ro\u0161io", 1523, 90], + ["bijo", -1, 32], + ["cijo", -1, 33], + ["dijo", -1, 34], + ["fijo", -1, 40], + ["gijo", -1, 39], + ["lijo", -1, 35], + ["mijo", -1, 37], + ["nijo", -1, 36], + ["pijo", -1, 41], + ["rijo", -1, 42], + ["sijo", -1, 43], + ["tijo", -1, 44], + ["zijo", -1, 45], + ["\u017Eijo", -1, 38], + ["anjo", -1, 84], + ["enjo", -1, 85], + ["snjo", -1, 122], + ["\u0161njo", -1, 86], + ["ko", -1, 95], + ["sko", 1554, 1], + ["\u0161ko", 1554, 2], + ["alo", -1, 104], + ["acalo", 1557, 128], 
+ ["astajalo", 1557, 106], + ["istajalo", 1557, 107], + ["ostajalo", 1557, 108], + ["ijalo", 1557, 47], + ["injalo", 1557, 114], + ["nalo", 1557, 46], + ["iralo", 1557, 100], + ["uralo", 1557, 105], + ["talo", 1557, 113], + ["astalo", 1567, 110], + ["istalo", 1567, 111], + ["ostalo", 1567, 112], + ["avalo", 1557, 97], + ["evalo", 1557, 96], + ["ivalo", 1557, 98], + ["ovalo", 1557, 76], + ["uvalo", 1557, 99], + ["a\u010Dalo", 1557, 102], + ["elo", -1, 83], + ["ilo", -1, 116], + ["acilo", 1578, 124], + ["lucilo", 1578, 121], + ["nilo", 1578, 103], + ["rosilo", 1578, 127], + ["jetilo", 1578, 118], + ["a\u010Dilo", 1578, 101], + ["lu\u010Dilo", 1578, 117], + ["ro\u0161ilo", 1578, 90], + ["aslo", -1, 115], + ["nulo", -1, 13], + ["amo", -1, 104], + ["acamo", 1589, 128], + ["ramo", 1589, 52], + ["iramo", 1591, 100], + ["uramo", 1591, 105], + ["tamo", 1589, 113], + ["avamo", 1589, 97], + ["evamo", 1589, 96], + ["ivamo", 1589, 98], + ["uvamo", 1589, 99], + ["a\u010Damo", 1589, 102], + ["emo", -1, 119], + ["astademo", 1600, 110], + ["istademo", 1600, 111], + ["ostademo", 1600, 112], + ["astajemo", 1600, 106], + ["istajemo", 1600, 107], + ["ostajemo", 1600, 108], + ["ijemo", 1600, 116], + ["injemo", 1600, 114], + ["ujemo", 1600, 25], + ["lucujemo", 1609, 121], + ["irujemo", 1609, 100], + ["lu\u010Dujemo", 1609, 117], + ["lemo", 1600, 51], + ["nemo", 1600, 13], + ["astanemo", 1614, 110], + ["istanemo", 1614, 111], + ["ostanemo", 1614, 112], + ["etemo", 1600, 70], + ["astemo", 1600, 115], + ["imo", -1, 116], + ["acimo", 1620, 124], + ["lucimo", 1620, 121], + ["nimo", 1620, 13], + ["astanimo", 1623, 110], + ["istanimo", 1623, 111], + ["ostanimo", 1623, 112], + ["rosimo", 1620, 127], + ["etimo", 1620, 70], + ["jetimo", 1628, 118], + ["astimo", 1620, 115], + ["a\u010Dimo", 1620, 101], + ["lu\u010Dimo", 1620, 117], + ["ro\u0161imo", 1620, 90], + ["ajmo", -1, 104], + ["urajmo", 1634, 105], + ["tajmo", 1634, 113], + ["astajmo", 1636, 106], + ["istajmo", 1636, 107], + ["ostajmo", 
1636, 108], + ["avajmo", 1634, 97], + ["evajmo", 1634, 96], + ["ivajmo", 1634, 98], + ["uvajmo", 1634, 99], + ["ijmo", -1, 116], + ["ujmo", -1, 25], + ["lucujmo", 1645, 121], + ["irujmo", 1645, 100], + ["lu\u010Dujmo", 1645, 117], + ["asmo", -1, 104], + ["acasmo", 1649, 128], + ["astajasmo", 1649, 106], + ["istajasmo", 1649, 107], + ["ostajasmo", 1649, 108], + ["injasmo", 1649, 114], + ["irasmo", 1649, 100], + ["urasmo", 1649, 105], + ["tasmo", 1649, 113], + ["avasmo", 1649, 97], + ["evasmo", 1649, 96], + ["ivasmo", 1649, 98], + ["ovasmo", 1649, 76], + ["uvasmo", 1649, 99], + ["a\u010Dasmo", 1649, 102], + ["ismo", -1, 116], + ["acismo", 1664, 124], + ["lucismo", 1664, 121], + ["nismo", 1664, 103], + ["rosismo", 1664, 127], + ["jetismo", 1664, 118], + ["a\u010Dismo", 1664, 101], + ["lu\u010Dismo", 1664, 117], + ["ro\u0161ismo", 1664, 90], + ["astadosmo", -1, 110], + ["istadosmo", -1, 111], + ["ostadosmo", -1, 112], + ["nusmo", -1, 13], + ["no", -1, 13], + ["ano", 1677, 104], + ["acano", 1678, 128], + ["urano", 1678, 105], + ["tano", 1678, 113], + ["avano", 1678, 97], + ["evano", 1678, 96], + ["ivano", 1678, 98], + ["uvano", 1678, 99], + ["a\u010Dano", 1678, 102], + ["aceno", 1677, 124], + ["luceno", 1677, 121], + ["a\u010Deno", 1677, 101], + ["lu\u010Deno", 1677, 117], + ["ino", 1677, 11], + ["cino", 1691, 137], + ["\u010Dino", 1691, 89], + ["ato", -1, 120], + ["ikato", 1694, 68], + ["lato", 1694, 69], + ["eto", -1, 70], + ["evito", -1, 92], + ["ovito", -1, 93], + ["asto", -1, 94], + ["esto", -1, 71], + ["isto", -1, 72], + ["ksto", -1, 73], + ["osto", -1, 74], + ["nuto", -1, 13], + ["nuo", -1, 13], + ["avo", -1, 77], + ["evo", -1, 78], + ["ivo", -1, 79], + ["ovo", -1, 80], + ["stvo", -1, 3], + ["\u0161tvo", -1, 4], + ["as", -1, 161], + ["acas", 1713, 128], + ["iras", 1713, 155], + ["uras", 1713, 156], + ["tas", 1713, 160], + ["avas", 1713, 144], + ["evas", 1713, 145], + ["ivas", 1713, 146], + ["uvas", 1713, 147], + ["es", -1, 163], + ["astades", 1722, 141], + 
["istades", 1722, 142], + ["ostades", 1722, 143], + ["astajes", 1722, 138], + ["istajes", 1722, 139], + ["ostajes", 1722, 140], + ["ijes", 1722, 162], + ["injes", 1722, 150], + ["ujes", 1722, 157], + ["lucujes", 1731, 121], + ["irujes", 1731, 155], + ["nes", 1722, 164], + ["astanes", 1734, 141], + ["istanes", 1734, 142], + ["ostanes", 1734, 143], + ["etes", 1722, 153], + ["astes", 1722, 136], + ["is", -1, 162], + ["acis", 1740, 124], + ["lucis", 1740, 121], + ["nis", 1740, 158], + ["rosis", 1740, 127], + ["jetis", 1740, 149], + ["at", -1, 104], + ["acat", 1746, 128], + ["astajat", 1746, 106], + ["istajat", 1746, 107], + ["ostajat", 1746, 108], + ["injat", 1746, 114], + ["irat", 1746, 100], + ["urat", 1746, 105], + ["tat", 1746, 113], + ["astat", 1754, 110], + ["istat", 1754, 111], + ["ostat", 1754, 112], + ["avat", 1746, 97], + ["evat", 1746, 96], + ["ivat", 1746, 98], + ["irivat", 1760, 100], + ["ovat", 1746, 76], + ["uvat", 1746, 99], + ["a\u010Dat", 1746, 102], + ["it", -1, 116], + ["acit", 1765, 124], + ["lucit", 1765, 121], + ["rosit", 1765, 127], + ["jetit", 1765, 118], + ["a\u010Dit", 1765, 101], + ["lu\u010Dit", 1765, 117], + ["ro\u0161it", 1765, 90], + ["nut", -1, 13], + ["astadu", -1, 110], + ["istadu", -1, 111], + ["ostadu", -1, 112], + ["gu", -1, 20], + ["logu", 1777, 19], + ["ugu", 1777, 18], + ["ahu", -1, 104], + ["acahu", 1780, 128], + ["astajahu", 1780, 106], + ["istajahu", 1780, 107], + ["ostajahu", 1780, 108], + ["injahu", 1780, 114], + ["irahu", 1780, 100], + ["urahu", 1780, 105], + ["avahu", 1780, 97], + ["evahu", 1780, 96], + ["ivahu", 1780, 98], + ["ovahu", 1780, 76], + ["uvahu", 1780, 99], + ["a\u010Dahu", 1780, 102], + ["aju", -1, 104], + ["caju", 1794, 26], + ["acaju", 1795, 128], + ["laju", 1794, 30], + ["raju", 1794, 31], + ["iraju", 1798, 100], + ["uraju", 1798, 105], + ["taju", 1794, 113], + ["astaju", 1801, 106], + ["istaju", 1801, 107], + ["ostaju", 1801, 108], + ["avaju", 1794, 97], + ["evaju", 1794, 96], + ["ivaju", 1794, 98], + 
["uvaju", 1794, 99], + ["\u0107aju", 1794, 28], + ["\u010Daju", 1794, 27], + ["a\u010Daju", 1810, 102], + ["\u0111aju", 1794, 29], + ["iju", -1, 116], + ["biju", 1813, 32], + ["ciju", 1813, 33], + ["diju", 1813, 34], + ["fiju", 1813, 40], + ["giju", 1813, 39], + ["anjiju", 1813, 84], + ["enjiju", 1813, 85], + ["snjiju", 1813, 122], + ["\u0161njiju", 1813, 86], + ["kiju", 1813, 95], + ["liju", 1813, 24], + ["eliju", 1824, 83], + ["miju", 1813, 37], + ["niju", 1813, 13], + ["ganiju", 1827, 9], + ["maniju", 1827, 6], + ["paniju", 1827, 7], + ["raniju", 1827, 8], + ["taniju", 1827, 5], + ["piju", 1813, 41], + ["riju", 1813, 42], + ["rariju", 1834, 21], + ["siju", 1813, 23], + ["osiju", 1836, 123], + ["tiju", 1813, 44], + ["atiju", 1838, 120], + ["otiju", 1838, 22], + ["aviju", 1813, 77], + ["eviju", 1813, 78], + ["iviju", 1813, 79], + ["oviju", 1813, 80], + ["ziju", 1813, 45], + ["o\u0161iju", 1813, 91], + ["\u017Eiju", 1813, 38], + ["anju", -1, 84], + ["enju", -1, 85], + ["snju", -1, 122], + ["\u0161nju", -1, 86], + ["uju", -1, 25], + ["lucuju", 1852, 121], + ["iruju", 1852, 100], + ["lu\u010Duju", 1852, 117], + ["ku", -1, 95], + ["sku", 1856, 1], + ["\u0161ku", 1856, 2], + ["alu", -1, 104], + ["ijalu", 1859, 47], + ["nalu", 1859, 46], + ["elu", -1, 83], + ["ilu", -1, 116], + ["ozilu", 1863, 48], + ["olu", -1, 50], + ["ramu", -1, 52], + ["acemu", -1, 124], + ["ecemu", -1, 125], + ["ucemu", -1, 126], + ["anjijemu", -1, 84], + ["enjijemu", -1, 85], + ["snjijemu", -1, 122], + ["\u0161njijemu", -1, 86], + ["kijemu", -1, 95], + ["skijemu", 1874, 1], + ["\u0161kijemu", 1874, 2], + ["elijemu", -1, 83], + ["nijemu", -1, 13], + ["osijemu", -1, 123], + ["atijemu", -1, 120], + ["evitijemu", -1, 92], + ["ovitijemu", -1, 93], + ["astijemu", -1, 94], + ["avijemu", -1, 77], + ["evijemu", -1, 78], + ["ivijemu", -1, 79], + ["ovijemu", -1, 80], + ["o\u0161ijemu", -1, 91], + ["anjemu", -1, 84], + ["enjemu", -1, 85], + ["snjemu", -1, 122], + ["\u0161njemu", -1, 86], + ["kemu", -1, 95], + 
["skemu", 1893, 1], + ["\u0161kemu", 1893, 2], + ["lemu", -1, 51], + ["elemu", 1896, 83], + ["nemu", -1, 13], + ["anemu", 1898, 10], + ["enemu", 1898, 87], + ["snemu", 1898, 159], + ["\u0161nemu", 1898, 88], + ["osemu", -1, 123], + ["atemu", -1, 120], + ["evitemu", -1, 92], + ["ovitemu", -1, 93], + ["astemu", -1, 94], + ["avemu", -1, 77], + ["evemu", -1, 78], + ["ivemu", -1, 79], + ["ovemu", -1, 80], + ["a\u0107emu", -1, 14], + ["e\u0107emu", -1, 15], + ["u\u0107emu", -1, 16], + ["o\u0161emu", -1, 91], + ["acomu", -1, 124], + ["ecomu", -1, 125], + ["ucomu", -1, 126], + ["anjomu", -1, 84], + ["enjomu", -1, 85], + ["snjomu", -1, 122], + ["\u0161njomu", -1, 86], + ["komu", -1, 95], + ["skomu", 1923, 1], + ["\u0161komu", 1923, 2], + ["elomu", -1, 83], + ["nomu", -1, 13], + ["cinomu", 1927, 137], + ["\u010Dinomu", 1927, 89], + ["osomu", -1, 123], + ["atomu", -1, 120], + ["evitomu", -1, 92], + ["ovitomu", -1, 93], + ["astomu", -1, 94], + ["avomu", -1, 77], + ["evomu", -1, 78], + ["ivomu", -1, 79], + ["ovomu", -1, 80], + ["a\u0107omu", -1, 14], + ["e\u0107omu", -1, 15], + ["u\u0107omu", -1, 16], + ["o\u0161omu", -1, 91], + ["nu", -1, 13], + ["anu", 1943, 10], + ["astanu", 1944, 110], + ["istanu", 1944, 111], + ["ostanu", 1944, 112], + ["inu", 1943, 11], + ["cinu", 1948, 137], + ["aninu", 1948, 10], + ["\u010Dinu", 1948, 89], + ["onu", 1943, 12], + ["aru", -1, 53], + ["dru", -1, 54], + ["eru", -1, 55], + ["oru", -1, 56], + ["basu", -1, 135], + ["gasu", -1, 131], + ["jasu", -1, 129], + ["kasu", -1, 133], + ["nasu", -1, 132], + ["tasu", -1, 130], + ["vasu", -1, 134], + ["esu", -1, 57], + ["isu", -1, 58], + ["osu", -1, 123], + ["atu", -1, 120], + ["ikatu", 1967, 68], + ["latu", 1967, 69], + ["etu", -1, 70], + ["evitu", -1, 92], + ["ovitu", -1, 93], + ["astu", -1, 94], + ["estu", -1, 71], + ["istu", -1, 72], + ["kstu", -1, 73], + ["ostu", -1, 74], + ["i\u0161tu", -1, 75], + ["avu", -1, 77], + ["evu", -1, 78], + ["ivu", -1, 79], + ["ovu", -1, 80], + ["lovu", 1982, 82], + 
["movu", 1982, 81], + ["stvu", -1, 3], + ["\u0161tvu", -1, 4], + ["ba\u0161u", -1, 63], + ["ga\u0161u", -1, 64], + ["ja\u0161u", -1, 61], + ["ka\u0161u", -1, 62], + ["na\u0161u", -1, 60], + ["ta\u0161u", -1, 59], + ["va\u0161u", -1, 65], + ["e\u0161u", -1, 66], + ["i\u0161u", -1, 67], + ["o\u0161u", -1, 91], + ["avav", -1, 97], + ["evav", -1, 96], + ["ivav", -1, 98], + ["uvav", -1, 99], + ["kov", -1, 95], + ["a\u0161", -1, 104], + ["ira\u0161", 2002, 100], + ["ura\u0161", 2002, 105], + ["ta\u0161", 2002, 113], + ["ava\u0161", 2002, 97], + ["eva\u0161", 2002, 96], + ["iva\u0161", 2002, 98], + ["uva\u0161", 2002, 99], + ["a\u010Da\u0161", 2002, 102], + ["e\u0161", -1, 119], + ["astade\u0161", 2011, 110], + ["istade\u0161", 2011, 111], + ["ostade\u0161", 2011, 112], + ["astaje\u0161", 2011, 106], + ["istaje\u0161", 2011, 107], + ["ostaje\u0161", 2011, 108], + ["ije\u0161", 2011, 116], + ["inje\u0161", 2011, 114], + ["uje\u0161", 2011, 25], + ["iruje\u0161", 2020, 100], + ["lu\u010Duje\u0161", 2020, 117], + ["ne\u0161", 2011, 13], + ["astane\u0161", 2023, 110], + ["istane\u0161", 2023, 111], + ["ostane\u0161", 2023, 112], + ["ete\u0161", 2011, 70], + ["aste\u0161", 2011, 115], + ["i\u0161", -1, 116], + ["ni\u0161", 2029, 103], + ["jeti\u0161", 2029, 118], + ["a\u010Di\u0161", 2029, 101], + ["lu\u010Di\u0161", 2029, 117], + ["ro\u0161i\u0161", 2029, 90] + ]; + + /** @const */ var a_3 = [ + ["a", -1, 1], + ["oga", 0, 1], + ["ama", 0, 1], + ["ima", 0, 1], + ["ena", 0, 1], + ["e", -1, 1], + ["og", -1, 1], + ["anog", 6, 1], + ["enog", 6, 1], + ["anih", -1, 1], + ["enih", -1, 1], + ["i", -1, 1], + ["ani", 11, 1], + ["eni", 11, 1], + ["anoj", -1, 1], + ["enoj", -1, 1], + ["anim", -1, 1], + ["enim", -1, 1], + ["om", -1, 1], + ["enom", 18, 1], + ["o", -1, 1], + ["ano", 20, 1], + ["eno", 20, 1], + ["ost", -1, 1], + ["u", -1, 1], + ["enu", 24, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16]; + + /** @const */ var /** Array */ g_sa = [65, 4, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 4, 0, 0, 128]; + + /** @const */ var /** Array */ g_ca = [119, 95, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 136, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 16]; + + /** @const */ var /** Array */ g_rg = [1]; + + var /** number */ I_p1 = 0; + var /** boolean */ B_no_diacritics = false; + + + /** @return {boolean} */ + function r_cyr_to_lat() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + golab2: while(true) + { + var /** number */ v_3 = base.cursor; + lab3: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + if (among_var == 0) + { + break lab3; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("a")) + { + return false; + } + break; + case 2: + if (!base.slice_from("b")) + { + return false; + } + break; + case 3: + if (!base.slice_from("v")) + { + return false; + } + break; + case 4: + if (!base.slice_from("g")) + { + return false; + } + break; + case 5: + if (!base.slice_from("d")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u0111")) + { + return false; + } + break; + case 7: + if (!base.slice_from("e")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u017E")) + { + return false; + } + break; + case 9: + if (!base.slice_from("z")) + { + return false; + } + break; + case 10: + if (!base.slice_from("i")) + { + return false; + } + break; + case 11: + if (!base.slice_from("j")) + { + return false; + } + break; + case 12: + if (!base.slice_from("k")) + { + return false; + } + break; + case 13: + if (!base.slice_from("l")) + { + return false; + } + break; + case 14: + if (!base.slice_from("lj")) + { + return false; + } + break; + case 15: + if (!base.slice_from("m")) + { + return false; + } + break; + case 16: + if (!base.slice_from("n")) + { + return false; + } + break; + case 17: + if (!base.slice_from("nj")) + { + return false; + } + break; + case 18: + 
if (!base.slice_from("o")) + { + return false; + } + break; + case 19: + if (!base.slice_from("p")) + { + return false; + } + break; + case 20: + if (!base.slice_from("r")) + { + return false; + } + break; + case 21: + if (!base.slice_from("s")) + { + return false; + } + break; + case 22: + if (!base.slice_from("t")) + { + return false; + } + break; + case 23: + if (!base.slice_from("\u0107")) + { + return false; + } + break; + case 24: + if (!base.slice_from("u")) + { + return false; + } + break; + case 25: + if (!base.slice_from("f")) + { + return false; + } + break; + case 26: + if (!base.slice_from("h")) + { + return false; + } + break; + case 27: + if (!base.slice_from("c")) + { + return false; + } + break; + case 28: + if (!base.slice_from("\u010D")) + { + return false; + } + break; + case 29: + if (!base.slice_from("d\u017E")) + { + return false; + } + break; + case 30: + if (!base.slice_from("\u0161")) + { + return false; + } + break; + } + base.cursor = v_3; + break golab2; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_prelude() { + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + golab2: while(true) + { + var /** number */ v_3 = base.cursor; + lab3: { + if (!(base.in_grouping(g_ca, 98, 382))) + { + break lab3; + } + base.bra = base.cursor; + if (!(base.eq_s("ije"))) + { + break lab3; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_ca, 98, 382))) + { + break lab3; + } + if (!base.slice_from("e")) + { + return false; + } + base.cursor = v_3; + break golab2; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + var /** number */ v_4 = base.cursor; + lab4: { + while(true) + 
{ + var /** number */ v_5 = base.cursor; + lab5: { + golab6: while(true) + { + var /** number */ v_6 = base.cursor; + lab7: { + if (!(base.in_grouping(g_ca, 98, 382))) + { + break lab7; + } + base.bra = base.cursor; + if (!(base.eq_s("je"))) + { + break lab7; + } + base.ket = base.cursor; + if (!(base.in_grouping(g_ca, 98, 382))) + { + break lab7; + } + if (!base.slice_from("e")) + { + return false; + } + base.cursor = v_6; + break golab6; + } + base.cursor = v_6; + if (base.cursor >= base.limit) + { + break lab5; + } + base.cursor++; + } + continue; + } + base.cursor = v_5; + break; + } + } + base.cursor = v_4; + var /** number */ v_7 = base.cursor; + lab8: { + while(true) + { + var /** number */ v_8 = base.cursor; + lab9: { + golab10: while(true) + { + var /** number */ v_9 = base.cursor; + lab11: { + base.bra = base.cursor; + if (!(base.eq_s("dj"))) + { + break lab11; + } + base.ket = base.cursor; + if (!base.slice_from("\u0111")) + { + return false; + } + base.cursor = v_9; + break golab10; + } + base.cursor = v_9; + if (base.cursor >= base.limit) + { + break lab9; + } + base.cursor++; + } + continue; + } + base.cursor = v_8; + break; + } + } + base.cursor = v_7; + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + B_no_diacritics = true; + var /** number */ v_1 = base.cursor; + lab0: { + golab1: while(true) + { + lab2: { + if (!(base.in_grouping(g_sa, 263, 382))) + { + break lab2; + } + break golab1; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + B_no_diacritics = false; + } + base.cursor = v_1; + I_p1 = base.limit; + var /** number */ v_3 = base.cursor; + lab3: { + golab4: while(true) + { + lab5: { + if (!(base.in_grouping(g_v, 97, 117))) + { + break lab5; + } + break golab4; + } + if (base.cursor >= base.limit) + { + break lab3; + } + base.cursor++; + } + I_p1 = base.cursor; + if (I_p1 >= 2) + { + break lab3; + } + golab6: while(true) + { + lab7: { + if (!(base.out_grouping(g_v, 97, 117))) + { + 
break lab7; + } + break golab6; + } + if (base.cursor >= base.limit) + { + break lab3; + } + base.cursor++; + } + I_p1 = base.cursor; + } + base.cursor = v_3; + var /** number */ v_6 = base.cursor; + lab8: { + golab9: while(true) + { + lab10: { + if (!(base.eq_s("r"))) + { + break lab10; + } + break golab9; + } + if (base.cursor >= base.limit) + { + break lab8; + } + base.cursor++; + } + lab11: { + var /** number */ v_8 = base.cursor; + lab12: { + if (base.cursor < 2) + { + break lab12; + } + break lab11; + } + base.cursor = v_8; + golab13: while(true) + { + lab14: { + if (!(base.out_grouping(g_rg, 114, 114))) + { + break lab14; + } + break golab13; + } + if (base.cursor >= base.limit) + { + break lab8; + } + base.cursor++; + } + } + if ((I_p1 - base.cursor) <= 1) + { + break lab8; + } + I_p1 = base.cursor; + } + base.cursor = v_6; + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_Step_1() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_1); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("loga")) + { + return false; + } + break; + case 2: + if (!base.slice_from("peh")) + { + return false; + } + break; + case 3: + if (!base.slice_from("vojka")) + { + return false; + } + break; + case 4: + if (!base.slice_from("bojka")) + { + return false; + } + break; + case 5: + if (!base.slice_from("jak")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u010Dajni")) + { + return false; + } + break; + case 7: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("cajni")) + { + return false; + } + break; + case 8: + if (!base.slice_from("erni")) + { + return false; + } + break; + case 9: + if (!base.slice_from("larni")) + { + return false; + } + break; + case 10: + if (!base.slice_from("esni")) + { + return false; + } + break; + 
case 11: + if (!base.slice_from("anjca")) + { + return false; + } + break; + case 12: + if (!base.slice_from("ajca")) + { + return false; + } + break; + case 13: + if (!base.slice_from("ljca")) + { + return false; + } + break; + case 14: + if (!base.slice_from("ejca")) + { + return false; + } + break; + case 15: + if (!base.slice_from("ojca")) + { + return false; + } + break; + case 16: + if (!base.slice_from("ajka")) + { + return false; + } + break; + case 17: + if (!base.slice_from("ojka")) + { + return false; + } + break; + case 18: + if (!base.slice_from("\u0161ca")) + { + return false; + } + break; + case 19: + if (!base.slice_from("ing")) + { + return false; + } + break; + case 20: + if (!base.slice_from("tvenik")) + { + return false; + } + break; + case 21: + if (!base.slice_from("tetika")) + { + return false; + } + break; + case 22: + if (!base.slice_from("nstva")) + { + return false; + } + break; + case 23: + if (!base.slice_from("nik")) + { + return false; + } + break; + case 24: + if (!base.slice_from("tik")) + { + return false; + } + break; + case 25: + if (!base.slice_from("zik")) + { + return false; + } + break; + case 26: + if (!base.slice_from("snik")) + { + return false; + } + break; + case 27: + if (!base.slice_from("kusi")) + { + return false; + } + break; + case 28: + if (!base.slice_from("kusni")) + { + return false; + } + break; + case 29: + if (!base.slice_from("kustva")) + { + return false; + } + break; + case 30: + if (!base.slice_from("du\u0161ni")) + { + return false; + } + break; + case 31: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("dusni")) + { + return false; + } + break; + case 32: + if (!base.slice_from("antni")) + { + return false; + } + break; + case 33: + if (!base.slice_from("bilni")) + { + return false; + } + break; + case 34: + if (!base.slice_from("tilni")) + { + return false; + } + break; + case 35: + if (!base.slice_from("avilni")) + { + return false; + } + break; + case 36: + if 
(!base.slice_from("silni")) + { + return false; + } + break; + case 37: + if (!base.slice_from("gilni")) + { + return false; + } + break; + case 38: + if (!base.slice_from("rilni")) + { + return false; + } + break; + case 39: + if (!base.slice_from("nilni")) + { + return false; + } + break; + case 40: + if (!base.slice_from("alni")) + { + return false; + } + break; + case 41: + if (!base.slice_from("ozni")) + { + return false; + } + break; + case 42: + if (!base.slice_from("ravi")) + { + return false; + } + break; + case 43: + if (!base.slice_from("stavni")) + { + return false; + } + break; + case 44: + if (!base.slice_from("pravni")) + { + return false; + } + break; + case 45: + if (!base.slice_from("tivni")) + { + return false; + } + break; + case 46: + if (!base.slice_from("sivni")) + { + return false; + } + break; + case 47: + if (!base.slice_from("atni")) + { + return false; + } + break; + case 48: + if (!base.slice_from("enta")) + { + return false; + } + break; + case 49: + if (!base.slice_from("tetni")) + { + return false; + } + break; + case 50: + if (!base.slice_from("pletni")) + { + return false; + } + break; + case 51: + if (!base.slice_from("\u0161avi")) + { + return false; + } + break; + case 52: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("savi")) + { + return false; + } + break; + case 53: + if (!base.slice_from("anta")) + { + return false; + } + break; + case 54: + if (!base.slice_from("a\u010Dka")) + { + return false; + } + break; + case 55: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("acka")) + { + return false; + } + break; + case 56: + if (!base.slice_from("u\u0161ka")) + { + return false; + } + break; + case 57: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("uska")) + { + return false; + } + break; + case 58: + if (!base.slice_from("atka")) + { + return false; + } + break; + case 59: + if (!base.slice_from("etka")) + { + return false; + } + break; + case 60: + if 
(!base.slice_from("itka")) + { + return false; + } + break; + case 61: + if (!base.slice_from("otka")) + { + return false; + } + break; + case 62: + if (!base.slice_from("utka")) + { + return false; + } + break; + case 63: + if (!base.slice_from("eskna")) + { + return false; + } + break; + case 64: + if (!base.slice_from("ti\u010Dni")) + { + return false; + } + break; + case 65: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ticni")) + { + return false; + } + break; + case 66: + if (!base.slice_from("ojska")) + { + return false; + } + break; + case 67: + if (!base.slice_from("esma")) + { + return false; + } + break; + case 68: + if (!base.slice_from("metra")) + { + return false; + } + break; + case 69: + if (!base.slice_from("centra")) + { + return false; + } + break; + case 70: + if (!base.slice_from("istra")) + { + return false; + } + break; + case 71: + if (!base.slice_from("osti")) + { + return false; + } + break; + case 72: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("osti")) + { + return false; + } + break; + case 73: + if (!base.slice_from("dba")) + { + return false; + } + break; + case 74: + if (!base.slice_from("\u010Dka")) + { + return false; + } + break; + case 75: + if (!base.slice_from("mca")) + { + return false; + } + break; + case 76: + if (!base.slice_from("nca")) + { + return false; + } + break; + case 77: + if (!base.slice_from("voljni")) + { + return false; + } + break; + case 78: + if (!base.slice_from("anki")) + { + return false; + } + break; + case 79: + if (!base.slice_from("vca")) + { + return false; + } + break; + case 80: + if (!base.slice_from("sca")) + { + return false; + } + break; + case 81: + if (!base.slice_from("rca")) + { + return false; + } + break; + case 82: + if (!base.slice_from("alca")) + { + return false; + } + break; + case 83: + if (!base.slice_from("elca")) + { + return false; + } + break; + case 84: + if (!base.slice_from("olca")) + { + return false; + } + break; + case 
85: + if (!base.slice_from("njca")) + { + return false; + } + break; + case 86: + if (!base.slice_from("ekta")) + { + return false; + } + break; + case 87: + if (!base.slice_from("izma")) + { + return false; + } + break; + case 88: + if (!base.slice_from("jebi")) + { + return false; + } + break; + case 89: + if (!base.slice_from("baci")) + { + return false; + } + break; + case 90: + if (!base.slice_from("a\u0161ni")) + { + return false; + } + break; + case 91: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("asni")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_2() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + switch (among_var) { + case 1: + if (!base.slice_from("sk")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u0161k")) + { + return false; + } + break; + case 3: + if (!base.slice_from("stv")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u0161tv")) + { + return false; + } + break; + case 5: + if (!base.slice_from("tanij")) + { + return false; + } + break; + case 6: + if (!base.slice_from("manij")) + { + return false; + } + break; + case 7: + if (!base.slice_from("panij")) + { + return false; + } + break; + case 8: + if (!base.slice_from("ranij")) + { + return false; + } + break; + case 9: + if (!base.slice_from("ganij")) + { + return false; + } + break; + case 10: + if (!base.slice_from("an")) + { + return false; + } + break; + case 11: + if (!base.slice_from("in")) + { + return false; + } + break; + case 12: + if (!base.slice_from("on")) + { + return false; + } + break; + case 13: + if (!base.slice_from("n")) + { + return false; + } + break; + case 14: + if (!base.slice_from("a\u0107")) + { + return false; + } + break; + case 15: + if (!base.slice_from("e\u0107")) + { + 
return false; + } + break; + case 16: + if (!base.slice_from("u\u0107")) + { + return false; + } + break; + case 17: + if (!base.slice_from("ugov")) + { + return false; + } + break; + case 18: + if (!base.slice_from("ug")) + { + return false; + } + break; + case 19: + if (!base.slice_from("log")) + { + return false; + } + break; + case 20: + if (!base.slice_from("g")) + { + return false; + } + break; + case 21: + if (!base.slice_from("rari")) + { + return false; + } + break; + case 22: + if (!base.slice_from("oti")) + { + return false; + } + break; + case 23: + if (!base.slice_from("si")) + { + return false; + } + break; + case 24: + if (!base.slice_from("li")) + { + return false; + } + break; + case 25: + if (!base.slice_from("uj")) + { + return false; + } + break; + case 26: + if (!base.slice_from("caj")) + { + return false; + } + break; + case 27: + if (!base.slice_from("\u010Daj")) + { + return false; + } + break; + case 28: + if (!base.slice_from("\u0107aj")) + { + return false; + } + break; + case 29: + if (!base.slice_from("\u0111aj")) + { + return false; + } + break; + case 30: + if (!base.slice_from("laj")) + { + return false; + } + break; + case 31: + if (!base.slice_from("raj")) + { + return false; + } + break; + case 32: + if (!base.slice_from("bij")) + { + return false; + } + break; + case 33: + if (!base.slice_from("cij")) + { + return false; + } + break; + case 34: + if (!base.slice_from("dij")) + { + return false; + } + break; + case 35: + if (!base.slice_from("lij")) + { + return false; + } + break; + case 36: + if (!base.slice_from("nij")) + { + return false; + } + break; + case 37: + if (!base.slice_from("mij")) + { + return false; + } + break; + case 38: + if (!base.slice_from("\u017Eij")) + { + return false; + } + break; + case 39: + if (!base.slice_from("gij")) + { + return false; + } + break; + case 40: + if (!base.slice_from("fij")) + { + return false; + } + break; + case 41: + if (!base.slice_from("pij")) + { + return false; + } + break; + 
case 42: + if (!base.slice_from("rij")) + { + return false; + } + break; + case 43: + if (!base.slice_from("sij")) + { + return false; + } + break; + case 44: + if (!base.slice_from("tij")) + { + return false; + } + break; + case 45: + if (!base.slice_from("zij")) + { + return false; + } + break; + case 46: + if (!base.slice_from("nal")) + { + return false; + } + break; + case 47: + if (!base.slice_from("ijal")) + { + return false; + } + break; + case 48: + if (!base.slice_from("ozil")) + { + return false; + } + break; + case 49: + if (!base.slice_from("olov")) + { + return false; + } + break; + case 50: + if (!base.slice_from("ol")) + { + return false; + } + break; + case 51: + if (!base.slice_from("lem")) + { + return false; + } + break; + case 52: + if (!base.slice_from("ram")) + { + return false; + } + break; + case 53: + if (!base.slice_from("ar")) + { + return false; + } + break; + case 54: + if (!base.slice_from("dr")) + { + return false; + } + break; + case 55: + if (!base.slice_from("er")) + { + return false; + } + break; + case 56: + if (!base.slice_from("or")) + { + return false; + } + break; + case 57: + if (!base.slice_from("es")) + { + return false; + } + break; + case 58: + if (!base.slice_from("is")) + { + return false; + } + break; + case 59: + if (!base.slice_from("ta\u0161")) + { + return false; + } + break; + case 60: + if (!base.slice_from("na\u0161")) + { + return false; + } + break; + case 61: + if (!base.slice_from("ja\u0161")) + { + return false; + } + break; + case 62: + if (!base.slice_from("ka\u0161")) + { + return false; + } + break; + case 63: + if (!base.slice_from("ba\u0161")) + { + return false; + } + break; + case 64: + if (!base.slice_from("ga\u0161")) + { + return false; + } + break; + case 65: + if (!base.slice_from("va\u0161")) + { + return false; + } + break; + case 66: + if (!base.slice_from("e\u0161")) + { + return false; + } + break; + case 67: + if (!base.slice_from("i\u0161")) + { + return false; + } + break; + case 68: + 
if (!base.slice_from("ikat")) + { + return false; + } + break; + case 69: + if (!base.slice_from("lat")) + { + return false; + } + break; + case 70: + if (!base.slice_from("et")) + { + return false; + } + break; + case 71: + if (!base.slice_from("est")) + { + return false; + } + break; + case 72: + if (!base.slice_from("ist")) + { + return false; + } + break; + case 73: + if (!base.slice_from("kst")) + { + return false; + } + break; + case 74: + if (!base.slice_from("ost")) + { + return false; + } + break; + case 75: + if (!base.slice_from("i\u0161t")) + { + return false; + } + break; + case 76: + if (!base.slice_from("ova")) + { + return false; + } + break; + case 77: + if (!base.slice_from("av")) + { + return false; + } + break; + case 78: + if (!base.slice_from("ev")) + { + return false; + } + break; + case 79: + if (!base.slice_from("iv")) + { + return false; + } + break; + case 80: + if (!base.slice_from("ov")) + { + return false; + } + break; + case 81: + if (!base.slice_from("mov")) + { + return false; + } + break; + case 82: + if (!base.slice_from("lov")) + { + return false; + } + break; + case 83: + if (!base.slice_from("el")) + { + return false; + } + break; + case 84: + if (!base.slice_from("anj")) + { + return false; + } + break; + case 85: + if (!base.slice_from("enj")) + { + return false; + } + break; + case 86: + if (!base.slice_from("\u0161nj")) + { + return false; + } + break; + case 87: + if (!base.slice_from("en")) + { + return false; + } + break; + case 88: + if (!base.slice_from("\u0161n")) + { + return false; + } + break; + case 89: + if (!base.slice_from("\u010Din")) + { + return false; + } + break; + case 90: + if (!base.slice_from("ro\u0161i")) + { + return false; + } + break; + case 91: + if (!base.slice_from("o\u0161")) + { + return false; + } + break; + case 92: + if (!base.slice_from("evit")) + { + return false; + } + break; + case 93: + if (!base.slice_from("ovit")) + { + return false; + } + break; + case 94: + if 
(!base.slice_from("ast")) + { + return false; + } + break; + case 95: + if (!base.slice_from("k")) + { + return false; + } + break; + case 96: + if (!base.slice_from("eva")) + { + return false; + } + break; + case 97: + if (!base.slice_from("ava")) + { + return false; + } + break; + case 98: + if (!base.slice_from("iva")) + { + return false; + } + break; + case 99: + if (!base.slice_from("uva")) + { + return false; + } + break; + case 100: + if (!base.slice_from("ir")) + { + return false; + } + break; + case 101: + if (!base.slice_from("a\u010D")) + { + return false; + } + break; + case 102: + if (!base.slice_from("a\u010Da")) + { + return false; + } + break; + case 103: + if (!base.slice_from("ni")) + { + return false; + } + break; + case 104: + if (!base.slice_from("a")) + { + return false; + } + break; + case 105: + if (!base.slice_from("ur")) + { + return false; + } + break; + case 106: + if (!base.slice_from("astaj")) + { + return false; + } + break; + case 107: + if (!base.slice_from("istaj")) + { + return false; + } + break; + case 108: + if (!base.slice_from("ostaj")) + { + return false; + } + break; + case 109: + if (!base.slice_from("aj")) + { + return false; + } + break; + case 110: + if (!base.slice_from("asta")) + { + return false; + } + break; + case 111: + if (!base.slice_from("ista")) + { + return false; + } + break; + case 112: + if (!base.slice_from("osta")) + { + return false; + } + break; + case 113: + if (!base.slice_from("ta")) + { + return false; + } + break; + case 114: + if (!base.slice_from("inj")) + { + return false; + } + break; + case 115: + if (!base.slice_from("as")) + { + return false; + } + break; + case 116: + if (!base.slice_from("i")) + { + return false; + } + break; + case 117: + if (!base.slice_from("lu\u010D")) + { + return false; + } + break; + case 118: + if (!base.slice_from("jeti")) + { + return false; + } + break; + case 119: + if (!base.slice_from("e")) + { + return false; + } + break; + case 120: + if 
(!base.slice_from("at")) + { + return false; + } + break; + case 121: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("luc")) + { + return false; + } + break; + case 122: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("snj")) + { + return false; + } + break; + case 123: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("os")) + { + return false; + } + break; + case 124: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ac")) + { + return false; + } + break; + case 125: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ec")) + { + return false; + } + break; + case 126: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("uc")) + { + return false; + } + break; + case 127: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("rosi")) + { + return false; + } + break; + case 128: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("aca")) + { + return false; + } + break; + case 129: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("jas")) + { + return false; + } + break; + case 130: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("tas")) + { + return false; + } + break; + case 131: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("gas")) + { + return false; + } + break; + case 132: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("nas")) + { + return false; + } + break; + case 133: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("kas")) + { + return false; + } + break; + case 134: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("vas")) + { + return false; + } + break; + case 135: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("bas")) + { + return false; + } + break; + case 136: + if (!B_no_diacritics) + { + return false; + } + if 
(!base.slice_from("as")) + { + return false; + } + break; + case 137: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("cin")) + { + return false; + } + break; + case 138: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("astaj")) + { + return false; + } + break; + case 139: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("istaj")) + { + return false; + } + break; + case 140: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ostaj")) + { + return false; + } + break; + case 141: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("asta")) + { + return false; + } + break; + case 142: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ista")) + { + return false; + } + break; + case 143: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("osta")) + { + return false; + } + break; + case 144: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ava")) + { + return false; + } + break; + case 145: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("eva")) + { + return false; + } + break; + case 146: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("iva")) + { + return false; + } + break; + case 147: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("uva")) + { + return false; + } + break; + case 148: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ova")) + { + return false; + } + break; + case 149: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("jeti")) + { + return false; + } + break; + case 150: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("inj")) + { + return false; + } + break; + case 151: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ist")) + { + return false; + } + break; + case 152: + if (!B_no_diacritics) + { + return false; + } + if 
(!base.slice_from("es")) + { + return false; + } + break; + case 153: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("et")) + { + return false; + } + break; + case 154: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("is")) + { + return false; + } + break; + case 155: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ir")) + { + return false; + } + break; + case 156: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ur")) + { + return false; + } + break; + case 157: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("uj")) + { + return false; + } + break; + case 158: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ni")) + { + return false; + } + break; + case 159: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("sn")) + { + return false; + } + break; + case 160: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("ta")) + { + return false; + } + break; + case 161: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("a")) + { + return false; + } + break; + case 162: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("i")) + { + return false; + } + break; + case 163: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("e")) + { + return false; + } + break; + case 164: + if (!B_no_diacritics) + { + return false; + } + if (!base.slice_from("n")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_Step_3() { + base.ket = base.cursor; + if (base.find_among_b(a_3) == 0) + { + return false; + } + base.bra = base.cursor; + if (!r_R1()) + { + return false; + } + if (!base.slice_from("")) + { + return false; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_cyr_to_lat(); + r_prelude(); + r_mark_regions(); + base.limit_backward = base.cursor; base.cursor = base.limit; 
+ var /** number */ v_4 = base.limit - base.cursor; + r_Step_1(); + base.cursor = base.limit - v_4; + var /** number */ v_5 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_6 = base.limit - base.cursor; + lab2: { + if (!r_Step_2()) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_6; + if (!r_Step_3()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_5; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['SerbianStemmer'] = SerbianStemmer; diff --git a/js/spanish-stemmer.js b/js/spanish-stemmer.js new file mode 100644 index 0000000..9137989 --- /dev/null +++ b/js/spanish-stemmer.js @@ -0,0 +1,976 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var SpanishStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["", -1, 6], + ["\u00E1", 0, 1], + ["\u00E9", 0, 2], + ["\u00ED", 0, 3], + ["\u00F3", 0, 4], + ["\u00FA", 0, 5] + ]; + + /** @const */ var a_1 = [ + ["la", -1, -1], + ["sela", 0, -1], + ["le", -1, -1], + ["me", -1, -1], + ["se", -1, -1], + ["lo", -1, -1], + ["selo", 5, -1], + ["las", -1, -1], + ["selas", 7, -1], + ["les", -1, -1], + ["los", -1, -1], + ["selos", 10, -1], + ["nos", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["ando", -1, 6], + ["iendo", -1, 6], + ["yendo", -1, 7], + ["\u00E1ndo", -1, 2], + ["i\u00E9ndo", -1, 1], + ["ar", -1, 6], + ["er", -1, 6], + ["ir", -1, 6], + ["\u00E1r", -1, 3], + ["\u00E9r", -1, 4], + ["\u00EDr", -1, 5] + ]; + + /** @const */ var a_3 = [ + ["ic", -1, -1], + ["ad", -1, -1], + ["os", -1, -1], + ["iv", -1, 1] + ]; + + /** @const */ var a_4 = [ + ["able", -1, 1], + ["ible", -1, 1], + ["ante", -1, 1] + ]; + + /** @const */ var a_5 = [ + ["ic", -1, 1], + ["abil", -1, 1], + ["iv", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["ica", -1, 1], + 
["ancia", -1, 2], + ["encia", -1, 5], + ["adora", -1, 2], + ["osa", -1, 1], + ["ista", -1, 1], + ["iva", -1, 9], + ["anza", -1, 1], + ["log\u00EDa", -1, 3], + ["idad", -1, 8], + ["able", -1, 1], + ["ible", -1, 1], + ["ante", -1, 2], + ["mente", -1, 7], + ["amente", 13, 6], + ["aci\u00F3n", -1, 2], + ["uci\u00F3n", -1, 4], + ["ico", -1, 1], + ["ismo", -1, 1], + ["oso", -1, 1], + ["amiento", -1, 1], + ["imiento", -1, 1], + ["ivo", -1, 9], + ["ador", -1, 2], + ["icas", -1, 1], + ["ancias", -1, 2], + ["encias", -1, 5], + ["adoras", -1, 2], + ["osas", -1, 1], + ["istas", -1, 1], + ["ivas", -1, 9], + ["anzas", -1, 1], + ["log\u00EDas", -1, 3], + ["idades", -1, 8], + ["ables", -1, 1], + ["ibles", -1, 1], + ["aciones", -1, 2], + ["uciones", -1, 4], + ["adores", -1, 2], + ["antes", -1, 2], + ["icos", -1, 1], + ["ismos", -1, 1], + ["osos", -1, 1], + ["amientos", -1, 1], + ["imientos", -1, 1], + ["ivos", -1, 9] + ]; + + /** @const */ var a_7 = [ + ["ya", -1, 1], + ["ye", -1, 1], + ["yan", -1, 1], + ["yen", -1, 1], + ["yeron", -1, 1], + ["yendo", -1, 1], + ["yo", -1, 1], + ["yas", -1, 1], + ["yes", -1, 1], + ["yais", -1, 1], + ["yamos", -1, 1], + ["y\u00F3", -1, 1] + ]; + + /** @const */ var a_8 = [ + ["aba", -1, 2], + ["ada", -1, 2], + ["ida", -1, 2], + ["ara", -1, 2], + ["iera", -1, 2], + ["\u00EDa", -1, 2], + ["ar\u00EDa", 5, 2], + ["er\u00EDa", 5, 2], + ["ir\u00EDa", 5, 2], + ["ad", -1, 2], + ["ed", -1, 2], + ["id", -1, 2], + ["ase", -1, 2], + ["iese", -1, 2], + ["aste", -1, 2], + ["iste", -1, 2], + ["an", -1, 2], + ["aban", 16, 2], + ["aran", 16, 2], + ["ieran", 16, 2], + ["\u00EDan", 16, 2], + ["ar\u00EDan", 20, 2], + ["er\u00EDan", 20, 2], + ["ir\u00EDan", 20, 2], + ["en", -1, 1], + ["asen", 24, 2], + ["iesen", 24, 2], + ["aron", -1, 2], + ["ieron", -1, 2], + ["ar\u00E1n", -1, 2], + ["er\u00E1n", -1, 2], + ["ir\u00E1n", -1, 2], + ["ado", -1, 2], + ["ido", -1, 2], + ["ando", -1, 2], + ["iendo", -1, 2], + ["ar", -1, 2], + ["er", -1, 2], + ["ir", -1, 2], + ["as", -1, 2], + 
["abas", 39, 2], + ["adas", 39, 2], + ["idas", 39, 2], + ["aras", 39, 2], + ["ieras", 39, 2], + ["\u00EDas", 39, 2], + ["ar\u00EDas", 45, 2], + ["er\u00EDas", 45, 2], + ["ir\u00EDas", 45, 2], + ["es", -1, 1], + ["ases", 49, 2], + ["ieses", 49, 2], + ["abais", -1, 2], + ["arais", -1, 2], + ["ierais", -1, 2], + ["\u00EDais", -1, 2], + ["ar\u00EDais", 55, 2], + ["er\u00EDais", 55, 2], + ["ir\u00EDais", 55, 2], + ["aseis", -1, 2], + ["ieseis", -1, 2], + ["asteis", -1, 2], + ["isteis", -1, 2], + ["\u00E1is", -1, 2], + ["\u00E9is", -1, 1], + ["ar\u00E9is", 64, 2], + ["er\u00E9is", 64, 2], + ["ir\u00E9is", 64, 2], + ["ados", -1, 2], + ["idos", -1, 2], + ["amos", -1, 2], + ["\u00E1bamos", 70, 2], + ["\u00E1ramos", 70, 2], + ["i\u00E9ramos", 70, 2], + ["\u00EDamos", 70, 2], + ["ar\u00EDamos", 74, 2], + ["er\u00EDamos", 74, 2], + ["ir\u00EDamos", 74, 2], + ["emos", -1, 1], + ["aremos", 78, 2], + ["eremos", 78, 2], + ["iremos", 78, 2], + ["\u00E1semos", 78, 2], + ["i\u00E9semos", 78, 2], + ["imos", -1, 2], + ["ar\u00E1s", -1, 2], + ["er\u00E1s", -1, 2], + ["ir\u00E1s", -1, 2], + ["\u00EDs", -1, 2], + ["ar\u00E1", -1, 2], + ["er\u00E1", -1, 2], + ["ir\u00E1", -1, 2], + ["ar\u00E9", -1, 2], + ["er\u00E9", -1, 2], + ["ir\u00E9", -1, 2], + ["i\u00F3", -1, 2] + ]; + + /** @const */ var a_9 = [ + ["a", -1, 1], + ["e", -1, 2], + ["o", -1, 1], + ["os", -1, 1], + ["\u00E1", -1, 1], + ["\u00E9", -1, 2], + ["\u00ED", -1, 1], + ["\u00F3", -1, 1] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 10]; + + var /** number */ I_p2 = 0; + var /** number */ I_p1 = 0; + var /** number */ I_pV = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_pV = base.limit; + I_p1 = base.limit; + I_p2 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + lab1: { + var /** number */ v_2 = base.cursor; + lab2: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab2; + } + lab3: { + var /** number */ v_3 = base.cursor; + 
lab4: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab4; + } + golab5: while(true) + { + lab6: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab6; + } + break golab5; + } + if (base.cursor >= base.limit) + { + break lab4; + } + base.cursor++; + } + break lab3; + } + base.cursor = v_3; + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab2; + } + golab7: while(true) + { + lab8: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab8; + } + break golab7; + } + if (base.cursor >= base.limit) + { + break lab2; + } + base.cursor++; + } + } + break lab1; + } + base.cursor = v_2; + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab0; + } + lab9: { + var /** number */ v_6 = base.cursor; + lab10: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab10; + } + golab11: while(true) + { + lab12: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab12; + } + break golab11; + } + if (base.cursor >= base.limit) + { + break lab10; + } + base.cursor++; + } + break lab9; + } + base.cursor = v_6; + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab0; + } + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + } + } + I_pV = base.cursor; + } + base.cursor = v_1; + var /** number */ v_8 = base.cursor; + lab13: { + golab14: while(true) + { + lab15: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab15; + } + break golab14; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab16: while(true) + { + lab17: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab17; + } + break golab16; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p1 = base.cursor; + golab18: while(true) + { + lab19: { + if (!(base.in_grouping(g_v, 97, 252))) + { + break lab19; + } + break golab18; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + golab20: while(true) + { + lab21: { + if (!(base.out_grouping(g_v, 97, 252))) + { + break lab21; + 
} + break golab20; + } + if (base.cursor >= base.limit) + { + break lab13; + } + base.cursor++; + } + I_p2 = base.cursor; + } + base.cursor = v_8; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + var /** number */ among_var; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("a")) + { + return false; + } + break; + case 2: + if (!base.slice_from("e")) + { + return false; + } + break; + case 3: + if (!base.slice_from("i")) + { + return false; + } + break; + case 4: + if (!base.slice_from("o")) + { + return false; + } + break; + case 5: + if (!base.slice_from("u")) + { + return false; + } + break; + case 6: + if (base.cursor >= base.limit) + { + break lab0; + } + base.cursor++; + break; + } + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_RV() { + return I_pV <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R2() { + return I_p2 <= base.cursor; + }; + + /** @return {boolean} */ + function r_attached_pronoun() { + var /** number */ among_var; + base.ket = base.cursor; + if (base.find_among_b(a_1) == 0) + { + return false; + } + base.bra = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + return false; + } + if (!r_RV()) + { + return false; + } + switch (among_var) { + case 1: + base.bra = base.cursor; + if (!base.slice_from("iendo")) + { + return false; + } + break; + case 2: + base.bra = base.cursor; + if (!base.slice_from("ando")) + { + return false; + } + break; + case 3: + base.bra = base.cursor; + if (!base.slice_from("ar")) + { + return false; + } + break; + case 4: + base.bra = base.cursor; + if (!base.slice_from("er")) + { + return false; + } + break; + case 5: + base.bra = base.cursor; + 
if (!base.slice_from("ir")) + { + return false; + } + break; + case 6: + if (!base.slice_del()) + { + return false; + } + break; + case 7: + if (!(base.eq_s_b("u"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("ic"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 3: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("log")) + { + return false; + } + break; + case 4: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("u")) + { + return false; + } + break; + case 5: + if (!r_R2()) + { + return false; + } + if (!base.slice_from("ente")) + { + return false; + } + break; + case 6: + if (!r_R1()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + switch (among_var) { + case 1: + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_2; + break lab1; + } + base.bra 
= base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_2; + break lab1; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + break; + case 7: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + if (base.find_among_b(a_4) == 0) + { + base.cursor = base.limit - v_3; + break lab2; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_3; + break lab2; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 8: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + lab3: { + base.ket = base.cursor; + if (base.find_among_b(a_5) == 0) + { + base.cursor = base.limit - v_4; + break lab3; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_4; + break lab3; + } + if (!base.slice_del()) + { + return false; + } + } + break; + case 9: + if (!r_R2()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_5 = base.limit - base.cursor; + lab4: { + base.ket = base.cursor; + if (!(base.eq_s_b("at"))) + { + base.cursor = base.limit - v_5; + break lab4; + } + base.bra = base.cursor; + if (!r_R2()) + { + base.cursor = base.limit - v_5; + break lab4; + } + if (!base.slice_del()) + { + return false; + } + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_y_verb_suffix() { + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + if (base.find_among_b(a_7) == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + if (!(base.eq_s_b("u"))) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + 
function r_verb_suffix() { + var /** number */ among_var; + if (base.cursor < I_pV) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_pV; + base.ket = base.cursor; + among_var = base.find_among_b(a_8); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("u"))) + { + base.cursor = base.limit - v_3; + break lab0; + } + var /** number */ v_4 = base.limit - base.cursor; + if (!(base.eq_s_b("g"))) + { + base.cursor = base.limit - v_3; + break lab0; + } + base.cursor = base.limit - v_4; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_residual_suffix() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_9); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_RV()) + { + return false; + } + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (!(base.eq_s_b("u"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.bra = base.cursor; + var /** number */ v_2 = base.limit - base.cursor; + if (!(base.eq_s_b("g"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + base.cursor = base.limit - v_2; + if (!r_RV()) + { + base.cursor = base.limit - v_1; + break lab0; + } + if (!base.slice_del()) + { + return false; + } + } + break; + } + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_mark_regions(); + base.limit_backward = base.cursor; 
base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_attached_pronoun(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + lab0: { + lab1: { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + if (!r_standard_suffix()) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_4; + lab3: { + if (!r_y_verb_suffix()) + { + break lab3; + } + break lab1; + } + base.cursor = base.limit - v_4; + if (!r_verb_suffix()) + { + break lab0; + } + } + } + base.cursor = base.limit - v_3; + var /** number */ v_5 = base.limit - base.cursor; + r_residual_suffix(); + base.cursor = base.limit - v_5; + base.cursor = base.limit_backward; + var /** number */ v_6 = base.cursor; + r_postlude(); + base.cursor = v_6; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['SpanishStemmer'] = SpanishStemmer; diff --git a/js/stemmers.js b/js/stemmers.js new file mode 100644 index 0000000..7c10d0d --- /dev/null +++ b/js/stemmers.js @@ -0,0 +1,528 @@ +var f=!0,q=!1; +window.ArabicStemmer=function(){function g(){var a,b=k.cursor;for(;;){var c=k.cursor;b:{c:{var e=k.cursor;k.c=k.cursor;a=k.o(v);if(0!=a){k.d=k.cursor;switch(a){case 1:if(!k.e())return;break;case 2:if(!k.b("0"))return;break;case 3:if(!k.b("1"))return;break;case 4:if(!k.b("2"))return;break;case 5:if(!k.b("3"))return;break;case 6:if(!k.b("4"))return;break;case 7:if(!k.b("5"))return;break;case 8:if(!k.b("6"))return;break;case 9:if(!k.b("7"))return;break;case 10:if(!k.b("8"))return;break;case 11:if(!k.b("9"))return; +break;case 12:if(!k.b("\u0621"))return;break;case 13:if(!k.b("\u0623"))return;break;case 14:if(!k.b("\u0625"))return;break;case 15:if(!k.b("\u0626"))return;break;case 16:if(!k.b("\u0622"))return;break;case 17:if(!k.b("\u0624"))return;break;case 18:if(!k.b("\u0627"))return;break;case 
19:if(!k.b("\u0628"))return;break;case 20:if(!k.b("\u0629"))return;break;case 21:if(!k.b("\u062a"))return;break;case 22:if(!k.b("\u062b"))return;break;case 23:if(!k.b("\u062c"))return;break;case 24:if(!k.b("\u062d"))return; +break;case 25:if(!k.b("\u062e"))return;break;case 26:if(!k.b("\u062f"))return;break;case 27:if(!k.b("\u0630"))return;break;case 28:if(!k.b("\u0631"))return;break;case 29:if(!k.b("\u0632"))return;break;case 30:if(!k.b("\u0633"))return;break;case 31:if(!k.b("\u0634"))return;break;case 32:if(!k.b("\u0635"))return;break;case 33:if(!k.b("\u0636"))return;break;case 34:if(!k.b("\u0637"))return;break;case 35:if(!k.b("\u0638"))return;break;case 36:if(!k.b("\u0639"))return;break;case 37:if(!k.b("\u063a"))return; +break;case 38:if(!k.b("\u0641"))return;break;case 39:if(!k.b("\u0642"))return;break;case 40:if(!k.b("\u0643"))return;break;case 41:if(!k.b("\u0644"))return;break;case 42:if(!k.b("\u0645"))return;break;case 43:if(!k.b("\u0646"))return;break;case 44:if(!k.b("\u0647"))return;break;case 45:if(!k.b("\u0648"))return;break;case 46:if(!k.b("\u0649"))return;break;case 47:if(!k.b("\u064a"))return;break;case 48:if(!k.b("\u0644\u0627"))return;break;case 49:if(!k.b("\u0644\u0623"))return;break;case 50:if(!k.b("\u0644\u0625"))return; +break;case 51:if(!k.b("\u0644\u0622"))return}break c}k.cursor=e;if(k.cursor>=k.a)break b;k.cursor++}continue}k.cursor=c;break}k.cursor=b}function d(){var a;a=k.cursor;k.f=k.cursor;k.cursor=k.a;k.d=k.cursor;if(0!=k.g(w)){k.c=k.cursor;if(!k.b("\u0621"))return;k.cursor=k.f}k.cursor=a;var b=k.cursor;for(;;){var c=k.cursor;b:{c:{var e=k.cursor;k.c=k.cursor;a=k.o(y);if(0!=a){k.d=k.cursor;switch(a){case 1:if(!k.b("\u0627"))return;break;case 2:if(!k.b("\u0648"))return;break;case 3:if(!k.b("\u064a"))return}break c}k.cursor= +e;if(k.cursor>=k.a)break b;k.cursor++}continue}k.cursor=c;break}k.cursor=b}function m(){var a;k.c=k.cursor;a=k.o(c);if(0==a)return q;k.d=k.cursor;switch(a){case 1:if(3>=k.j.length||!k.b("\u0623"))return q;break;case 
2:if(3>=k.j.length||!k.b("\u0622"))return q;break;case 3:if(3>=k.j.length||!k.b("\u0627"))return q;break;case 4:if(3>=k.j.length||!k.b("\u0625"))return q}return f}function a(){k.c=k.cursor;if(0==k.o(A))return q;k.d=k.cursor;if(3>=k.j.length)return q;var a=k.cursor;if(k.n("\u0627"))return q; +k.cursor=a;return!k.e()?q:f}function b(){var a;k.c=k.cursor;a=k.o(D);if(0==a)return q;k.d=k.cursor;switch(a){case 1:if(5>=k.j.length||!k.e())return q;break;case 2:if(4>=k.j.length||!k.e())return q}return f}function l(){var a;k.c=k.cursor;a=k.o(x);if(0==a)return q;k.d=k.cursor;switch(a){case 1:if(3>=k.j.length||!k.e())return q;break;case 2:if(3>=k.j.length||!k.b("\u0628"))return q;break;case 3:if(3>=k.j.length||!k.b("\u0643"))return q}return f}function n(){var a;k.c=k.cursor;a=k.o(G);if(0==a)return q; +k.d=k.cursor;switch(a){case 1:if(4>=k.j.length||!k.b("\u064a"))return q;break;case 2:if(4>=k.j.length||!k.b("\u062a"))return q;break;case 3:if(4>=k.j.length||!k.b("\u0646"))return q;break;case 4:if(4>=k.j.length||!k.b("\u0623"))return q}return f}function e(){var a;k.d=k.cursor;a=k.g(J);if(0==a)return q;k.c=k.cursor;switch(a){case 1:if(4>k.j.length||!k.e())return q;break;case 2:if(5>k.j.length||!k.e())return q;break;case 3:if(6>k.j.length||!k.e())return q}return f}function h(){k.d=k.cursor;if(0==k.g(N))return q; +k.c=k.cursor;return 4>=k.j.length||!k.e()?q:f}function u(){k.d=k.cursor;if(0==k.g(P))return q;k.c=k.cursor;return 5>k.j.length||!k.e()?q:f}function t(){k.d=k.cursor;if(0==k.g(Q))return q;k.c=k.cursor;return 4>k.j.length||!k.e()?q:f}function s(){var a;k.d=k.cursor;a=k.g(R);if(0==a)return q;k.c=k.cursor;switch(a){case 1:if(4>k.j.length||!k.e())return q;break;case 2:if(5>k.j.length||!k.e())return q;break;case 3:if(6>k.j.length||!k.e())return q}return f}function r(){var a;k.d=k.cursor;a=k.g(S);if(0== +a)return q;k.c=k.cursor;switch(a){case 1:if(4>k.j.length||!k.e())return q;break;case 2:if(5>k.j.length||!k.e())return q;break;case 3:if(5>=k.j.length||!k.e())return q;break;case 
4:if(6>k.j.length||!k.e())return q}return f}function p(){var a;k.d=k.cursor;a=k.g(M);if(0==a)return q;k.c=k.cursor;switch(a){case 1:if(4>k.j.length||!k.e())return q;break;case 2:if(6>k.j.length||!k.e())return q}return f}var k=new C,v=[["\u0640",-1,1],["\u064b",-1,1],["\u064c",-1,1],["\u064d",-1,1],["\u064e",-1,1],["\u064f", +-1,1],["\u0650",-1,1],["\u0651",-1,1],["\u0652",-1,1],["\u0660",-1,2],["\u0661",-1,3],["\u0662",-1,4],["\u0663",-1,5],["\u0664",-1,6],["\u0665",-1,7],["\u0666",-1,8],["\u0667",-1,9],["\u0668",-1,10],["\u0669",-1,11],["\ufe80",-1,12],["\ufe81",-1,16],["\ufe82",-1,16],["\ufe83",-1,13],["\ufe84",-1,13],["\ufe85",-1,17],["\ufe86",-1,17],["\ufe87",-1,14],["\ufe88",-1,14],["\ufe89",-1,15],["\ufe8a",-1,15],["\ufe8b",-1,15],["\ufe8c",-1,15],["\ufe8d",-1,18],["\ufe8e",-1,18],["\ufe8f",-1,19],["\ufe90",-1, +19],["\ufe91",-1,19],["\ufe92",-1,19],["\ufe93",-1,20],["\ufe94",-1,20],["\ufe95",-1,21],["\ufe96",-1,21],["\ufe97",-1,21],["\ufe98",-1,21],["\ufe99",-1,22],["\ufe9a",-1,22],["\ufe9b",-1,22],["\ufe9c",-1,22],["\ufe9d",-1,23],["\ufe9e",-1,23],["\ufe9f",-1,23],["\ufea0",-1,23],["\ufea1",-1,24],["\ufea2",-1,24],["\ufea3",-1,24],["\ufea4",-1,24],["\ufea5",-1,25],["\ufea6",-1,25],["\ufea7",-1,25],["\ufea8",-1,25],["\ufea9",-1,26],["\ufeaa",-1,26],["\ufeab",-1,27],["\ufeac",-1,27],["\ufead",-1,28],["\ufeae", +-1,28],["\ufeaf",-1,29],["\ufeb0",-1,29],["\ufeb1",-1,30],["\ufeb2",-1,30],["\ufeb3",-1,30],["\ufeb4",-1,30],["\ufeb5",-1,31],["\ufeb6",-1,31],["\ufeb7",-1,31],["\ufeb8",-1,31],["\ufeb9",-1,32],["\ufeba",-1,32],["\ufebb",-1,32],["\ufebc",-1,32],["\ufebd",-1,33],["\ufebe",-1,33],["\ufebf",-1,33],["\ufec0",-1,33],["\ufec1",-1,34],["\ufec2",-1,34],["\ufec3",-1,34],["\ufec4",-1,34],["\ufec5",-1,35],["\ufec6",-1,35],["\ufec7",-1,35],["\ufec8",-1,35],["\ufec9",-1,36],["\ufeca",-1,36],["\ufecb",-1,36],["\ufecc", 
+-1,36],["\ufecd",-1,37],["\ufece",-1,37],["\ufecf",-1,37],["\ufed0",-1,37],["\ufed1",-1,38],["\ufed2",-1,38],["\ufed3",-1,38],["\ufed4",-1,38],["\ufed5",-1,39],["\ufed6",-1,39],["\ufed7",-1,39],["\ufed8",-1,39],["\ufed9",-1,40],["\ufeda",-1,40],["\ufedb",-1,40],["\ufedc",-1,40],["\ufedd",-1,41],["\ufede",-1,41],["\ufedf",-1,41],["\ufee0",-1,41],["\ufee1",-1,42],["\ufee2",-1,42],["\ufee3",-1,42],["\ufee4",-1,42],["\ufee5",-1,43],["\ufee6",-1,43],["\ufee7",-1,43],["\ufee8",-1,43],["\ufee9",-1,44],["\ufeea", +-1,44],["\ufeeb",-1,44],["\ufeec",-1,44],["\ufeed",-1,45],["\ufeee",-1,45],["\ufeef",-1,46],["\ufef0",-1,46],["\ufef1",-1,47],["\ufef2",-1,47],["\ufef3",-1,47],["\ufef4",-1,47],["\ufef5",-1,51],["\ufef6",-1,51],["\ufef7",-1,49],["\ufef8",-1,49],["\ufef9",-1,50],["\ufefa",-1,50],["\ufefb",-1,48],["\ufefc",-1,48]],w=[["\u0622",-1,1],["\u0623",-1,1],["\u0624",-1,1],["\u0625",-1,1],["\u0626",-1,1]],y=[["\u0622",-1,1],["\u0623",-1,1],["\u0624",-1,2],["\u0625",-1,1],["\u0626",-1,3]],z=[["\u0627\u0644", +-1,2],["\u0628\u0627\u0644",-1,1],["\u0643\u0627\u0644",-1,1],["\u0644\u0644",-1,2]],c=[["\u0623\u0622",-1,2],["\u0623\u0623",-1,1],["\u0623\u0624",-1,1],["\u0623\u0625",-1,4],["\u0623\u0627",-1,3]],A=[["\u0641",-1,1],["\u0648",-1,1]],D=[["\u0627\u0644",-1,2],["\u0628\u0627\u0644",-1,1],["\u0643\u0627\u0644",-1,1],["\u0644\u0644",-1,2]],x=[["\u0628",-1,1],["\u0628\u0627",0,-1],["\u0628\u0628",0,2],["\u0643\u0643",-1,3]],G=[["\u0633\u0623",-1,4],["\u0633\u062a",-1,2],["\u0633\u0646",-1,3],["\u0633\u064a", 
+-1,1]],E=[["\u062a\u0633\u062a",-1,1],["\u0646\u0633\u062a",-1,1],["\u064a\u0633\u062a",-1,1]],J=[["\u0643\u0645\u0627",-1,3],["\u0647\u0645\u0627",-1,3],["\u0646\u0627",-1,2],["\u0647\u0627",-1,2],["\u0643",-1,1],["\u0643\u0645",-1,2],["\u0647\u0645",-1,2],["\u0647\u0646",-1,2],["\u0647",-1,1],["\u064a",-1,1]],O=[["\u0646",-1,1]],N=[["\u0627",-1,1],["\u0648",-1,1],["\u064a",-1,1]],P=[["\u0627\u062a",-1,1]],Q=[["\u062a",-1,1]],T=[["\u0629",-1,1]],U=[["\u064a",-1,1]],R=[["\u0643\u0645\u0627",-1,3], +["\u0647\u0645\u0627",-1,3],["\u0646\u0627",-1,2],["\u0647\u0627",-1,2],["\u0643",-1,1],["\u0643\u0645",-1,2],["\u0647\u0645",-1,2],["\u0643\u0646",-1,2],["\u0647\u0646",-1,2],["\u0647",-1,1],["\u0643\u0645\u0648",-1,3],["\u0646\u064a",-1,2]],S=[["\u0627",-1,1],["\u062a\u0627",0,2],["\u062a\u0645\u0627",0,4],["\u0646\u0627",0,2],["\u062a",-1,1],["\u0646",-1,1],["\u0627\u0646",5,3],["\u062a\u0646",5,2],["\u0648\u0646",5,3],["\u064a\u0646",5,3],["\u064a",-1,1]],V=[["\u0648\u0627",-1,1],["\u062a\u0645", +-1,1]],M=[["\u0648",-1,1],["\u062a\u0645\u0648",0,2]],K=[["\u0649",-1,1]],H=q,I=q,L=q;this.m=function(){I=L=f;H=q;var c=k.cursor,y;k.c=k.cursor;y=k.o(z);if(0!=y)switch(k.d=k.cursor,y){case 1:if(4>=k.j.length)break;L=f;I=q;H=f;break;case 2:if(3>=k.j.length)break;L=f;I=q;H=f}k.cursor=c;g();k.f=k.cursor;k.cursor=k.a;c=k.a-k.cursor;b:{y=k.a-k.cursor;c:if(I){d:{var x=k.a-k.cursor;e:{for(var w=1;;){var A=k.a-k.cursor;if(s()){w--;continue}k.cursor=k.a-A;break}if(!(0k.j.length||!k.e()?q:f);if(w)break d;k.cursor=k.a-x;if(!r())break c}break b}k.cursor=k.a-y;if(L){x=k.a-k.cursor;e:{w=k.a-k.cursor;k.d=k.cursor;0==k.g(T)?A=q:(k.c=k.cursor,A=4>k.j.length||!k.e()?q:f);if(A)break e;k.cursor=k.a-w;f:{if(H)break f;if(e()){g:{A=k.a-k.cursor;if(h())break g;k.cursor=k.a-A;if(u())break g;k.cursor=k.a-A;if(t())break g;k.cursor= +k.a-A;if(k.cursor<=k.f)break f;k.cursor--}break 
e}}k.cursor=k.a-w;f:if(k.d=k.cursor,0==k.g(O)?A=q:(k.c=k.cursor,A=5>=k.j.length||!k.e()?q:f),A){g:{A=k.a-k.cursor;if(h())break g;k.cursor=k.a-A;if(u())break g;k.cursor=k.a-A;if(!t())break f}break e}k.cursor=k.a-w;f:{if(H)break f;if(h())break e}k.cursor=k.a-w;u()||(k.cursor=k.a-x)}k.d=k.cursor;0==k.g(U)?x=q:(k.c=k.cursor,x=3>k.j.length||!k.e()?q:f);if(x)break b}k.cursor=k.a-y;k.d=k.cursor;0!=k.g(K)&&(k.c=k.cursor,k.b("\u064a"))}k.cursor=k.a-c;k.cursor= +k.f;c=k.cursor;y=k.cursor;m()||(k.cursor=y);y=k.cursor;a()||(k.cursor=y);b:{y=k.cursor;if(b())break b;k.cursor=y;if(L&&l())break b;k.cursor=y;I&&(y=k.cursor,n()||(k.cursor=y),k.c=k.cursor,0!=k.o(E)&&(k.d=k.cursor,4>=k.j.length||(I=f,L=q,k.b("\u0627\u0633\u062a"))))}k.cursor=c;d();return f};this.stemWord=function(a){k.p(a);this.m();return k.j}};window.ArmenianStemmer=function(){var g=new C,d=[["\u0580\u0578\u0580\u0564",-1,1],["\u0565\u0580\u0578\u0580\u0564",0,1],["\u0561\u056c\u056b",-1,1],["\u0561\u056f\u056b",-1,1],["\u0578\u0580\u0561\u056f",-1,1],["\u0565\u0572",-1,1],["\u0561\u056f\u0561\u0576",-1,1],["\u0561\u0580\u0561\u0576",-1,1],["\u0565\u0576",-1,1],["\u0565\u056f\u0565\u0576",8,1],["\u0565\u0580\u0565\u0576",8,1],["\u0578\u0580\u0567\u0576",-1,1],["\u056b\u0576",-1,1],["\u0563\u056b\u0576",12,1],["\u0578\u057e\u056b\u0576", +12,1],["\u056c\u0561\u0575\u0576",-1,1],["\u057e\u0578\u0582\u0576",-1,1],["\u057a\u0565\u057d",-1,1],["\u056b\u057e",-1,1],["\u0561\u057f",-1,1],["\u0561\u057e\u0565\u057f",-1,1],["\u056f\u0578\u057f",-1,1],["\u0562\u0561\u0580",-1,1]],m=[["\u0561",-1,1],["\u0561\u0581\u0561",0,1],["\u0565\u0581\u0561",0,1],["\u057e\u0565",-1,1],["\u0561\u0581\u0580\u056b",-1,1],["\u0561\u0581\u056b",-1,1],["\u0565\u0581\u056b",-1,1],["\u057e\u0565\u0581\u056b",6,1],["\u0561\u056c",-1,1],["\u0568\u0561\u056c",8, 
+1],["\u0561\u0576\u0561\u056c",8,1],["\u0565\u0576\u0561\u056c",8,1],["\u0561\u0581\u0576\u0561\u056c",8,1],["\u0565\u056c",-1,1],["\u0568\u0565\u056c",13,1],["\u0576\u0565\u056c",13,1],["\u0581\u0576\u0565\u056c",15,1],["\u0565\u0581\u0576\u0565\u056c",16,1],["\u0579\u0565\u056c",13,1],["\u057e\u0565\u056c",13,1],["\u0561\u0581\u057e\u0565\u056c",19,1],["\u0565\u0581\u057e\u0565\u056c",19,1],["\u057f\u0565\u056c",13,1],["\u0561\u057f\u0565\u056c",22,1],["\u0578\u057f\u0565\u056c",22,1],["\u056f\u0578\u057f\u0565\u056c", +24,1],["\u057e\u0561\u056e",-1,1],["\u0578\u0582\u0574",-1,1],["\u057e\u0578\u0582\u0574",27,1],["\u0561\u0576",-1,1],["\u0581\u0561\u0576",29,1],["\u0561\u0581\u0561\u0576",30,1],["\u0561\u0581\u0580\u056b\u0576",-1,1],["\u0561\u0581\u056b\u0576",-1,1],["\u0565\u0581\u056b\u0576",-1,1],["\u057e\u0565\u0581\u056b\u0576",34,1],["\u0561\u056c\u056b\u057d",-1,1],["\u0565\u056c\u056b\u057d",-1,1],["\u0561\u057e",-1,1],["\u0561\u0581\u0561\u057e",38,1],["\u0565\u0581\u0561\u057e",38,1],["\u0561\u056c\u0578\u057e", +-1,1],["\u0565\u056c\u0578\u057e",-1,1],["\u0561\u0580",-1,1],["\u0561\u0581\u0561\u0580",43,1],["\u0565\u0581\u0561\u0580",43,1],["\u0561\u0581\u0580\u056b\u0580",-1,1],["\u0561\u0581\u056b\u0580",-1,1],["\u0565\u0581\u056b\u0580",-1,1],["\u057e\u0565\u0581\u056b\u0580",48,1],["\u0561\u0581",-1,1],["\u0565\u0581",-1,1],["\u0561\u0581\u0580\u0565\u0581",51,1],["\u0561\u056c\u0578\u0582\u0581",-1,1],["\u0565\u056c\u0578\u0582\u0581",-1,1],["\u0561\u056c\u0578\u0582",-1,1],["\u0565\u056c\u0578\u0582", 
+-1,1],["\u0561\u0584",-1,1],["\u0581\u0561\u0584",57,1],["\u0561\u0581\u0561\u0584",58,1],["\u0561\u0581\u0580\u056b\u0584",-1,1],["\u0561\u0581\u056b\u0584",-1,1],["\u0565\u0581\u056b\u0584",-1,1],["\u057e\u0565\u0581\u056b\u0584",62,1],["\u0561\u0576\u0584",-1,1],["\u0581\u0561\u0576\u0584",64,1],["\u0561\u0581\u0561\u0576\u0584",65,1],["\u0561\u0581\u0580\u056b\u0576\u0584",-1,1],["\u0561\u0581\u056b\u0576\u0584",-1,1],["\u0565\u0581\u056b\u0576\u0584",-1,1],["\u057e\u0565\u0581\u056b\u0576\u0584", +69,1]],a=[["\u0578\u0580\u0564",-1,1],["\u0578\u0582\u0575\u0569",-1,1],["\u0578\u0582\u0570\u056b",-1,1],["\u0581\u056b",-1,1],["\u056b\u056c",-1,1],["\u0561\u056f",-1,1],["\u0575\u0561\u056f",5,1],["\u0561\u0576\u0561\u056f",5,1],["\u056b\u056f",-1,1],["\u0578\u0582\u056f",-1,1],["\u0561\u0576",-1,1],["\u057a\u0561\u0576",10,1],["\u057d\u057f\u0561\u0576",10,1],["\u0561\u0580\u0561\u0576",10,1],["\u0565\u0572\u0567\u0576",-1,1],["\u0575\u0578\u0582\u0576",-1,1],["\u0578\u0582\u0569\u0575\u0578\u0582\u0576", +15,1],["\u0561\u056e\u0578",-1,1],["\u056b\u0579",-1,1],["\u0578\u0582\u057d",-1,1],["\u0578\u0582\u057d\u057f",-1,1],["\u0563\u0561\u0580",-1,1],["\u057e\u0578\u0580",-1,1],["\u0561\u057e\u0578\u0580",22,1],["\u0578\u0581",-1,1],["\u0561\u0576\u0585\u0581",-1,1],["\u0578\u0582",-1,1],["\u0584",-1,1],["\u0579\u0565\u0584",27,1],["\u056b\u0584",27,1],["\u0561\u056c\u056b\u0584",29,1],["\u0561\u0576\u056b\u0584",29,1],["\u057e\u0561\u056e\u0584",27,1],["\u0578\u0582\u0575\u0584",27,1],["\u0565\u0576\u0584", 
+27,1],["\u0578\u0576\u0584",27,1],["\u0578\u0582\u0576\u0584",27,1],["\u0574\u0578\u0582\u0576\u0584",36,1],["\u056b\u0579\u0584",27,1],["\u0561\u0580\u0584",27,1]],b=[["\u057d\u0561",-1,1],["\u057e\u0561",-1,1],["\u0561\u0574\u0562",-1,1],["\u0564",-1,1],["\u0561\u0576\u0564",3,1],["\u0578\u0582\u0569\u0575\u0561\u0576\u0564",4,1],["\u057e\u0561\u0576\u0564",4,1],["\u0578\u057b\u0564",3,1],["\u0565\u0580\u0564",3,1],["\u0576\u0565\u0580\u0564",8,1],["\u0578\u0582\u0564",3,1],["\u0568",-1,1],["\u0561\u0576\u0568", +11,1],["\u0578\u0582\u0569\u0575\u0561\u0576\u0568",12,1],["\u057e\u0561\u0576\u0568",12,1],["\u0578\u057b\u0568",11,1],["\u0565\u0580\u0568",11,1],["\u0576\u0565\u0580\u0568",16,1],["\u056b",-1,1],["\u057e\u056b",18,1],["\u0565\u0580\u056b",18,1],["\u0576\u0565\u0580\u056b",20,1],["\u0561\u0576\u0578\u0582\u0574",-1,1],["\u0565\u0580\u0578\u0582\u0574",-1,1],["\u0576\u0565\u0580\u0578\u0582\u0574",23,1],["\u0576",-1,1],["\u0561\u0576",25,1],["\u0578\u0582\u0569\u0575\u0561\u0576",26,1],["\u057e\u0561\u0576", +26,1],["\u056b\u0576",25,1],["\u0565\u0580\u056b\u0576",29,1],["\u0576\u0565\u0580\u056b\u0576",30,1],["\u0578\u0582\u0569\u0575\u0561\u0576\u0576",25,1],["\u0565\u0580\u0576",25,1],["\u0576\u0565\u0580\u0576",33,1],["\u0578\u0582\u0576",25,1],["\u0578\u057b",-1,1],["\u0578\u0582\u0569\u0575\u0561\u0576\u057d",-1,1],["\u057e\u0561\u0576\u057d",-1,1],["\u0578\u057b\u057d",-1,1],["\u0578\u057e",-1,1],["\u0561\u0576\u0578\u057e",40,1],["\u057e\u0578\u057e",40,1],["\u0565\u0580\u0578\u057e",40,1],["\u0576\u0565\u0580\u0578\u057e", +43,1],["\u0565\u0580",-1,1],["\u0576\u0565\u0580",45,1],["\u0581",-1,1],["\u056b\u0581",47,1],["\u057e\u0561\u0576\u056b\u0581",48,1],["\u0578\u057b\u056b\u0581",48,1],["\u057e\u056b\u0581",48,1],["\u0565\u0580\u056b\u0581",48,1],["\u0576\u0565\u0580\u056b\u0581",52,1],["\u0581\u056b\u0581",48,1],["\u0578\u0581",47,1],["\u0578\u0582\u0581",47,1]],l=[209,4,128,0,18],n=0,e=0;this.m=function(){n=e=g.a;var 
h=g.cursor;a:{b:for(;;){if(g.i(l,1377,1413))break b;if(g.cursor>=g.a)break a;g.cursor++}e=g.cursor; +b:for(;;){if(g.k(l,1377,1413))break b;if(g.cursor>=g.a)break a;g.cursor++}b:for(;;){if(g.i(l,1377,1413))break b;if(g.cursor>=g.a)break a;g.cursor++}b:for(;;){if(g.k(l,1377,1413))break b;if(g.cursor>=g.a)break a;g.cursor++}n=g.cursor}g.cursor=h;g.f=g.cursor;g.cursor=g.a;if(g.cursor=this.a)return q;var a=this.j.charCodeAt(this.cursor);if(a>m||a>>3]&1<<(a&7)))return q;this.cursor++;return f};this.l=function(g,d,m){if(this.cursor<=this.f)return q;var a=this.j.charCodeAt(this.cursor- +1);if(a>m||a>>3]&1<<(a&7)))return q;this.cursor--;return f};this.k=function(g,d,m){if(this.cursor>=this.a)return q;var a=this.j.charCodeAt(this.cursor);if(a>m||a>>3]&1<<(a&7))?(this.cursor++,f):q};this.q=function(g,d,m){if(this.cursor<=this.f)return q;var a=this.j.charCodeAt(this.cursor-1);if(a>m||a>>3]&1<<(a&7))?(this.cursor--,f):q};this.n=function(g){if(this.a-this.cursor>>1),u=0,t=lu?(m=h,n=t):(d=h,l=t);if(1>=m-d){if(0< +d)break;if(m==d)break;if(e)break;e=f}}do{s=g[d];if(l>=s[0].length){this.cursor=a+s[0].length;if(4>s.length)return s[2];d=s[3](this);this.cursor=a+s[0].length;if(d)return s[2]}d=s[1]}while(0<=d);return 0};this.g=function(g){for(var d=0,m=g.length,a=this.cursor,b=this.f,l=0,n=0,e=q;;){var h=d+(m-d>>1),u=0,t=lu?(m=h,n=t):(d=h,l=t);if(1>=m-d){if(0=s[0].length){this.cursor=a-s[0].length;if(4>s.length)return s[2];d=s[3](this);this.cursor=a-s[0].length;if(d)return s[2]}d=s[1]}while(0<=d);return 0};this.s=function(g,d,m){var a=m.length-(d-g);this.j=this.j.slice(0,g)+m+this.j.slice(d);this.a+=a;this.cursor>=d?this.cursor+=a:this.cursor>g&&(this.cursor=g);return a};this.t=function(){return 0>this.c||this.c>this.d||this.d>this.a||this.a>this.j.length?q:f};this.b=function(g){var d=q;this.t()&&(this.s(this.c,this.d,g), +d=f);return d};this.e=function(){return this.b("")};this.r=function(g,d,m){d=this.s(g,d,m);g<=this.c&&(this.c+=d);g<=this.d&&(this.d+=d)};this.u=function(){var 
g="";this.t()&&(g=this.j.slice(this.c,this.d));return g};this.v=function(){return this.j.slice(0,this.a)}}window.BaseStemmer=C;window.BasqueStemmer=function(){function g(){var b;m.d=m.cursor;b=m.g(a);if(0==b)return q;m.c=m.cursor;switch(b){case 1:if(!(u<=m.cursor)||!m.e())return q;break;case 2:if(!(e<=m.cursor)||!m.e())return q;break;case 3:if(!m.b("atseden"))return q;break;case 4:if(!m.b("arabera"))return q;break;case 5:if(!m.b("baditu"))return q}return f}function d(){var a;m.d=m.cursor;a=m.g(b);if(0==a)return q;m.c=m.cursor;switch(a){case 1:if(!(u<=m.cursor)||!m.e())return q;break;case 2:if(!(e<=m.cursor)||!m.e())return q; +break;case 3:if(!m.b("jok"))return q;break;case 4:if(!(h<=m.cursor)||!m.e())return q;break;case 5:if(!m.b("tra"))return q;break;case 6:if(!m.b("minutu"))return q;break;case 7:if(!m.b("zehar"))return q;break;case 8:if(!m.b("geldi"))return q;break;case 9:if(!m.b("igaro"))return q;break;case 10:if(!m.b("aurka"))return q}return f}var m=new C,a=[["idea",-1,1],["bidea",0,1],["kidea",0,1],["pidea",0,1],["kundea",-1,1],["galea",-1,1],["tailea",-1,1],["tzailea",-1,1],["gunea",-1,1],["kunea",-1,1],["tzaga", +-1,1],["gaia",-1,1],["aldia",-1,1],["taldia",12,1],["karia",-1,1],["garria",-1,2],["karria",-1,1],["ka",-1,1],["tzaka",17,1],["la",-1,1],["mena",-1,1],["pena",-1,1],["kina",-1,1],["ezina",-1,1],["tezina",23,1],["kuna",-1,1],["tuna",-1,1],["kizuna",-1,1],["era",-1,1],["bera",28,1],["arabera",29,4],["kera",28,1],["pera",28,1],["orra",-1,1],["korra",33,1],["dura",-1,1],["gura",-1,1],["kura",-1,1],["tura",-1,1],["eta",-1,1],["keta",39,1],["gailua",-1,1],["eza",-1,1],["erreza",42,1],["tza",-1,2],["gaitza", 
+44,1],["kaitza",44,1],["kuntza",44,1],["ide",-1,1],["bide",48,1],["kide",48,1],["pide",48,1],["kunde",-1,1],["tzake",-1,1],["tzeke",-1,1],["le",-1,1],["gale",55,1],["taile",55,1],["tzaile",55,1],["gune",-1,1],["kune",-1,1],["tze",-1,1],["atze",61,1],["gai",-1,1],["aldi",-1,1],["taldi",64,1],["ki",-1,1],["ari",-1,1],["kari",67,1],["lari",67,1],["tari",67,1],["etari",70,1],["garri",-1,2],["karri",-1,1],["arazi",-1,1],["tarazi",74,1],["an",-1,1],["ean",76,1],["rean",77,1],["kan",76,1],["etan",76,1], +["atseden",-1,3],["men",-1,1],["pen",-1,1],["kin",-1,1],["rekin",84,1],["ezin",-1,1],["tezin",86,1],["tun",-1,1],["kizun",-1,1],["go",-1,1],["ago",90,1],["tio",-1,1],["dako",-1,1],["or",-1,1],["kor",94,1],["tzat",-1,1],["du",-1,1],["gailu",-1,1],["tu",-1,1],["atu",99,1],["aldatu",100,1],["tatu",100,1],["baditu",99,5],["ez",-1,1],["errez",104,1],["tzez",104,1],["gaitz",-1,1],["kaitz",-1,1]],b=[["ada",-1,1],["kada",0,1],["anda",-1,1],["denda",-1,1],["gabea",-1,1],["kabea",-1,1],["aldea",-1,1],["kaldea", +6,1],["taldea",6,1],["ordea",-1,1],["zalea",-1,1],["tzalea",10,1],["gilea",-1,1],["emea",-1,1],["kumea",-1,1],["nea",-1,1],["enea",15,1],["zionea",15,1],["unea",15,1],["gunea",18,1],["pea",-1,1],["aurrea",-1,1],["tea",-1,1],["kotea",22,1],["artea",22,1],["ostea",22,1],["etxea",-1,1],["ga",-1,1],["anga",27,1],["gaia",-1,1],["aldia",-1,1],["taldia",30,1],["handia",-1,1],["mendia",-1,1],["geia",-1,1],["egia",-1,1],["degia",35,1],["tegia",35,1],["nahia",-1,1],["ohia",-1,1],["kia",-1,1],["tokia",40,1], 
+["oia",-1,1],["koia",42,1],["aria",-1,1],["karia",44,1],["laria",44,1],["taria",44,1],["eria",-1,1],["keria",48,1],["teria",48,1],["garria",-1,2],["larria",-1,1],["kirria",-1,1],["duria",-1,1],["asia",-1,1],["tia",-1,1],["ezia",-1,1],["bizia",-1,1],["ontzia",-1,1],["ka",-1,1],["joka",60,3],["aurka",60,10],["ska",60,1],["xka",60,1],["zka",60,1],["gibela",-1,1],["gela",-1,1],["kaila",-1,1],["skila",-1,1],["tila",-1,1],["ola",-1,1],["na",-1,1],["kana",72,1],["ena",72,1],["garrena",74,1],["gerrena",74, +1],["urrena",74,1],["zaina",72,1],["tzaina",78,1],["kina",72,1],["mina",72,1],["garna",72,1],["una",72,1],["duna",83,1],["asuna",83,1],["tasuna",85,1],["ondoa",-1,1],["kondoa",87,1],["ngoa",-1,1],["zioa",-1,1],["koa",-1,1],["takoa",91,1],["zkoa",91,1],["noa",-1,1],["zinoa",94,1],["aroa",-1,1],["taroa",96,1],["zaroa",96,1],["eroa",-1,1],["oroa",-1,1],["osoa",-1,1],["toa",-1,1],["ttoa",102,1],["ztoa",102,1],["txoa",-1,1],["tzoa",-1,1],["\u00f1oa",-1,1],["ra",-1,1],["ara",108,1],["dara",109,1],["liara", +109,1],["tiara",109,1],["tara",109,1],["etara",113,1],["tzara",109,1],["bera",108,1],["kera",108,1],["pera",108,1],["ora",108,2],["tzarra",108,1],["korra",108,1],["tra",108,1],["sa",-1,1],["osa",123,1],["ta",-1,1],["eta",125,1],["keta",126,1],["sta",125,1],["dua",-1,1],["mendua",129,1],["ordua",129,1],["lekua",-1,1],["burua",-1,1],["durua",-1,1],["tsua",-1,1],["tua",-1,1],["mentua",136,1],["estua",136,1],["txua",-1,1],["zua",-1,1],["tzua",140,1],["za",-1,1],["eza",142,1],["eroza",142,1],["tza",142, 
+2],["koitza",145,1],["antza",145,1],["gintza",145,1],["kintza",145,1],["kuntza",145,1],["gabe",-1,1],["kabe",-1,1],["kide",-1,1],["alde",-1,1],["kalde",154,1],["talde",154,1],["orde",-1,1],["ge",-1,1],["zale",-1,1],["tzale",159,1],["gile",-1,1],["eme",-1,1],["kume",-1,1],["ne",-1,1],["zione",164,1],["une",164,1],["gune",166,1],["pe",-1,1],["aurre",-1,1],["te",-1,1],["kote",170,1],["arte",170,1],["oste",170,1],["etxe",-1,1],["gai",-1,1],["di",-1,1],["aldi",176,1],["taldi",177,1],["geldi",176,8],["handi", +176,1],["mendi",176,1],["gei",-1,1],["egi",-1,1],["degi",183,1],["tegi",183,1],["nahi",-1,1],["ohi",-1,1],["ki",-1,1],["toki",188,1],["oi",-1,1],["goi",190,1],["koi",190,1],["ari",-1,1],["kari",193,1],["lari",193,1],["tari",193,1],["garri",-1,2],["larri",-1,1],["kirri",-1,1],["duri",-1,1],["asi",-1,1],["ti",-1,1],["ontzi",-1,1],["\u00f1i",-1,1],["ak",-1,1],["ek",-1,1],["tarik",-1,1],["gibel",-1,1],["ail",-1,1],["kail",209,1],["kan",-1,1],["tan",-1,1],["etan",212,1],["en",-1,4],["ren",214,2],["garren", +215,1],["gerren",215,1],["urren",215,1],["ten",214,4],["tzen",214,4],["zain",-1,1],["tzain",221,1],["kin",-1,1],["min",-1,1],["dun",-1,1],["asun",-1,1],["tasun",226,1],["aizun",-1,1],["ondo",-1,1],["kondo",229,1],["go",-1,1],["ngo",231,1],["zio",-1,1],["ko",-1,1],["trako",234,5],["tako",234,1],["etako",236,1],["eko",234,1],["tariko",234,1],["sko",234,1],["tuko",234,1],["minutuko",241,6],["zko",234,1],["no",-1,1],["zino",244,1],["ro",-1,1],["aro",246,1],["igaro",247,9],["taro",247,1],["zaro",247,1], 
+["ero",246,1],["giro",246,1],["oro",246,1],["oso",-1,1],["to",-1,1],["tto",255,1],["zto",255,1],["txo",-1,1],["tzo",-1,1],["gintzo",259,1],["\u00f1o",-1,1],["zp",-1,1],["ar",-1,1],["dar",263,1],["behar",263,1],["zehar",263,7],["liar",263,1],["tiar",263,1],["tar",263,1],["tzar",263,1],["or",-1,2],["kor",271,1],["os",-1,1],["ket",-1,1],["du",-1,1],["mendu",275,1],["ordu",275,1],["leku",-1,1],["buru",-1,2],["duru",-1,1],["tsu",-1,1],["tu",-1,1],["tatu",282,4],["mentu",282,1],["estu",282,1],["txu",-1, +1],["zu",-1,1],["tzu",287,1],["gintzu",288,1],["z",-1,1],["ez",290,1],["eroz",290,1],["tz",290,1],["koitz",293,1]],l=[["zlea",-1,2],["keria",-1,1],["la",-1,1],["era",-1,1],["dade",-1,1],["tade",-1,1],["date",-1,1],["tate",-1,1],["gi",-1,1],["ki",-1,1],["ik",-1,1],["lanik",10,1],["rik",10,1],["larik",12,1],["ztik",10,1],["go",-1,1],["ro",-1,1],["ero",16,1],["to",-1,1]],n=[17,65,16],e=0,h=0,u=0;this.m=function(){e=h=u=m.a;var a=m.cursor;a:{b:{var b=m.cursor;c:if(m.i(n,97,117)){d:{var r=m.cursor;e:if(m.k(n, +97,117)){f:for(;;){if(m.i(n,97,117))break f;if(m.cursor>=m.a)break e;m.cursor++}break d}m.cursor=r;if(!m.i(n,97,117))break c;e:for(;;){if(m.k(n,97,117))break e;if(m.cursor>=m.a)break c;m.cursor++}}break b}m.cursor=b;if(!m.k(n,97,117))break a;c:{b=m.cursor;d:if(m.k(n,97,117)){e:for(;;){if(m.i(n,97,117))break e;if(m.cursor>=m.a)break d;m.cursor++}break c}m.cursor=b;if(!m.i(n,97,117))break a;if(m.cursor>=m.a)break a;m.cursor++}}u=m.cursor}m.cursor=a;a=m.cursor;a:{b:for(;;){if(m.i(n,97,117))break b;if(m.cursor>= +m.a)break a;m.cursor++}b:for(;;){if(m.k(n,97,117))break b;if(m.cursor>=m.a)break a;m.cursor++}h=m.cursor;b:for(;;){if(m.i(n,97,117))break b;if(m.cursor>=m.a)break a;m.cursor++}b:for(;;){if(m.k(n,97,117))break b;if(m.cursor>=m.a)break 
a;m.cursor++}e=m.cursor}m.cursor=a;m.f=m.cursor;for(m.cursor=m.a;;){a=m.a-m.cursor;if(g())continue;m.cursor=m.a-a;break}for(;;){a=m.a-m.cursor;if(d())continue;m.cursor=m.a-a;break}a=m.a-m.cursor;m.d=m.cursor;b=m.g(l);if(0!=b)switch(m.c=m.cursor,b){case 1:if(!(u<=m.cursor)|| +!m.e())break;break;case 2:m.b("z")}m.cursor=m.a-a;m.cursor=m.f;return f};this.stemWord=function(a){m.p(a);this.m();return m.j}};window.CatalanStemmer=function(){function g(){for(var b;;){var e=m.cursor;a:{m.c=m.cursor;b=m.o(a);m.d=m.cursor;switch(b){case 1:if(!m.b("a"))return;break;case 2:if(!m.b("e"))return;break;case 3:if(!m.b("i"))return;break;case 4:if(!m.b("o"))return;break;case 5:if(!m.b("u"))return;break;case 6:if(!m.b("."))return;break;case 7:if(m.cursor>=m.a)break a;m.cursor++}continue}m.cursor=e;break}}function d(){var a;m.d=m.cursor;a=m.g(l);if(0==a)return q;m.c=m.cursor;switch(a){case 1:if(!(t<=m.cursor)||!m.e())return q; +break;case 2:if(!(u<=m.cursor)||!m.e())return q;break;case 3:if(!(u<=m.cursor)||!m.b("log"))return q;break;case 4:if(!(u<=m.cursor)||!m.b("ic"))return q;break;case 5:if(!(t<=m.cursor)||!m.b("c"))return q}return f}var m=new C,a=[["",-1,7],["\u00b7",0,6],["\u00e0",0,1],["\u00e1",0,1],["\u00e8",0,2],["\u00e9",0,2],["\u00ec",0,3],["\u00ed",0,3],["\u00ef",0,3],["\u00f2",0,4],["\u00f3",0,4],["\u00fa",0,5],["\u00fc",0,5]],b=[["la",-1,1],["-la",0,1],["sela",0,1],["le",-1,1],["me",-1,1],["-me",4,1],["se", +-1,1],["-te",-1,1],["hi",-1,1],["'hi",8,1],["li",-1,1],["-li",10,1],["'l",-1,1],["'m",-1,1],["-m",-1,1],["'n",-1,1],["-n",-1,1],["ho",-1,1],["'ho",17,1],["lo",-1,1],["selo",19,1],["'s",-1,1],["las",-1,1],["selas",22,1],["les",-1,1],["-les",24,1],["'ls",-1,1],["-ls",-1,1],["'ns",-1,1],["-ns",-1,1],["ens",-1,1],["los",-1,1],["selos",31,1],["nos",-1,1],["-nos",33,1],["vos",-1,1],["us",-1,1],["-us",36,1],["'t",-1,1]],l=[["ica",-1,4],["l\u00f3gica",0,3],["enca",-1,1],["ada",-1,2],["ancia",-1,1],["encia", 
+-1,1],["\u00e8ncia",-1,1],["\u00edcia",-1,1],["logia",-1,3],["inia",-1,1],["\u00edinia",9,1],["eria",-1,1],["\u00e0ria",-1,1],["at\u00f2ria",-1,1],["alla",-1,1],["ella",-1,1],["\u00edvola",-1,1],["ima",-1,1],["\u00edssima",17,1],["qu\u00edssima",18,5],["ana",-1,1],["ina",-1,1],["era",-1,1],["sfera",22,1],["ora",-1,1],["dora",24,1],["adora",25,1],["adura",-1,1],["esa",-1,1],["osa",-1,1],["assa",-1,1],["essa",-1,1],["issa",-1,1],["eta",-1,1],["ita",-1,1],["ota",-1,1],["ista",-1,1],["ialista",36,1], +["ionista",36,1],["iva",-1,1],["ativa",39,1],["n\u00e7a",-1,1],["log\u00eda",-1,3],["ic",-1,4],["\u00edstic",43,1],["enc",-1,1],["esc",-1,1],["ud",-1,1],["atge",-1,1],["ble",-1,1],["able",49,1],["ible",49,1],["isme",-1,1],["ialisme",52,1],["ionisme",52,1],["ivisme",52,1],["aire",-1,1],["icte",-1,1],["iste",-1,1],["ici",-1,1],["\u00edci",-1,1],["logi",-1,3],["ari",-1,1],["tori",-1,1],["al",-1,1],["il",-1,1],["all",-1,1],["ell",-1,1],["\u00edvol",-1,1],["isam",-1,1],["issem",-1,1],["\u00ecssem",-1, +1],["\u00edssem",-1,1],["\u00edssim",-1,1],["qu\u00edssim",73,5],["amen",-1,1],["\u00ecssin",-1,1],["ar",-1,1],["ificar",77,1],["egar",77,1],["ejar",77,1],["itar",77,1],["itzar",77,1],["fer",-1,1],["or",-1,1],["dor",84,1],["dur",-1,1],["doras",-1,1],["ics",-1,4],["l\u00f3gics",88,3],["uds",-1,1],["nces",-1,1],["ades",-1,2],["ancies",-1,1],["encies",-1,1],["\u00e8ncies",-1,1],["\u00edcies",-1,1],["logies",-1,3],["inies",-1,1],["\u00ednies",-1,1],["eries",-1,1],["\u00e0ries",-1,1],["at\u00f2ries",-1, 
+1],["bles",-1,1],["ables",103,1],["ibles",103,1],["imes",-1,1],["\u00edssimes",106,1],["qu\u00edssimes",107,5],["formes",-1,1],["ismes",-1,1],["ialismes",110,1],["ines",-1,1],["eres",-1,1],["ores",-1,1],["dores",114,1],["idores",115,1],["dures",-1,1],["eses",-1,1],["oses",-1,1],["asses",-1,1],["ictes",-1,1],["ites",-1,1],["otes",-1,1],["istes",-1,1],["ialistes",124,1],["ionistes",124,1],["iques",-1,4],["l\u00f3giques",127,3],["ives",-1,1],["atives",129,1],["log\u00edes",-1,3],["alleng\u00fces",-1, +1],["icis",-1,1],["\u00edcis",-1,1],["logis",-1,3],["aris",-1,1],["toris",-1,1],["ls",-1,1],["als",138,1],["ells",138,1],["ims",-1,1],["\u00edssims",141,1],["qu\u00edssims",142,5],["ions",-1,1],["cions",144,1],["acions",145,2],["esos",-1,1],["osos",-1,1],["assos",-1,1],["issos",-1,1],["ers",-1,1],["ors",-1,1],["dors",152,1],["adors",153,1],["idors",153,1],["ats",-1,1],["itats",156,1],["bilitats",157,1],["ivitats",157,1],["ativitats",159,1],["\u00eftats",156,1],["ets",-1,1],["ants",-1,1],["ents",-1, +1],["ments",164,1],["aments",165,1],["ots",-1,1],["uts",-1,1],["ius",-1,1],["trius",169,1],["atius",169,1],["\u00e8s",-1,1],["\u00e9s",-1,1],["\u00eds",-1,1],["d\u00eds",174,1],["\u00f3s",-1,1],["itat",-1,1],["bilitat",177,1],["ivitat",177,1],["ativitat",179,1],["\u00eftat",-1,1],["et",-1,1],["ant",-1,1],["ent",-1,1],["ient",184,1],["ment",184,1],["ament",186,1],["isament",187,1],["ot",-1,1],["isseu",-1,1],["\u00ecsseu",-1,1],["\u00edsseu",-1,1],["triu",-1,1],["\u00edssiu",-1,1],["atiu",-1,1],["\u00f3", 
+-1,1],["i\u00f3",196,1],["ci\u00f3",197,1],["aci\u00f3",198,1]],n=[["aba",-1,1],["esca",-1,1],["isca",-1,1],["\u00efsca",-1,1],["ada",-1,1],["ida",-1,1],["uda",-1,1],["\u00efda",-1,1],["ia",-1,1],["aria",8,1],["iria",8,1],["ara",-1,1],["iera",-1,1],["ira",-1,1],["adora",-1,1],["\u00efra",-1,1],["ava",-1,1],["ixa",-1,1],["itza",-1,1],["\u00eda",-1,1],["ar\u00eda",19,1],["er\u00eda",19,1],["ir\u00eda",19,1],["\u00efa",-1,1],["isc",-1,1],["\u00efsc",-1,1],["ad",-1,1],["ed",-1,1],["id",-1,1],["ie",-1, +1],["re",-1,1],["dre",30,1],["ase",-1,1],["iese",-1,1],["aste",-1,1],["iste",-1,1],["ii",-1,1],["ini",-1,1],["esqui",-1,1],["eixi",-1,1],["itzi",-1,1],["am",-1,1],["em",-1,1],["arem",42,1],["irem",42,1],["\u00e0rem",42,1],["\u00edrem",42,1],["\u00e0ssem",42,1],["\u00e9ssem",42,1],["iguem",42,1],["\u00efguem",42,1],["avem",42,1],["\u00e0vem",42,1],["\u00e1vem",42,1],["ir\u00ecem",42,1],["\u00edem",42,1],["ar\u00edem",55,1],["ir\u00edem",55,1],["assim",-1,1],["essim",-1,1],["issim",-1,1],["\u00e0ssim", +-1,1],["\u00e8ssim",-1,1],["\u00e9ssim",-1,1],["\u00edssim",-1,1],["\u00efm",-1,1],["an",-1,1],["aban",66,1],["arian",66,1],["aran",66,1],["ieran",66,1],["iran",66,1],["\u00edan",66,1],["ar\u00edan",72,1],["er\u00edan",72,1],["ir\u00edan",72,1],["en",-1,1],["ien",76,1],["arien",77,1],["irien",77,1],["aren",76,1],["eren",76,1],["iren",76,1],["\u00e0ren",76,1],["\u00efren",76,1],["asen",76,1],["iesen",76,1],["assen",76,1],["essen",76,1],["issen",76,1],["\u00e9ssen",76,1],["\u00efssen",76,1],["esquen", 
+76,1],["isquen",76,1],["\u00efsquen",76,1],["aven",76,1],["ixen",76,1],["eixen",96,1],["\u00efxen",76,1],["\u00efen",76,1],["in",-1,1],["inin",100,1],["sin",100,1],["isin",102,1],["assin",102,1],["essin",102,1],["issin",102,1],["\u00efssin",102,1],["esquin",100,1],["eixin",100,1],["aron",-1,1],["ieron",-1,1],["ar\u00e1n",-1,1],["er\u00e1n",-1,1],["ir\u00e1n",-1,1],["i\u00efn",-1,1],["ado",-1,1],["ido",-1,1],["ando",-1,2],["iendo",-1,1],["io",-1,1],["ixo",-1,1],["eixo",121,1],["\u00efxo",-1,1],["itzo", +-1,1],["ar",-1,1],["tzar",125,1],["er",-1,1],["eixer",127,1],["ir",-1,1],["ador",-1,1],["as",-1,1],["abas",131,1],["adas",131,1],["idas",131,1],["aras",131,1],["ieras",131,1],["\u00edas",131,1],["ar\u00edas",137,1],["er\u00edas",137,1],["ir\u00edas",137,1],["ids",-1,1],["es",-1,1],["ades",142,1],["ides",142,1],["udes",142,1],["\u00efdes",142,1],["atges",142,1],["ies",142,1],["aries",148,1],["iries",148,1],["ares",142,1],["ires",142,1],["adores",142,1],["\u00efres",142,1],["ases",142,1],["ieses",142, +1],["asses",142,1],["esses",142,1],["isses",142,1],["\u00efsses",142,1],["ques",142,1],["esques",161,1],["\u00efsques",161,1],["aves",142,1],["ixes",142,1],["eixes",165,1],["\u00efxes",142,1],["\u00efes",142,1],["abais",-1,1],["arais",-1,1],["ierais",-1,1],["\u00edais",-1,1],["ar\u00edais",172,1],["er\u00edais",172,1],["ir\u00edais",172,1],["aseis",-1,1],["ieseis",-1,1],["asteis",-1,1],["isteis",-1,1],["inis",-1,1],["sis",-1,1],["isis",181,1],["assis",181,1],["essis",181,1],["issis",181,1],["\u00efssis", 
+181,1],["esquis",-1,1],["eixis",-1,1],["itzis",-1,1],["\u00e1is",-1,1],["ar\u00e9is",-1,1],["er\u00e9is",-1,1],["ir\u00e9is",-1,1],["ams",-1,1],["ados",-1,1],["idos",-1,1],["amos",-1,1],["\u00e1bamos",197,1],["\u00e1ramos",197,1],["i\u00e9ramos",197,1],["\u00edamos",197,1],["ar\u00edamos",201,1],["er\u00edamos",201,1],["ir\u00edamos",201,1],["aremos",-1,1],["eremos",-1,1],["iremos",-1,1],["\u00e1semos",-1,1],["i\u00e9semos",-1,1],["imos",-1,1],["adors",-1,1],["ass",-1,1],["erass",212,1],["ess",-1, +1],["ats",-1,1],["its",-1,1],["ents",-1,1],["\u00e0s",-1,1],["ar\u00e0s",218,1],["ir\u00e0s",218,1],["ar\u00e1s",-1,1],["er\u00e1s",-1,1],["ir\u00e1s",-1,1],["\u00e9s",-1,1],["ar\u00e9s",224,1],["\u00eds",-1,1],["i\u00efs",-1,1],["at",-1,1],["it",-1,1],["ant",-1,1],["ent",-1,1],["int",-1,1],["ut",-1,1],["\u00eft",-1,1],["au",-1,1],["erau",235,1],["ieu",-1,1],["ineu",-1,1],["areu",-1,1],["ireu",-1,1],["\u00e0reu",-1,1],["\u00edreu",-1,1],["asseu",-1,1],["esseu",-1,1],["eresseu",244,1],["\u00e0sseu", +-1,1],["\u00e9sseu",-1,1],["igueu",-1,1],["\u00efgueu",-1,1],["\u00e0veu",-1,1],["\u00e1veu",-1,1],["itzeu",-1,1],["\u00eceu",-1,1],["ir\u00eceu",253,1],["\u00edeu",-1,1],["ar\u00edeu",255,1],["ir\u00edeu",255,1],["assiu",-1,1],["issiu",-1,1],["\u00e0ssiu",-1,1],["\u00e8ssiu",-1,1],["\u00e9ssiu",-1,1],["\u00edssiu",-1,1],["\u00efu",-1,1],["ix",-1,1],["eix",265,1],["\u00efx",-1,1],["itz",-1,1],["i\u00e0",-1,1],["ar\u00e0",-1,1],["ir\u00e0",-1,1],["itz\u00e0",-1,1],["ar\u00e1",-1,1],["er\u00e1",-1, 
+1],["ir\u00e1",-1,1],["ir\u00e8",-1,1],["ar\u00e9",-1,1],["er\u00e9",-1,1],["ir\u00e9",-1,1],["\u00ed",-1,1],["i\u00ef",-1,1],["i\u00f3",-1,1]],e=[["a",-1,1],["e",-1,1],["i",-1,1],["\u00efn",-1,1],["o",-1,1],["ir",-1,1],["s",-1,1],["is",6,1],["os",6,1],["\u00efs",6,1],["it",-1,1],["eu",-1,1],["iu",-1,1],["iqu",-1,2],["itz",-1,1],["\u00e0",-1,1],["\u00e1",-1,1],["\u00e9",-1,1],["\u00ec",-1,1],["\u00ed",-1,1],["\u00ef",-1,1],["\u00f3",-1,1]],h=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,128,129,81,6,10],u=0, +t=0;this.m=function(){u=t=m.a;var a=m.cursor;a:{b:for(;;){if(m.i(h,97,252))break b;if(m.cursor>=m.a)break a;m.cursor++}b:for(;;){if(m.k(h,97,252))break b;if(m.cursor>=m.a)break a;m.cursor++}t=m.cursor;b:for(;;){if(m.i(h,97,252))break b;if(m.cursor>=m.a)break a;m.cursor++}b:for(;;){if(m.k(h,97,252))break b;if(m.cursor>=m.a)break a;m.cursor++}u=m.cursor}m.cursor=a;m.f=m.cursor;m.cursor=m.a;a=m.a-m.cursor;m.d=m.cursor;0!=m.g(b)&&(m.c=m.cursor,!(t<=m.cursor)||m.e());m.cursor=m.a-a;a=m.a-m.cursor;b:{var l= +m.a-m.cursor;if(d())break b;m.cursor=m.a-l;m.d=m.cursor;l=m.g(n);if(0!=l)switch(m.c=m.cursor,l){case 1:if(!(t<=m.cursor)||!m.e())break;break;case 2:!(u<=m.cursor)||m.e()}}m.cursor=m.a-a;a=m.a-m.cursor;m.d=m.cursor;l=m.g(e);if(0!=l)switch(m.c=m.cursor,l){case 1:if(!(t<=m.cursor)||!m.e())break;break;case 2:!(t<=m.cursor)||m.b("ic")}m.cursor=m.a-a;m.cursor=m.f;a=m.cursor;g();m.cursor=a;return f};this.stemWord=function(a){m.p(a);this.m();return m.j}};window.DanishStemmer=function(){function g(){var b=d.a-d.cursor;if(!(d.cursord.a)){d.cursor= +p;h=d.cursor;d.cursor=r;b:for(;;){r=d.cursor;if(d.i(n,97,248)){d.cursor=r;break b}d.cursor=r;if(d.cursor>=d.a)break a;d.cursor++}b:for(;;){if(d.k(n,97,248))break b;if(d.cursor>=d.a)break a;d.cursor++}u=d.cursor;u>=h||(u=h)}}d.cursor=a;d.f=d.cursor;d.cursor=d.a;a=d.a-d.cursor;if(!(d.cursor=e.a)break a;e.cursor++}continue}e.cursor=c;break}e.cursor=b;a=e.cursor;e.c=e.cursor;if(e.n("y")){if(e.d=e.cursor,!e.b("Y"))return}else 
e.cursor=a;for(;;){a=e.cursor;a:{b:for(;;){b=e.cursor; +c:if(e.i(k,97,232)){e.c=e.cursor;d:{c=e.cursor;if(e.n("i")&&(e.d=e.cursor,e.i(k,97,232))){if(!e.b("I"))return;break d}e.cursor=c;if(!e.n("y"))break c;e.d=e.cursor;if(!e.b("Y"))return}e.cursor=b;break b}e.cursor=b;if(e.cursor>=e.a)break a;e.cursor++}continue}e.cursor=a;break}}function d(){y=z=e.a;a:for(;;){if(e.i(k,97,232))break a;if(e.cursor>=e.a)return;e.cursor++}a:for(;;){if(e.k(k,97,232))break a;if(e.cursor>=e.a)return;e.cursor++}z=e.cursor;3<=z||(z=3);a:for(;;){if(e.i(k,97,232))break a;if(e.cursor>= +e.a)return;e.cursor++}a:for(;;){if(e.k(k,97,232))break a;if(e.cursor>=e.a)return;e.cursor++}y=e.cursor}function m(){return y<=e.cursor}function a(){var a=e.a-e.cursor;if(0==e.g(t))return q;e.cursor=e.a-a;e.d=e.cursor;if(e.cursor<=e.f)return q;e.cursor--;e.c=e.cursor;return!e.e()?q:f}function b(){c=q;e.d=e.cursor;if(!e.h("e"))return q;e.c=e.cursor;if(!(z<=e.cursor))return q;var b=e.a-e.cursor;if(!e.q(k,97,232))return q;e.cursor=e.a-b;if(!e.e())return q;c=f;return!a()?q:f}function l(){if(!(z<=e.cursor))return q; +var b=e.a-e.cursor;if(!e.q(k,97,232))return q;e.cursor=e.a-b;b=e.a-e.cursor;if(e.h("gem"))return q;e.cursor=e.a-b;return!e.e()||!a()?q:f}function n(){var d,g=e.a-e.cursor;a:if(e.d=e.cursor,d=e.g(s),0!=d)switch(e.c=e.cursor,d){case 1:if(!(z<=e.cursor))break a;if(!e.b("heid"))return;break;case 2:if(!l())break a;break;case 3:if(!(z<=e.cursor))break a;if(!e.q(w,97,232))break a;if(!e.e())return}e.cursor=e.a-g;g=e.a-e.cursor;b();e.cursor=e.a-g;g=e.a-e.cursor;a:if(e.d=e.cursor,e.h("heid")&&(e.c=e.cursor, +m())){d=e.a-e.cursor;if(e.h("c"))break a;e.cursor=e.a-d;if(!e.e())return;e.d=e.cursor;e.h("en")&&(e.c=e.cursor,l())}e.cursor=e.a-g;g=e.a-e.cursor;a:if(e.d=e.cursor,d=e.g(r),0!=d)switch(e.c=e.cursor,d){case 1:if(!m())break a;if(!e.e())return;b:{d=e.a-e.cursor;c:if(e.d=e.cursor,e.h("ig")&&(e.c=e.cursor,m())){var h=e.a-e.cursor;if(e.h("e"))break c;e.cursor=e.a-h;if(!e.e())return;break 
b}e.cursor=e.a-d;if(!a())break a}break;case 2:if(!m())break a;d=e.a-e.cursor;if(e.h("e"))break a;e.cursor=e.a-d;if(!e.e())return; +break;case 3:if(!m())break a;if(!e.e())return;if(!b())break a;break;case 4:if(!m())break a;if(!e.e())return;break;case 5:if(!m())break a;if(!c)break a;if(!e.e())return}e.cursor=e.a-g;g=e.a-e.cursor;if(e.q(v,73,232)&&(d=e.a-e.cursor,0!=e.g(p)&&e.q(k,97,232)&&(e.cursor=e.a-d,e.d=e.cursor,!(e.cursor<=e.f)&&(e.cursor--,e.c=e.cursor,!e.e()))))return;e.cursor=e.a-g}var e=new C,h=[["",-1,6],["\u00e1",0,1],["\u00e4",0,1],["\u00e9",0,2],["\u00eb",0,2],["\u00ed",0,3],["\u00ef",0,3],["\u00f3",0,4],["\u00f6", +0,4],["\u00fa",0,5],["\u00fc",0,5]],u=[["",-1,3],["I",0,2],["Y",0,1]],t=[["dd",-1,-1],["kk",-1,-1],["tt",-1,-1]],s=[["ene",-1,2],["se",-1,3],["en",-1,2],["heden",2,1],["s",-1,3]],r=[["end",-1,1],["ig",-1,2],["ing",-1,1],["lijk",-1,3],["baar",-1,4],["bar",-1,5]],p=[["aa",-1,-1],["ee",-1,-1],["oo",-1,-1],["uu",-1,-1]],k=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],v=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],w=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],y=0,z=0,c=q;this.m=function(){var a=e.cursor;g(); +e.cursor=a;a=e.cursor;d();e.cursor=a;e.f=e.cursor;e.cursor=e.a;n();e.cursor=e.f;a=e.cursor;a:for(var b;;){var c=e.cursor;b:{e.c=e.cursor;b=e.o(u);e.d=e.cursor;switch(b){case 1:if(!e.b("y"))break a;break;case 2:if(!e.b("i"))break a;break;case 3:if(e.cursor>=e.a)break b;e.cursor++}continue}e.cursor=c;break}e.cursor=a;return f};this.stemWord=function(a){e.p(a);this.m();return e.j}};window.EnglishStemmer=function(){function g(){z=q;var b=a.cursor;a.c=a.cursor;if(a.n("'")&&(a.d=a.cursor,!a.e()))return;a.cursor=b;b=a.cursor;a.c=a.cursor;if(a.n("y")){a.d=a.cursor;if(!a.b("Y"))return;z=f}a.cursor=b;b=a.cursor;for(;;){var c=a.cursor;b:{c:for(;;){var e=a.cursor;if(a.i(v,97,121)&&(a.c=a.cursor,a.n("y"))){a.d=a.cursor;a.cursor=e;break c}a.cursor=e;if(a.cursor>=a.a)break b;a.cursor++}if(!a.b("Y"))return;z=f;continue}a.cursor=c;break}a.cursor=b}function 
d(){a:{var b=a.a-a.cursor;if(a.q(w, +89,121)&&a.l(v,97,121)&&a.q(v,97,121))break a;a.cursor=a.a-b;if(!a.q(v,97,121)||!a.l(v,97,121)||a.cursor>a.f)return q}return f}function m(){var b;a.c=a.cursor;b=a.o(k);if(0==b)return q;a.d=a.cursor;if(a.cursora.a)){a.cursor=w;break b}a.cursor=x;break a}a.cursor=k;g();c=A=a.a;k=a.cursor;b:{c:{x=a.cursor;if(0!=a.o(b))break c;a.cursor= +x;d:for(;;){if(a.i(v,97,121))break d;if(a.cursor>=a.a)break b;a.cursor++}d:for(;;){if(a.k(v,97,121))break d;if(a.cursor>=a.a)break b;a.cursor++}}A=a.cursor;c:for(;;){if(a.i(v,97,121))break c;if(a.cursor>=a.a)break b;a.cursor++}c:for(;;){if(a.k(v,97,121))break c;if(a.cursor>=a.a)break b;a.cursor++}c=a.cursor}a.cursor=k;a.f=a.cursor;a.cursor=a.a;k=a.a-a.cursor;b:{x=a.a-a.cursor;a.d=a.cursor;if(0==a.g(l))a.cursor=a.a-x;else if(a.c=a.cursor,!a.e())break b;a.d=a.cursor;x=a.g(n);if(0!=x)switch(a.c=a.cursor, +x){case 1:if(!a.b("ss"))break;break;case 2:c:{x=a.a-a.cursor;w=a.cursor-2;if(!(wa.f?q:f);if(x)break b;a.cursor=a.a-k;k=a.a-a.cursor;c:if(a.d=a.cursor,x=a.g(h),0!=x)switch(a.c=a.cursor,x){case 1:if(!(A<=a.cursor)|| +!a.b("ee"))break;break;case 2:x=a.a-a.cursor;d:for(;;){if(a.l(v,97,121))break d;if(a.cursor<=a.f)break c;a.cursor--}a.cursor=a.a-x;if(!a.e())break;w=a.a-a.cursor;x=a.g(e);a.cursor=a.a-w;switch(x){case 1:x=a.cursor;a.r(a.cursor,a.cursor,"e");a.cursor=x;break;case 2:a.d=a.cursor;if(a.cursor<=a.f)break;a.cursor--;a.c=a.cursor;if(!a.e())break;break;case 3:if(a.cursor!=A)break;x=a.a-a.cursor;if(!d())break;a.cursor=a.a-x;x=a.cursor;a.r(a.cursor,a.cursor,"e");a.cursor=x}}a.cursor=a.a-k;k=a.a-a.cursor;c:{a.d= +a.cursor;d:{x=a.a-a.cursor;if(a.h("y"))break d;a.cursor=a.a-x;if(!a.h("Y"))break c}a.c=a.cursor;if(a.q(v,97,121)){if(!(a.cursor>a.f))break c;a.b("i")}}a.cursor=a.a-k;k=a.a-a.cursor;a.d=a.cursor;x=a.g(u);if(0!=x&&(a.c=a.cursor,A<=a.cursor))switch(x){case 1:if(!a.b("tion"))break;break;case 2:if(!a.b("ence"))break;break;case 3:if(!a.b("ance"))break;break;case 4:if(!a.b("able"))break;break;case 
5:if(!a.b("ent"))break;break;case 6:if(!a.b("ize"))break;break;case 7:if(!a.b("ate"))break;break;case 8:if(!a.b("al"))break; +break;case 9:if(!a.b("ful"))break;break;case 10:if(!a.b("ous"))break;break;case 11:if(!a.b("ive"))break;break;case 12:if(!a.b("ble"))break;break;case 13:if(!a.h("l")||!a.b("og"))break;break;case 14:if(!a.b("less"))break;break;case 15:!a.l(y,99,116)||a.e()}a.cursor=a.a-k;k=a.a-a.cursor;a.d=a.cursor;x=a.g(t);if(0!=x&&(a.c=a.cursor,A<=a.cursor))switch(x){case 1:if(!a.b("tion"))break;break;case 2:if(!a.b("ate"))break;break;case 3:if(!a.b("al"))break;break;case 4:if(!a.b("ic"))break;break;case 5:if(!a.e())break; +break;case 6:!(c<=a.cursor)||a.e()}a.cursor=a.a-k;k=a.a-a.cursor;a.d=a.cursor;x=a.g(s);if(0!=x&&(a.c=a.cursor,c<=a.cursor))switch(x){case 1:if(!a.e())break;break;case 2:c:{x=a.a-a.cursor;if(a.h("s"))break c;a.cursor=a.a-x;if(!a.h("t"))break}a.e()}a.cursor=a.a-k;k=a.a-a.cursor;a.d=a.cursor;x=a.g(r);if(0!=x)switch(a.c=a.cursor,x){case 1:c:{if(c<=a.cursor)break c;a.cursor=a.a-(a.a-a.cursor);if(!(A<=a.cursor))break;x=a.a-a.cursor;if(d())break;a.cursor=a.a-x}if(!a.e())break;break;case 2:!(c<=a.cursor)|| +!a.h("l")||a.e()}a.cursor=a.a-k}a.cursor=a.f;k=a.cursor;if(z)for(;;){x=a.cursor;b:{c:for(;;){w=a.cursor;a.c=a.cursor;if(a.n("Y")){a.d=a.cursor;a.cursor=w;break c}a.cursor=w;if(a.cursor>=a.a)break b;a.cursor++}if(!a.b("y"))break;continue}a.cursor=x;break}a.cursor=k}return f};this.stemWord=function(b){a.p(b);this.m();return a.j}};window.EstonianStemmer=function(){function g(){var b;if(a.cursor=a.a)break a;a.cursor++}b:for(;;){if(a.k(w,97,252))break b;if(a.cursor>=a.a)break a;a.cursor++}A=a.cursor}a.cursor=l;a.f=a.cursor;a.cursor= +a.a;l=a.a-a.cursor;if(!(a.cursora.cursor)&&(a.d=a.cursor,x=a.g(p),0!=x))switch(a.c= +a.cursor,x){case 1:if(!a.b("k"))break;break;case 2:if(!a.b("p"))break;break;case 3:a.b("t")}a.cursor=a.a-l;a.cursor=a.f;return f};this.stemWord=function(b){a.p(b);this.m();return a.j}};window.FinnishStemmer=function(){function 
g(){G=E=l.a;a:for(;;){var a=l.cursor;if(l.i(z,97,246)){l.cursor=a;break a}l.cursor=a;if(l.cursor>=l.a)return;l.cursor++}a:for(;;){if(l.k(z,97,246))break a;if(l.cursor>=l.a)return;l.cursor++}E=l.cursor;a:for(;;){a=l.cursor;if(l.i(z,97,246)){l.cursor=a;break a}l.cursor=a;if(l.cursor>=l.a)return;l.cursor++}a:for(;;){if(l.k(z,97,246))break a;if(l.cursor>=l.a)return;l.cursor++}G=l.cursor}function d(){return 0==l.g(s)?q:f}function m(){return!l.h("i")||!l.l(c,97, +246)?q:f}function a(){var a;if(!(l.cursor=e.a)break a;e.cursor++}continue}e.cursor=a;break}}function d(){for(var a;;){var b=e.cursor;a:{e.c=e.cursor;a=e.o(u);e.d=e.cursor;switch(a){case 1:if(!e.b("i"))return;break;case 2:if(!e.b("u"))return;break; +case 3:if(!e.b("y"))return;break;case 4:if(!e.b("\u00eb"))return;break;case 5:if(!e.b("\u00ef"))return;break;case 6:if(!e.e())return;break;case 7:if(e.cursor>=e.a)break a;e.cursor++}continue}e.cursor=b;break}}function m(){return c<=e.cursor}function a(){var a;e.d=e.cursor;a=e.g(r);if(0==a)return q;e.c=e.cursor;switch(a){case 1:if(!m()||!e.e())return q;break;case 2:if(!m()||!e.e())return q;a=e.a-e.cursor;e.d=e.cursor;if(e.h("ic")){e.c=e.cursor;b:{a=e.a-e.cursor;if(m()){if(!e.e())return q;break b}e.cursor= +e.a-a;if(!e.b("iqU"))return q}}else e.cursor=e.a-a;break;case 3:if(!m()||!e.b("log"))return q;break;case 4:if(!m()||!e.b("u"))return q;break;case 5:if(!m()||!e.b("ent"))return q;break;case 6:if(!(D<=e.cursor)||!e.e())return q;var b=e.a-e.cursor;a:if(e.d=e.cursor,a=e.g(t),0==a)e.cursor=e.a-b;else switch(e.c=e.cursor,a){case 1:if(!m()){e.cursor=e.a-b;break a}if(!e.e())return q;e.d=e.cursor;if(!e.h("at")){e.cursor=e.a-b;break a}e.c=e.cursor;if(!m()){e.cursor=e.a-b;break a}if(!e.e())return q;break;case 2:b:{a= +e.a-e.cursor;if(m()){if(!e.e())return q;break b}e.cursor=e.a-a;if(!(A<=e.cursor)){e.cursor=e.a-b;break a}if(!e.b("eux"))return q}break;case 3:if(!m()){e.cursor=e.a-b;break a}if(!e.e())return q;break;case 4:if(!(D<=e.cursor)){e.cursor=e.a-b;break 
a}if(!e.b("i"))return q}break;case 7:if(!m()||!e.e())return q;b=e.a-e.cursor;a:if(e.d=e.cursor,a=e.g(s),0==a)e.cursor=e.a-b;else switch(e.c=e.cursor,a){case 1:b:{a=e.a-e.cursor;if(m()){if(!e.e())return q;break b}e.cursor=e.a-a;if(!e.b("abl"))return q}break; +case 2:b:{a=e.a-e.cursor;if(m()){if(!e.e())return q;break b}e.cursor=e.a-a;if(!e.b("iqU"))return q}break;case 3:if(!m()){e.cursor=e.a-b;break a}if(!e.e())return q}break;case 8:if(!m()||!e.e())return q;a=e.a-e.cursor;e.d=e.cursor;if(e.h("at"))if(e.c=e.cursor,m()){if(!e.e())return q;e.d=e.cursor;if(e.h("ic")){e.c=e.cursor;b:{a=e.a-e.cursor;if(m()){if(!e.e())return q;break b}e.cursor=e.a-a;if(!e.b("iqU"))return q}}else e.cursor=e.a-a}else e.cursor=e.a-a;else e.cursor=e.a-a;break;case 9:if(!e.b("eau"))return q; +break;case 10:if(!(A<=e.cursor)||!e.b("al"))return q;break;case 11:a:{a=e.a-e.cursor;if(m()){if(!e.e())return q;break a}e.cursor=e.a-a;if(!(A<=e.cursor)||!e.b("eux"))return q}break;case 12:if(!(A<=e.cursor)||!e.q(y,97,251)||!e.e())return q;break;case 13:if(!(D<=e.cursor))return q;e.b("ant");return q;case 14:if(!(D<=e.cursor))return q;e.b("ent");return q;case 15:a=e.a-e.cursor;if(!e.l(y,97,251)||!(D<=e.cursor))return q;e.cursor=e.a-a;e.e();return q}return f}function b(){if(e.cursor=e.a)){e.cursor++;break b}e.cursor= +m;if(0!=e.o(h))break b;e.cursor=m;if(e.cursor>=e.a)break a;e.cursor++;c:for(;;){if(e.i(y,97,251))break c;if(e.cursor>=e.a)break a;e.cursor++}}D=e.cursor}e.cursor=k;k=e.cursor;a:{b:for(;;){if(e.i(y,97,251))break b;if(e.cursor>=e.a)break a;e.cursor++}b:for(;;){if(e.k(y,97,251))break b;if(e.cursor>=e.a)break a;e.cursor++}A=e.cursor;b:for(;;){if(e.i(y,97,251))break b;if(e.cursor>=e.a)break a;e.cursor++}b:for(;;){if(e.k(y,97,251))break b;if(e.cursor>=e.a)break a;e.cursor++}c=e.cursor}e.cursor=k;e.f=e.cursor; +e.cursor=e.a;k=e.a-e.cursor;b:{m=e.a-e.cursor;c:{var z=e.a-e.cursor;d:{var p=e.a-e.cursor;if(a())break d;e.cursor=e.a-p;if(b())break d;e.cursor=e.a-p;if(!l())break 
c}e.cursor=e.a-z;m=e.a-e.cursor;e.d=e.cursor;e:{z=e.a-e.cursor;if(e.h("Y")){e.c=e.cursor;if(!e.b("i"))return q;break e}e.cursor=e.a-z;if(e.h("\u00e7")){if(e.c=e.cursor,!e.b("c"))return q}else e.cursor=e.a-m}break b}e.cursor=e.a-m;n()}e.cursor=e.a-k;k=e.a-e.cursor;m=e.a-e.cursor;0!=e.g(w)&&(e.cursor=e.a-m,e.d=e.cursor,e.cursor<=e.f||(e.cursor--, +e.c=e.cursor,e.e()));e.cursor=e.a-k;k=e.a-e.cursor;a:{for(m=1;;){if(e.q(y,97,251)){m--;continue}break}if(!(0=b.a)break a;b.cursor++}continue}b.cursor=d;break}for(b.cursor=a;;){d=b.cursor;a:{b.c=b.cursor;a=b.o(l);b.d=b.cursor;switch(a){case 1:if(!b.b("ss"))return; +break;case 2:if(!b.b("\u00e4"))return;break;case 3:if(!b.b("\u00f6"))return;break;case 4:if(!b.b("\u00fc"))return;break;case 5:if(b.cursor>=b.a)break a;b.cursor++}continue}b.cursor=d;break}}function d(){v=w=b.a;var a=b.cursor,d=b.cursor+3;if(!(d>b.a)){b.cursor=d;k=b.cursor;b.cursor=a;a:for(;;){if(b.i(s,97,252))break a;if(b.cursor>=b.a)return;b.cursor++}a:for(;;){if(b.k(s,97,252))break a;if(b.cursor>=b.a)return;b.cursor++}w=b.cursor;w>=k||(w=k);a:for(;;){if(b.i(s,97,252))break a;if(b.cursor>=b.a)return; +b.cursor++}a:for(;;){if(b.k(s,97,252))break a;if(b.cursor>=b.a)return;b.cursor++}v=b.cursor}}function m(){for(var a;;){var d=b.cursor;a:{b.c=b.cursor;a=b.o(n);b.d=b.cursor;switch(a){case 1:if(!b.b("y"))return;break;case 2:if(!b.b("u"))return;break;case 3:if(!b.b("a"))return;break;case 4:if(!b.b("o"))return;break;case 5:if(b.cursor>=b.a)break a;b.cursor++}continue}b.cursor=d;break}}function a(){var a,d=b.a-b.cursor;a:if(b.d=b.cursor,a=b.g(e),0!=a&&(b.c=b.cursor,w<=b.cursor))switch(a){case 1:if(!b.e())return; +break;case 2:if(!b.e())return;a=b.a-b.cursor;b.d=b.cursor;if(b.h("s"))if(b.c=b.cursor,b.h("nis")){if(!b.e())return}else b.cursor=b.a-a;else b.cursor=b.a-a;break;case 3:if(!b.l(r,98,116))break a;if(!b.e())return}b.cursor=b.a-d;d=b.a-b.cursor;a:if(b.d=b.cursor,a=b.g(h),0!=a&&(b.c=b.cursor,w<=b.cursor))switch(a){case 1:if(!b.e())return;break;case 
2:if(!b.l(p,98,116))break a;a=b.cursor-3;if(ab.f)&&!b.b("\u03b1\u03b3\u03b1\u03bd"))return}b.cursor=b.a-a;b.d=b.cursor;if(b.h("\u03b1\u03bd\u03b5")&&(b.c=b.cursor,b.e())){B= +q;a:{a=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(b.l(ha,945,969)){if(!b.b("\u03b1\u03bd"))return;break a}b.cursor=b.a-a;b.d=b.cursor}b.c=b.cursor;0==b.g(F)||b.cursor>b.f||b.b("\u03b1\u03bd")}}function a(){var a=b.a-b.cursor;b.d=b.cursor;if(0!=b.g(W)){b.c=b.cursor;if(!b.e())return;B=q}b.cursor=b.a-a;b.d=b.cursor;if(b.h("\u03b5\u03c4\u03b5")&&(b.c=b.cursor,b.e())){B=q;a:{a=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(b.l(ha,945,969)){if(!b.b("\u03b5\u03c4"))return;break a}b.cursor=b.a-a;b.d=b.cursor;b.c= +b.cursor;if(0!=b.g(da)){if(!b.b("\u03b5\u03c4"))return;break a}b.cursor=b.a-a;b.d=b.cursor}b.c=b.cursor;0==b.g(ea)||b.cursor>b.f||b.b("\u03b5\u03c4")}}var b=new C,l=[["",-1,25],["\u0386",0,1],["\u0388",0,5],["\u0389",0,7],["\u038a",0,9],["\u038c",0,15],["\u038e",0,20],["\u038f",0,24],["\u0390",0,7],["\u0391",0,1],["\u0392",0,2],["\u0393",0,3],["\u0394",0,4],["\u0395",0,5],["\u0396",0,6],["\u0397",0,7],["\u0398",0,8],["\u0399",0,9],["\u039a",0,10],["\u039b",0,11],["\u039c",0,12],["\u039d",0,13],["\u039e", +0,14],["\u039f",0,15],["\u03a0",0,16],["\u03a1",0,17],["\u03a3",0,18],["\u03a4",0,19],["\u03a5",0,20],["\u03a6",0,21],["\u03a7",0,22],["\u03a8",0,23],["\u03a9",0,24],["\u03aa",0,9],["\u03ab",0,20],["\u03ac",0,1],["\u03ad",0,5],["\u03ae",0,7],["\u03af",0,9],["\u03b0",0,20],["\u03c2",0,18],["\u03ca",0,7],["\u03cb",0,20],["\u03cc",0,15],["\u03cd",0,20],["\u03ce",0,24]],n=[["\u03c3\u03ba\u03b1\u03b3\u03b9\u03b1",-1,2],["\u03c6\u03b1\u03b3\u03b9\u03b1",-1,1],["\u03bf\u03bb\u03bf\u03b3\u03b9\u03b1",-1, 
+3],["\u03c3\u03bf\u03b3\u03b9\u03b1",-1,4],["\u03c4\u03b1\u03c4\u03bf\u03b3\u03b9\u03b1",-1,5],["\u03ba\u03c1\u03b5\u03b1\u03c4\u03b1",-1,6],["\u03c0\u03b5\u03c1\u03b1\u03c4\u03b1",-1,7],["\u03c4\u03b5\u03c1\u03b1\u03c4\u03b1",-1,8],["\u03b3\u03b5\u03b3\u03bf\u03bd\u03bf\u03c4\u03b1",-1,11],["\u03ba\u03b1\u03b8\u03b5\u03c3\u03c4\u03c9\u03c4\u03b1",-1,10],["\u03c6\u03c9\u03c4\u03b1",-1,9],["\u03c0\u03b5\u03c1\u03b1\u03c4\u03b7",-1,7],["\u03c3\u03ba\u03b1\u03b3\u03b9\u03c9\u03bd",-1,2],["\u03c6\u03b1\u03b3\u03b9\u03c9\u03bd", +-1,1],["\u03bf\u03bb\u03bf\u03b3\u03b9\u03c9\u03bd",-1,3],["\u03c3\u03bf\u03b3\u03b9\u03c9\u03bd",-1,4],["\u03c4\u03b1\u03c4\u03bf\u03b3\u03b9\u03c9\u03bd",-1,5],["\u03ba\u03c1\u03b5\u03b1\u03c4\u03c9\u03bd",-1,6],["\u03c0\u03b5\u03c1\u03b1\u03c4\u03c9\u03bd",-1,7],["\u03c4\u03b5\u03c1\u03b1\u03c4\u03c9\u03bd",-1,8],["\u03b3\u03b5\u03b3\u03bf\u03bd\u03bf\u03c4\u03c9\u03bd",-1,11],["\u03ba\u03b1\u03b8\u03b5\u03c3\u03c4\u03c9\u03c4\u03c9\u03bd",-1,10],["\u03c6\u03c9\u03c4\u03c9\u03bd",-1,9],["\u03ba\u03c1\u03b5\u03b1\u03c3", +-1,6],["\u03c0\u03b5\u03c1\u03b1\u03c3",-1,7],["\u03c4\u03b5\u03c1\u03b1\u03c3",-1,8],["\u03b3\u03b5\u03b3\u03bf\u03bd\u03bf\u03c3",-1,11],["\u03ba\u03c1\u03b5\u03b1\u03c4\u03bf\u03c3",-1,6],["\u03c0\u03b5\u03c1\u03b1\u03c4\u03bf\u03c3",-1,7],["\u03c4\u03b5\u03c1\u03b1\u03c4\u03bf\u03c3",-1,8],["\u03b3\u03b5\u03b3\u03bf\u03bd\u03bf\u03c4\u03bf\u03c3",-1,11],["\u03ba\u03b1\u03b8\u03b5\u03c3\u03c4\u03c9\u03c4\u03bf\u03c3",-1,10],["\u03c6\u03c9\u03c4\u03bf\u03c3",-1,9],["\u03ba\u03b1\u03b8\u03b5\u03c3\u03c4\u03c9\u03c3", 
+-1,10],["\u03c6\u03c9\u03c3",-1,9],["\u03c3\u03ba\u03b1\u03b3\u03b9\u03bf\u03c5",-1,2],["\u03c6\u03b1\u03b3\u03b9\u03bf\u03c5",-1,1],["\u03bf\u03bb\u03bf\u03b3\u03b9\u03bf\u03c5",-1,3],["\u03c3\u03bf\u03b3\u03b9\u03bf\u03c5",-1,4],["\u03c4\u03b1\u03c4\u03bf\u03b3\u03b9\u03bf\u03c5",-1,5]],e=[["\u03c0\u03b1",-1,1],["\u03be\u03b1\u03bd\u03b1\u03c0\u03b1",0,1],["\u03b5\u03c0\u03b1",0,1],["\u03c0\u03b5\u03c1\u03b9\u03c0\u03b1",0,1],["\u03b1\u03bd\u03b1\u03bc\u03c0\u03b1",0,1],["\u03b5\u03bc\u03c0\u03b1", +0,1],["\u03b2",-1,2],["\u03b4\u03b1\u03bd\u03b5",-1,1],["\u03b2\u03b1\u03b8\u03c5\u03c1\u03b9",-1,2],["\u03b2\u03b1\u03c1\u03ba",-1,2],["\u03bc\u03b1\u03c1\u03ba",-1,2],["\u03bb",-1,2],["\u03bc",-1,2],["\u03ba\u03bf\u03c1\u03bd",-1,2],["\u03b1\u03b8\u03c1\u03bf",-1,1],["\u03c3\u03c5\u03bd\u03b1\u03b8\u03c1\u03bf",14,1],["\u03c0",-1,2],["\u03b9\u03bc\u03c0",16,2],["\u03c1",-1,2],["\u03bc\u03b1\u03c1",18,2],["\u03b1\u03bc\u03c0\u03b1\u03c1",18,2],["\u03b3\u03ba\u03c1",18,2],["\u03b2\u03bf\u03bb\u03b2\u03bf\u03c1", +18,2],["\u03b3\u03bb\u03c5\u03ba\u03bf\u03c1",18,2],["\u03c0\u03b9\u03c0\u03b5\u03c1\u03bf\u03c1",18,2],["\u03c0\u03c1",18,2],["\u03bc\u03c0\u03c1",25,2],["\u03b1\u03c1\u03c1",18,2],["\u03b3\u03bb\u03c5\u03ba\u03c5\u03c1",18,2],["\u03c0\u03bf\u03bb\u03c5\u03c1",18,2],["\u03bb\u03bf\u03c5",-1,2]],h=[["\u03b9\u03b6\u03b1",-1,1],["\u03b9\u03b6\u03b5",-1,1],["\u03b9\u03b6\u03b1\u03bc\u03b5",-1,1],["\u03b9\u03b6\u03bf\u03c5\u03bc\u03b5",-1,1],["\u03b9\u03b6\u03b1\u03bd\u03b5",-1,1],["\u03b9\u03b6\u03bf\u03c5\u03bd\u03b5", 
+-1,1],["\u03b9\u03b6\u03b1\u03c4\u03b5",-1,1],["\u03b9\u03b6\u03b5\u03c4\u03b5",-1,1],["\u03b9\u03b6\u03b5\u03b9",-1,1],["\u03b9\u03b6\u03b1\u03bd",-1,1],["\u03b9\u03b6\u03bf\u03c5\u03bd",-1,1],["\u03b9\u03b6\u03b5\u03c3",-1,1],["\u03b9\u03b6\u03b5\u03b9\u03c3",-1,1],["\u03b9\u03b6\u03c9",-1,1]],u=[["\u03b2\u03b9",-1,1],["\u03bb\u03b9",-1,1],["\u03b1\u03bb",-1,1],["\u03b5\u03bd",-1,1],["\u03c3",-1,1],["\u03c7",-1,1],["\u03c5\u03c8",-1,1],["\u03b6\u03c9",-1,1]],t=[["\u03c9\u03b8\u03b7\u03ba\u03b1", +-1,1],["\u03c9\u03b8\u03b7\u03ba\u03b5",-1,1],["\u03c9\u03b8\u03b7\u03ba\u03b1\u03bc\u03b5",-1,1],["\u03c9\u03b8\u03b7\u03ba\u03b1\u03bd\u03b5",-1,1],["\u03c9\u03b8\u03b7\u03ba\u03b1\u03c4\u03b5",-1,1],["\u03c9\u03b8\u03b7\u03ba\u03b1\u03bd",-1,1],["\u03c9\u03b8\u03b7\u03ba\u03b5\u03c3",-1,1]],s=[["\u03be\u03b1\u03bd\u03b1\u03c0\u03b1",-1,1],["\u03b5\u03c0\u03b1",-1,1],["\u03c0\u03b5\u03c1\u03b9\u03c0\u03b1",-1,1],["\u03b1\u03bd\u03b1\u03bc\u03c0\u03b1",-1,1],["\u03b5\u03bc\u03c0\u03b1",-1,1],["\u03c7\u03b1\u03c1\u03c4\u03bf\u03c0\u03b1", +-1,1],["\u03b5\u03be\u03b1\u03c1\u03c7\u03b1",-1,1],["\u03b3\u03b5",-1,2],["\u03b3\u03ba\u03b5",-1,2],["\u03ba\u03bb\u03b5",-1,1],["\u03b5\u03ba\u03bb\u03b5",9,1],["\u03b1\u03c0\u03b5\u03ba\u03bb\u03b5",10,1],["\u03b1\u03c0\u03bf\u03ba\u03bb\u03b5",9,1],["\u03b5\u03c3\u03c9\u03ba\u03bb\u03b5",9,1],["\u03b4\u03b1\u03bd\u03b5",-1,1],["\u03c0\u03b5",-1,1],["\u03b5\u03c0\u03b5",15,1],["\u03bc\u03b5\u03c4\u03b5\u03c0\u03b5",16,1],["\u03b5\u03c3\u03b5",-1,1],["\u03b3\u03ba",-1,2],["\u03bc",-1,2],["\u03c0\u03bf\u03c5\u03ba\u03b1\u03bc", 
+20,2],["\u03ba\u03bf\u03bc",20,2],["\u03b1\u03bd",-1,2],["\u03bf\u03bb\u03bf",-1,2],["\u03b1\u03b8\u03c1\u03bf",-1,1],["\u03c3\u03c5\u03bd\u03b1\u03b8\u03c1\u03bf",25,1],["\u03c0",-1,2],["\u03bb\u03b1\u03c1",-1,2],["\u03b4\u03b7\u03bc\u03bf\u03ba\u03c1\u03b1\u03c4",-1,2],["\u03b1\u03c6",-1,2],["\u03b3\u03b9\u03b3\u03b1\u03bd\u03c4\u03bf\u03b1\u03c6",30,2]],r=[["\u03b9\u03c3\u03b1",-1,1],["\u03b9\u03c3\u03b1\u03bc\u03b5",-1,1],["\u03b9\u03c3\u03b1\u03bd\u03b5",-1,1],["\u03b9\u03c3\u03b5",-1,1],["\u03b9\u03c3\u03b1\u03c4\u03b5", +-1,1],["\u03b9\u03c3\u03b1\u03bd",-1,1],["\u03b9\u03c3\u03b5\u03c3",-1,1]],p=[["\u03be\u03b1\u03bd\u03b1\u03c0\u03b1",-1,1],["\u03b5\u03c0\u03b1",-1,1],["\u03c0\u03b5\u03c1\u03b9\u03c0\u03b1",-1,1],["\u03b1\u03bd\u03b1\u03bc\u03c0\u03b1",-1,1],["\u03b5\u03bc\u03c0\u03b1",-1,1],["\u03c7\u03b1\u03c1\u03c4\u03bf\u03c0\u03b1",-1,1],["\u03b5\u03be\u03b1\u03c1\u03c7\u03b1",-1,1],["\u03ba\u03bb\u03b5",-1,1],["\u03b5\u03ba\u03bb\u03b5",7,1],["\u03b1\u03c0\u03b5\u03ba\u03bb\u03b5",8,1],["\u03b1\u03c0\u03bf\u03ba\u03bb\u03b5", +7,1],["\u03b5\u03c3\u03c9\u03ba\u03bb\u03b5",7,1],["\u03b4\u03b1\u03bd\u03b5",-1,1],["\u03c0\u03b5",-1,1],["\u03b5\u03c0\u03b5",13,1],["\u03bc\u03b5\u03c4\u03b5\u03c0\u03b5",14,1],["\u03b5\u03c3\u03b5",-1,1],["\u03b1\u03b8\u03c1\u03bf",-1,1],["\u03c3\u03c5\u03bd\u03b1\u03b8\u03c1\u03bf",17,1]],k=[["\u03b9\u03c3\u03bf\u03c5\u03bc\u03b5",-1,1],["\u03b9\u03c3\u03bf\u03c5\u03bd\u03b5",-1,1],["\u03b9\u03c3\u03b5\u03c4\u03b5",-1,1],["\u03b9\u03c3\u03b5\u03b9",-1,1],["\u03b9\u03c3\u03bf\u03c5\u03bd",-1, 
+1],["\u03b9\u03c3\u03b5\u03b9\u03c3",-1,1],["\u03b9\u03c3\u03c9",-1,1]],v=[["\u03b1\u03c4\u03b1",-1,2],["\u03c6\u03b1",-1,2],["\u03b7\u03c6\u03b1",1,2],["\u03bc\u03b5\u03b3",-1,2],["\u03bb\u03c5\u03b3",-1,2],["\u03b7\u03b4",-1,2],["\u03ba\u03bb\u03b5",-1,1],["\u03b5\u03c3\u03c9\u03ba\u03bb\u03b5",6,1],["\u03c0\u03bb\u03b5",-1,1],["\u03b4\u03b1\u03bd\u03b5",-1,1],["\u03c3\u03b5",-1,1],["\u03b1\u03c3\u03b5",10,1],["\u03ba\u03b1\u03b8",-1,2],["\u03b5\u03c7\u03b8",-1,2],["\u03ba\u03b1\u03ba",-1,2],["\u03bc\u03b1\u03ba", +-1,2],["\u03c3\u03ba",-1,2],["\u03c6\u03b9\u03bb",-1,2],["\u03ba\u03c5\u03bb",-1,2],["\u03bc",-1,2],["\u03b3\u03b5\u03bc",19,2],["\u03b1\u03c7\u03bd",-1,2],["\u03c3\u03c5\u03bd\u03b1\u03b8\u03c1\u03bf",-1,1],["\u03c0",-1,2],["\u03b1\u03c0",23,2],["\u03b5\u03bc\u03c0",23,2],["\u03b5\u03c5\u03c0",23,2],["\u03b1\u03c1",-1,2],["\u03b1\u03bf\u03c1",-1,2],["\u03b3\u03c5\u03c1",-1,2],["\u03c7\u03c1",-1,2],["\u03c7\u03c9\u03c1",-1,2],["\u03ba\u03c4",-1,2],["\u03b1\u03ba\u03c4",32,2],["\u03c7\u03c4",-1,2], +["\u03b1\u03c7\u03c4",34,2],["\u03c4\u03b1\u03c7",-1,2],["\u03c3\u03c7",-1,2],["\u03b1\u03c3\u03c7",37,2],["\u03c5\u03c8",-1,2]],w=[["\u03b9\u03c3\u03c4\u03b1",-1,1],["\u03b9\u03c3\u03c4\u03b5",-1,1],["\u03b9\u03c3\u03c4\u03b7",-1,1],["\u03b9\u03c3\u03c4\u03bf\u03b9",-1,1],["\u03b9\u03c3\u03c4\u03c9\u03bd",-1,1],["\u03b9\u03c3\u03c4\u03bf",-1,1],["\u03b9\u03c3\u03c4\u03b5\u03c3",-1,1],["\u03b9\u03c3\u03c4\u03b7\u03c3",-1,1],["\u03b9\u03c3\u03c4\u03bf\u03c3",-1,1],["\u03b9\u03c3\u03c4\u03bf\u03c5\u03c3", 
+-1,1],["\u03b9\u03c3\u03c4\u03bf\u03c5",-1,1]],y=[["\u03b5\u03b3\u03ba\u03bb\u03b5",-1,1],["\u03b1\u03c0\u03bf\u03ba\u03bb\u03b5",-1,1],["\u03b4\u03b1\u03bd\u03b5",-1,2],["\u03b1\u03bd\u03c4\u03b9\u03b4\u03b1\u03bd\u03b5",2,2],["\u03c3\u03b5",-1,1],["\u03bc\u03b5\u03c4\u03b1\u03c3\u03b5",4,1],["\u03bc\u03b9\u03ba\u03c1\u03bf\u03c3\u03b5",4,1]],z=[["\u03b1\u03c4\u03bf\u03bc\u03b9\u03ba",-1,2],["\u03b5\u03b8\u03bd\u03b9\u03ba",-1,4],["\u03c4\u03bf\u03c0\u03b9\u03ba",-1,7],["\u03b5\u03ba\u03bb\u03b5\u03ba\u03c4\u03b9\u03ba", +-1,5],["\u03c3\u03ba\u03b5\u03c0\u03c4\u03b9\u03ba",-1,6],["\u03b3\u03bd\u03c9\u03c3\u03c4\u03b9\u03ba",-1,3],["\u03b1\u03b3\u03bd\u03c9\u03c3\u03c4\u03b9\u03ba",5,1],["\u03b1\u03bb\u03b5\u03be\u03b1\u03bd\u03b4\u03c1\u03b9\u03bd",-1,8],["\u03b8\u03b5\u03b1\u03c4\u03c1\u03b9\u03bd",-1,10],["\u03b2\u03c5\u03b6\u03b1\u03bd\u03c4\u03b9\u03bd",-1,9]],c=[["\u03b9\u03c3\u03bc\u03bf\u03b9",-1,1],["\u03b9\u03c3\u03bc\u03c9\u03bd",-1,1],["\u03b9\u03c3\u03bc\u03bf",-1,1],["\u03b9\u03c3\u03bc\u03bf\u03c3",-1, +1],["\u03b9\u03c3\u03bc\u03bf\u03c5\u03c3",-1,1],["\u03b9\u03c3\u03bc\u03bf\u03c5",-1,1]],A=[["\u03c3",-1,1],["\u03c7",-1,1]],D=[["\u03bf\u03c5\u03b4\u03b1\u03ba\u03b9\u03b1",-1,1],["\u03b1\u03c1\u03b1\u03ba\u03b9\u03b1",-1,1],["\u03bf\u03c5\u03b4\u03b1\u03ba\u03b9",-1,1],["\u03b1\u03c1\u03b1\u03ba\u03b9",-1,1]],x=[["\u03b2",-1,2],["\u03b2\u03b1\u03bc\u03b2",0,1],["\u03c3\u03bb\u03bf\u03b2",0,1],["\u03c4\u03c3\u03b5\u03c7\u03bf\u03c3\u03bb\u03bf\u03b2",2,1],["\u03ba\u03b1\u03c1\u03b4",-1,2],["\u03b6", 
+-1,2],["\u03c4\u03b6",5,1],["\u03ba",-1,1],["\u03ba\u03b1\u03c0\u03b1\u03ba",7,1],["\u03c3\u03bf\u03ba",7,1],["\u03c3\u03ba",7,1],["\u03b2\u03b1\u03bb",-1,2],["\u03bc\u03b1\u03bb",-1,1],["\u03b3\u03bb",-1,2],["\u03c4\u03c1\u03b9\u03c0\u03bf\u03bb",-1,2],["\u03c0\u03bb",-1,1],["\u03bb\u03bf\u03c5\u03bb",-1,1],["\u03c6\u03c5\u03bb",-1,1],["\u03ba\u03b1\u03b9\u03bc",-1,1],["\u03ba\u03bb\u03b9\u03bc",-1,1],["\u03c6\u03b1\u03c1\u03bc",-1,1],["\u03b3\u03b9\u03b1\u03bd",-1,2],["\u03c3\u03c0\u03b1\u03bd", +-1,1],["\u03b7\u03b3\u03bf\u03c5\u03bc\u03b5\u03bd",-1,2],["\u03ba\u03bf\u03bd",-1,1],["\u03bc\u03b1\u03ba\u03c1\u03c5\u03bd",-1,2],["\u03c0",-1,2],["\u03ba\u03b1\u03c4\u03c1\u03b1\u03c0",26,1],["\u03c1",-1,1],["\u03b2\u03c1",28,1],["\u03bb\u03b1\u03b2\u03c1",29,1],["\u03b1\u03bc\u03b2\u03c1",29,1],["\u03bc\u03b5\u03c1",28,1],["\u03c0\u03b1\u03c4\u03b5\u03c1",28,2],["\u03b1\u03bd\u03b8\u03c1",28,1],["\u03ba\u03bf\u03c1",28,1],["\u03c3",-1,1],["\u03bd\u03b1\u03b3\u03ba\u03b1\u03c3",36,1],["\u03c4\u03bf\u03c3", +36,2],["\u03bc\u03bf\u03c5\u03c3\u03c4",-1,1],["\u03c1\u03c5",-1,1],["\u03c6",-1,1],["\u03c3\u03c6",41,1],["\u03b1\u03bb\u03b9\u03c3\u03c6",42,1],["\u03bd\u03c5\u03c6",41,2],["\u03c7",-1,1]],G=[["\u03b1\u03ba\u03b9\u03b1",-1,1],["\u03b1\u03c1\u03b1\u03ba\u03b9\u03b1",0,1],["\u03b9\u03c4\u03c3\u03b1",-1,1],["\u03b1\u03ba\u03b9",-1,1],["\u03b1\u03c1\u03b1\u03ba\u03b9",3,1],["\u03b9\u03c4\u03c3\u03c9\u03bd",-1,1],["\u03b9\u03c4\u03c3\u03b1\u03c3",-1,1],["\u03b9\u03c4\u03c3\u03b5\u03c3",-1,1]],E=[["\u03c8\u03b1\u03bb", 
+-1,1],["\u03b1\u03b9\u03c6\u03bd",-1,1],["\u03bf\u03bb\u03bf",-1,1],["\u03b9\u03c1",-1,1]],J=[["\u03b5",-1,1],["\u03c0\u03b1\u03b9\u03c7\u03bd",-1,1]],O=[["\u03b9\u03b4\u03b9\u03b1",-1,1],["\u03b9\u03b4\u03b9\u03c9\u03bd",-1,1],["\u03b9\u03b4\u03b9\u03bf",-1,1]],N=[["\u03b9\u03b2",-1,1],["\u03b4",-1,1],["\u03c6\u03c1\u03b1\u03b3\u03ba",-1,1],["\u03bb\u03c5\u03ba",-1,1],["\u03bf\u03b2\u03b5\u03bb",-1,1],["\u03bc\u03b7\u03bd",-1,1],["\u03c1",-1,1]],P=[["\u03b9\u03c3\u03ba\u03b5",-1,1],["\u03b9\u03c3\u03ba\u03bf", +-1,1],["\u03b9\u03c3\u03ba\u03bf\u03c3",-1,1],["\u03b9\u03c3\u03ba\u03bf\u03c5",-1,1]],Q=[["\u03b1\u03b4\u03c9\u03bd",-1,1],["\u03b1\u03b4\u03b5\u03c3",-1,1]],T=[["\u03b3\u03b9\u03b1\u03b3\u03b9",-1,-1],["\u03b8\u03b5\u03b9",-1,-1],["\u03bf\u03ba",-1,-1],["\u03bc\u03b1\u03bc",-1,-1],["\u03bc\u03b1\u03bd",-1,-1],["\u03bc\u03c0\u03b1\u03bc\u03c0",-1,-1],["\u03c0\u03b5\u03b8\u03b5\u03c1",-1,-1],["\u03c0\u03b1\u03c4\u03b5\u03c1",-1,-1],["\u03ba\u03c5\u03c1",-1,-1],["\u03bd\u03c4\u03b1\u03bd\u03c4",-1, +-1]],U=[["\u03b5\u03b4\u03c9\u03bd",-1,1],["\u03b5\u03b4\u03b5\u03c3",-1,1]],R=[["\u03bc\u03b9\u03bb",-1,1],["\u03b4\u03b1\u03c0",-1,1],["\u03b3\u03b7\u03c0",-1,1],["\u03b9\u03c0",-1,1],["\u03b5\u03bc\u03c0",-1,1],["\u03bf\u03c0",-1,1],["\u03ba\u03c1\u03b1\u03c3\u03c0",-1,1],["\u03c5\u03c0",-1,1]],S=[["\u03bf\u03c5\u03b4\u03c9\u03bd",-1,1],["\u03bf\u03c5\u03b4\u03b5\u03c3",-1,1]],V=[["\u03c4\u03c1\u03b1\u03b3",-1,1],["\u03c6\u03b5",-1,1],["\u03ba\u03b1\u03bb\u03b9\u03b1\u03ba",-1,1],["\u03b1\u03c1\u03ba", 
+-1,1],["\u03c3\u03ba",-1,1],["\u03c0\u03b5\u03c4\u03b1\u03bb",-1,1],["\u03b2\u03b5\u03bb",-1,1],["\u03bb\u03bf\u03c5\u03bb",-1,1],["\u03c6\u03bb",-1,1],["\u03c7\u03bd",-1,1],["\u03c0\u03bb\u03b5\u03be",-1,1],["\u03c3\u03c0",-1,1],["\u03c6\u03c1",-1,1],["\u03c3",-1,1],["\u03bb\u03b9\u03c7",-1,1]],M=[["\u03b5\u03c9\u03bd",-1,1],["\u03b5\u03c9\u03c3",-1,1]],K=[["\u03b4",-1,1],["\u03b9\u03b4",0,1],["\u03b8",-1,1],["\u03b3\u03b1\u03bb",-1,1],["\u03b5\u03bb",-1,1],["\u03bd",-1,1],["\u03c0",-1,1],["\u03c0\u03b1\u03c1", +-1,1]],H=[["\u03b9\u03b1",-1,1],["\u03b9\u03c9\u03bd",-1,1],["\u03b9\u03bf\u03c5",-1,1]],I=[["\u03b9\u03ba\u03b1",-1,1],["\u03b9\u03ba\u03c9\u03bd",-1,1],["\u03b9\u03ba\u03bf",-1,1],["\u03b9\u03ba\u03bf\u03c5",-1,1]],L=[["\u03b1\u03b4",-1,1],["\u03c3\u03c5\u03bd\u03b1\u03b4",0,1],["\u03ba\u03b1\u03c4\u03b1\u03b4",0,1],["\u03b1\u03bd\u03c4\u03b9\u03b4",-1,1],["\u03b5\u03bd\u03b4",-1,1],["\u03c6\u03c5\u03bb\u03bf\u03b4",-1,1],["\u03c5\u03c0\u03bf\u03b4",-1,1],["\u03c0\u03c1\u03c9\u03c4\u03bf\u03b4", +-1,1],["\u03b5\u03be\u03c9\u03b4",-1,1],["\u03b7\u03b8",-1,1],["\u03b1\u03bd\u03b7\u03b8",9,1],["\u03be\u03b9\u03ba",-1,1],["\u03b1\u03bb",-1,1],["\u03b1\u03bc\u03bc\u03bf\u03c7\u03b1\u03bb",12,1],["\u03c3\u03c5\u03bd\u03bf\u03bc\u03b7\u03bb",-1,1],["\u03bc\u03c0\u03bf\u03bb",-1,1],["\u03bc\u03bf\u03c5\u03bb",-1,1],["\u03c4\u03c3\u03b1\u03bc",-1,1],["\u03b2\u03c1\u03c9\u03bc",-1,1],["\u03b1\u03bc\u03b1\u03bd",-1,1],["\u03bc\u03c0\u03b1\u03bd",-1,1],["\u03ba\u03b1\u03bb\u03bb\u03b9\u03bd",-1,1],["\u03c0\u03bf\u03c3\u03c4\u03b5\u03bb\u03bd", 
+-1,1],["\u03c6\u03b9\u03bb\u03bf\u03bd",-1,1],["\u03ba\u03b1\u03bb\u03c0",-1,1],["\u03b3\u03b5\u03c1",-1,1],["\u03c7\u03b1\u03c3",-1,1],["\u03bc\u03c0\u03bf\u03c3",-1,1],["\u03c0\u03bb\u03b9\u03b1\u03c4\u03c3",-1,1],["\u03c0\u03b5\u03c4\u03c3",-1,1],["\u03c0\u03b9\u03c4\u03c3",-1,1],["\u03c6\u03c5\u03c3",-1,1],["\u03bc\u03c0\u03b1\u03b3\u03b9\u03b1\u03c4",-1,1],["\u03bd\u03b9\u03c4",-1,1],["\u03c0\u03b9\u03ba\u03b1\u03bd\u03c4",-1,1],["\u03c3\u03b5\u03c1\u03c4",-1,1]],$=[["\u03b1\u03b3\u03b1\u03bc\u03b5", +-1,1],["\u03b7\u03ba\u03b1\u03bc\u03b5",-1,1],["\u03b7\u03b8\u03b7\u03ba\u03b1\u03bc\u03b5",1,1],["\u03b7\u03c3\u03b1\u03bc\u03b5",-1,1],["\u03bf\u03c5\u03c3\u03b1\u03bc\u03b5",-1,1]],aa=[["\u03b2\u03bf\u03c5\u03b2",-1,1],["\u03be\u03b5\u03b8",-1,1],["\u03c0\u03b5\u03b8",-1,1],["\u03b1\u03c0\u03bf\u03b8",-1,1],["\u03b1\u03c0\u03bf\u03ba",-1,1],["\u03bf\u03c5\u03bb",-1,1],["\u03b1\u03bd\u03b1\u03c0",-1,1],["\u03c0\u03b9\u03ba\u03c1",-1,1],["\u03c0\u03bf\u03c4",-1,1],["\u03b1\u03c0\u03bf\u03c3\u03c4", +-1,1],["\u03c7",-1,1],["\u03c3\u03b9\u03c7",10,1]],ba=[["\u03c4\u03c1",-1,1],["\u03c4\u03c3",-1,1]],ca=[["\u03b1\u03b3\u03b1\u03bd\u03b5",-1,1],["\u03b7\u03ba\u03b1\u03bd\u03b5",-1,1],["\u03b7\u03b8\u03b7\u03ba\u03b1\u03bd\u03b5",1,1],["\u03b7\u03c3\u03b1\u03bd\u03b5",-1,1],["\u03bf\u03c5\u03c3\u03b1\u03bd\u03b5",-1,1],["\u03bf\u03bd\u03c4\u03b1\u03bd\u03b5",-1,1],["\u03b9\u03bf\u03bd\u03c4\u03b1\u03bd\u03b5",5,1],["\u03bf\u03c5\u03bd\u03c4\u03b1\u03bd\u03b5",-1,1],["\u03b9\u03bf\u03c5\u03bd\u03c4\u03b1\u03bd\u03b5", 
+7,1],["\u03bf\u03c4\u03b1\u03bd\u03b5",-1,1],["\u03b9\u03bf\u03c4\u03b1\u03bd\u03b5",9,1]],F=[["\u03c4\u03b1\u03b2",-1,1],["\u03bd\u03c4\u03b1\u03b2",0,1],["\u03c8\u03b7\u03bb\u03bf\u03c4\u03b1\u03b2",0,1],["\u03bb\u03b9\u03b2",-1,1],["\u03ba\u03bb\u03b9\u03b2",3,1],["\u03be\u03b7\u03c1\u03bf\u03ba\u03bb\u03b9\u03b2",4,1],["\u03b3",-1,1],["\u03b1\u03b3",6,1],["\u03c4\u03c1\u03b1\u03b3",7,1],["\u03c4\u03c3\u03b1\u03b3",7,1],["\u03b1\u03b8\u03b9\u03b3\u03b3",6,1],["\u03c4\u03c3\u03b9\u03b3\u03b3",6, +1],["\u03b1\u03c4\u03c3\u03b9\u03b3\u03b3",11,1],["\u03c3\u03c4\u03b5\u03b3",6,1],["\u03b1\u03c0\u03b7\u03b3",6,1],["\u03c3\u03b9\u03b3",6,1],["\u03b1\u03bd\u03bf\u03c1\u03b3",6,1],["\u03b5\u03bd\u03bf\u03c1\u03b3",6,1],["\u03ba\u03b1\u03bb\u03c0\u03bf\u03c5\u03b6",-1,1],["\u03b8",-1,1],["\u03bc\u03c9\u03b1\u03bc\u03b5\u03b8",19,1],["\u03c0\u03b9\u03b8",19,1],["\u03b1\u03c0\u03b9\u03b8",21,1],["\u03b4\u03b5\u03ba",-1,1],["\u03c0\u03b5\u03bb\u03b5\u03ba",-1,1],["\u03b9\u03ba",-1,1],["\u03b1\u03bd\u03b9\u03ba", +25,1],["\u03b2\u03bf\u03c5\u03bb\u03ba",-1,1],["\u03b2\u03b1\u03c3\u03ba",-1,1],["\u03b2\u03c1\u03b1\u03c7\u03c5\u03ba",-1,1],["\u03b3\u03b1\u03bb",-1,1],["\u03ba\u03b1\u03c4\u03b1\u03b3\u03b1\u03bb",30,1],["\u03bf\u03bb\u03bf\u03b3\u03b1\u03bb",30,1],["\u03b2\u03b1\u03b8\u03c5\u03b3\u03b1\u03bb",30,1],["\u03bc\u03b5\u03bb",-1,1],["\u03ba\u03b1\u03c3\u03c4\u03b5\u03bb",-1,1],["\u03c0\u03bf\u03c1\u03c4\u03bf\u03bb",-1,1],["\u03c0\u03bb",-1,1],["\u03b4\u03b9\u03c0\u03bb",37,1],["\u03bb\u03b1\u03bf\u03c0\u03bb", 
+37,1],["\u03c8\u03c5\u03c7\u03bf\u03c0\u03bb",37,1],["\u03bf\u03c5\u03bb",-1,1],["\u03bc",-1,1],["\u03bf\u03bb\u03b9\u03b3\u03bf\u03b4\u03b1\u03bc",42,1],["\u03bc\u03bf\u03c5\u03c3\u03bf\u03c5\u03bb\u03bc",42,1],["\u03b4\u03c1\u03b1\u03b4\u03bf\u03c5\u03bc",42,1],["\u03b2\u03c1\u03b1\u03c7\u03bc",42,1],["\u03bd",-1,1],["\u03b1\u03bc\u03b5\u03c1\u03b9\u03ba\u03b1\u03bd",47,1],["\u03c0",-1,1],["\u03b1\u03b4\u03b1\u03c0",49,1],["\u03c7\u03b1\u03bc\u03b7\u03bb\u03bf\u03b4\u03b1\u03c0",49,1],["\u03c0\u03bf\u03bb\u03c5\u03b4\u03b1\u03c0", +49,1],["\u03ba\u03bf\u03c0",49,1],["\u03c5\u03c0\u03bf\u03ba\u03bf\u03c0",53,1],["\u03c4\u03c3\u03bf\u03c0",49,1],["\u03c3\u03c0",49,1],["\u03b5\u03c1",-1,1],["\u03b3\u03b5\u03c1",57,1],["\u03b2\u03b5\u03c4\u03b5\u03c1",57,1],["\u03bb\u03bf\u03c5\u03b8\u03b7\u03c1",-1,1],["\u03ba\u03bf\u03c1\u03bc\u03bf\u03c1",-1,1],["\u03c0\u03b5\u03c1\u03b9\u03c4\u03c1",-1,1],["\u03bf\u03c5\u03c1",-1,1],["\u03c3",-1,1],["\u03b2\u03b1\u03c3",64,1],["\u03c0\u03bf\u03bb\u03b9\u03c3",64,1],["\u03c3\u03b1\u03c1\u03b1\u03ba\u03b1\u03c4\u03c3", +64,1],["\u03b8\u03c5\u03c3",64,1],["\u03b4\u03b9\u03b1\u03c4",-1,1],["\u03c0\u03bb\u03b1\u03c4",-1,1],["\u03c4\u03c3\u03b1\u03c1\u03bb\u03b1\u03c4",-1,1],["\u03c4\u03b5\u03c4",-1,1],["\u03c0\u03bf\u03c5\u03c1\u03b9\u03c4",-1,1],["\u03c3\u03bf\u03c5\u03bb\u03c4",-1,1],["\u03bc\u03b1\u03b9\u03bd\u03c4",-1,1],["\u03b6\u03c9\u03bd\u03c4",-1,1],["\u03ba\u03b1\u03c3\u03c4",-1,1],["\u03c6",-1,1],["\u03b4\u03b9\u03b1\u03c6",78,1],["\u03c3\u03c4\u03b5\u03c6",78,1],["\u03c6\u03c9\u03c4\u03bf\u03c3\u03c4\u03b5\u03c6", 
+80,1],["\u03c0\u03b5\u03c1\u03b7\u03c6",78,1],["\u03c5\u03c0\u03b5\u03c1\u03b7\u03c6",82,1],["\u03ba\u03bf\u03b9\u03bb\u03b1\u03c1\u03c6",78,1],["\u03c0\u03b5\u03bd\u03c4\u03b1\u03c1\u03c6",78,1],["\u03bf\u03c1\u03c6",78,1],["\u03c7",-1,1],["\u03b1\u03bc\u03b7\u03c7",87,1],["\u03b2\u03b9\u03bf\u03bc\u03b7\u03c7",87,1],["\u03bc\u03b5\u03b3\u03bb\u03bf\u03b2\u03b9\u03bf\u03bc\u03b7\u03c7",89,1],["\u03ba\u03b1\u03c0\u03bd\u03bf\u03b2\u03b9\u03bf\u03bc\u03b7\u03c7",89,1],["\u03bc\u03b9\u03ba\u03c1\u03bf\u03b2\u03b9\u03bf\u03bc\u03b7\u03c7", +89,1],["\u03c0\u03bf\u03bb\u03c5\u03bc\u03b7\u03c7",87,1],["\u03bb\u03b9\u03c7",87,1]],W=[["\u03b7\u03c3\u03b5\u03c4\u03b5",-1,1]],da=[["\u03b5\u03bd\u03b4",-1,1],["\u03c3\u03c5\u03bd\u03b4",-1,1],["\u03bf\u03b4",-1,1],["\u03b4\u03b9\u03b1\u03b8",-1,1],["\u03ba\u03b1\u03b8",-1,1],["\u03c1\u03b1\u03b8",-1,1],["\u03c4\u03b1\u03b8",-1,1],["\u03c4\u03b9\u03b8",-1,1],["\u03b5\u03ba\u03b8",-1,1],["\u03b5\u03bd\u03b8",-1,1],["\u03c3\u03c5\u03bd\u03b8",-1,1],["\u03c1\u03bf\u03b8",-1,1],["\u03c5\u03c0\u03b5\u03c1\u03b8", +-1,1],["\u03c3\u03b8",-1,1],["\u03b5\u03c5\u03b8",-1,1],["\u03b1\u03c1\u03ba",-1,1],["\u03c9\u03c6\u03b5\u03bb",-1,1],["\u03b2\u03bf\u03bb",-1,1],["\u03b1\u03b9\u03bd",-1,1],["\u03c0\u03bf\u03bd",-1,1],["\u03c1\u03bf\u03bd",-1,1],["\u03c3\u03c5\u03bd",-1,1],["\u03b2\u03b1\u03c1",-1,1],["\u03b2\u03c1",-1,1],["\u03b1\u03b9\u03c1",-1,1],["\u03c6\u03bf\u03c1",-1,1],["\u03b5\u03c5\u03c1",-1,1],["\u03c0\u03c5\u03c1",-1,1],["\u03c7\u03c9\u03c1",-1,1],["\u03bd\u03b5\u03c4",-1,1],["\u03c3\u03c7",-1,1]],ea= 
+[["\u03c0\u03b1\u03b3",-1,1],["\u03b4",-1,1],["\u03b1\u03b4",1,1],["\u03b8",-1,1],["\u03b1\u03b8",3,1],["\u03c4\u03bf\u03ba",-1,1],["\u03c3\u03ba",-1,1],["\u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03bb",-1,1],["\u03c3\u03ba\u03b5\u03bb",-1,1],["\u03b1\u03c0\u03bb",-1,1],["\u03b5\u03bc",-1,1],["\u03b1\u03bd",-1,1],["\u03b2\u03b5\u03bd",-1,1],["\u03b2\u03b1\u03c1\u03bf\u03bd",-1,1],["\u03ba\u03bf\u03c0",-1,1],["\u03c3\u03b5\u03c1\u03c0",-1,1],["\u03b1\u03b2\u03b1\u03c1",-1,1],["\u03b5\u03bd\u03b1\u03c1", +-1,1],["\u03b1\u03b2\u03c1",-1,1],["\u03bc\u03c0\u03bf\u03c1",-1,1],["\u03b8\u03b1\u03c1\u03c1",-1,1],["\u03bd\u03c4\u03c1",-1,1],["\u03c5",-1,1],["\u03bd\u03b9\u03c6",-1,1],["\u03c3\u03c5\u03c1\u03c6",-1,1]],fa=[["\u03bf\u03bd\u03c4\u03b1\u03c3",-1,1],["\u03c9\u03bd\u03c4\u03b1\u03c3",-1,1]],ga=[["\u03bf\u03bc\u03b1\u03c3\u03c4\u03b5",-1,1],["\u03b9\u03bf\u03bc\u03b1\u03c3\u03c4\u03b5",0,1]],Y=[["\u03c0",-1,1],["\u03b1\u03c0",0,1],["\u03b1\u03ba\u03b1\u03c4\u03b1\u03c0",1,1],["\u03c3\u03c5\u03bc\u03c0", +0,1],["\u03b1\u03c3\u03c5\u03bc\u03c0",3,1],["\u03b1\u03bc\u03b5\u03c4\u03b1\u03bc\u03c6",-1,1]],Z=[["\u03b6",-1,1],["\u03b1\u03bb",-1,1],["\u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03bb",1,1],["\u03b5\u03ba\u03c4\u03b5\u03bb",-1,1],["\u03bc",-1,1],["\u03be",-1,1],["\u03c0\u03c1\u03bf",-1,1],["\u03b1\u03c1",-1,1],["\u03bd\u03b9\u03c3",-1,1]],X=[["\u03b7\u03b8\u03b7\u03ba\u03b1",-1,1],["\u03b7\u03b8\u03b7\u03ba\u03b5",-1,1],["\u03b7\u03b8\u03b7\u03ba\u03b5\u03c3",-1,1]],ja=[["\u03c0\u03b9\u03b8",-1,1], 
+["\u03bf\u03b8",-1,1],["\u03bd\u03b1\u03c1\u03b8",-1,1],["\u03c3\u03ba\u03bf\u03c5\u03bb",-1,1],["\u03c3\u03ba\u03c9\u03bb",-1,1],["\u03c3\u03c6",-1,1]],ka=[["\u03b8",-1,1],["\u03b4\u03b9\u03b1\u03b8",0,1],["\u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03c4\u03b1\u03b8",0,1],["\u03c3\u03c5\u03bd\u03b8",0,1],["\u03c0\u03c1\u03bf\u03c3\u03b8",0,1]],la=[["\u03b7\u03ba\u03b1",-1,1],["\u03b7\u03ba\u03b5",-1,1],["\u03b7\u03ba\u03b5\u03c3",-1,1]],ma=[["\u03c6\u03b1\u03b3",-1,1],["\u03bb\u03b7\u03b3",-1,1],["\u03c6\u03c1\u03c5\u03b4", +-1,1],["\u03bc\u03b1\u03bd\u03c4\u03b9\u03bb",-1,1],["\u03bc\u03b1\u03bb\u03bb",-1,1],["\u03bf\u03bc",-1,1],["\u03b2\u03bb\u03b5\u03c0",-1,1],["\u03c0\u03bf\u03b4\u03b1\u03c1",-1,1],["\u03ba\u03c5\u03bc\u03b1\u03c4",-1,1],["\u03c0\u03c1\u03c9\u03c4",-1,1],["\u03bb\u03b1\u03c7",-1,1],["\u03c0\u03b1\u03bd\u03c4\u03b1\u03c7",-1,1]],na=[["\u03c4\u03c3\u03b1",-1,1],["\u03c7\u03b1\u03b4",-1,1],["\u03bc\u03b5\u03b4",-1,1],["\u03bb\u03b1\u03bc\u03c0\u03b9\u03b4",-1,1],["\u03b4\u03b5",-1,1],["\u03c0\u03bb\u03b5", +-1,1],["\u03bc\u03b5\u03c3\u03b1\u03b6",-1,1],["\u03b4\u03b5\u03c3\u03c0\u03bf\u03b6",-1,1],["\u03b1\u03b9\u03b8",-1,1],["\u03c6\u03b1\u03c1\u03bc\u03b1\u03ba",-1,1],["\u03b1\u03b3\u03ba",-1,1],["\u03b1\u03bd\u03b7\u03ba",-1,1],["\u03bb",-1,1],["\u03bc",-1,1],["\u03b1\u03bc",13,1],["\u03b2\u03c1\u03bf\u03bc",13,1],["\u03c5\u03c0\u03bf\u03c4\u03b5\u03b9\u03bd",-1,1],["\u03b5\u03ba\u03bb\u03b9\u03c0",-1,1],["\u03c1",-1,1],["\u03b5\u03bd\u03b4\u03b9\u03b1\u03c6\u03b5\u03c1",18,1],["\u03b1\u03bd\u03b1\u03c1\u03c1", 
+18,1],["\u03c0\u03b1\u03c4",-1,1],["\u03ba\u03b1\u03b8\u03b1\u03c1\u03b5\u03c5",-1,1],["\u03b4\u03b5\u03c5\u03c4\u03b5\u03c1\u03b5\u03c5",-1,1],["\u03bb\u03b5\u03c7",-1,1]],oa=[["\u03bf\u03c5\u03c3\u03b1",-1,1],["\u03bf\u03c5\u03c3\u03b5",-1,1],["\u03bf\u03c5\u03c3\u03b5\u03c3",-1,1]],pa=[["\u03c0\u03b5\u03bb",-1,1],["\u03bb\u03bb",-1,1],["\u03c3\u03bc\u03b7\u03bd",-1,1],["\u03c1\u03c0",-1,1],["\u03c0\u03c1",-1,1],["\u03c6\u03c1",-1,1],["\u03c7\u03bf\u03c1\u03c4",-1,1],["\u03bf\u03c6",-1,1],["\u03c8\u03bf\u03c6", +7,-1],["\u03c3\u03c6",-1,1],["\u03bb\u03bf\u03c7",-1,1],["\u03bd\u03b1\u03c5\u03bb\u03bf\u03c7",10,-1]],qa=[["\u03b1\u03bc\u03b1\u03bb\u03bb\u03b9",-1,1],["\u03bb",-1,1],["\u03b1\u03bc\u03b1\u03bb",1,1],["\u03bc",-1,1],["\u03bf\u03c5\u03bb\u03b1\u03bc",3,1],["\u03b5\u03bd",-1,1],["\u03b4\u03b5\u03c1\u03b2\u03b5\u03bd",5,1],["\u03c0",-1,1],["\u03b1\u03b5\u03b9\u03c0",7,1],["\u03b1\u03c1\u03c4\u03b9\u03c0",7,1],["\u03c3\u03c5\u03bc\u03c0",7,1],["\u03bd\u03b5\u03bf\u03c0",7,1],["\u03ba\u03c1\u03bf\u03ba\u03b1\u03bb\u03bf\u03c0", +7,1],["\u03bf\u03bb\u03bf\u03c0",7,1],["\u03c0\u03c1\u03bf\u03c3\u03c9\u03c0\u03bf\u03c0",7,1],["\u03c3\u03b9\u03b4\u03b7\u03c1\u03bf\u03c0",7,1],["\u03b4\u03c1\u03bf\u03c3\u03bf\u03c0",7,1],["\u03b1\u03c3\u03c0",7,1],["\u03b1\u03bd\u03c5\u03c0",7,1],["\u03c1",-1,1],["\u03b1\u03c3\u03c0\u03b1\u03c1",19,1],["\u03c7\u03b1\u03c1",19,1],["\u03b1\u03c7\u03b1\u03c1",21,1],["\u03b1\u03c0\u03b5\u03c1",19,1],["\u03c4\u03c1",19,1],["\u03bf\u03c5\u03c1",19,1],["\u03c4",-1,1],["\u03b4\u03b9\u03b1\u03c4",26,1], 
+["\u03b5\u03c0\u03b9\u03c4",26,1],["\u03c3\u03c5\u03bd\u03c4",26,1],["\u03bf\u03bc\u03bf\u03c4",26,1],["\u03bd\u03bf\u03bc\u03bf\u03c4",30,1],["\u03b1\u03c0\u03bf\u03c4",26,1],["\u03c5\u03c0\u03bf\u03c4",26,1],["\u03b1\u03b2\u03b1\u03c3\u03c4",26,1],["\u03b1\u03b9\u03bc\u03bf\u03c3\u03c4",26,1],["\u03c0\u03c1\u03bf\u03c3\u03c4",26,1],["\u03b1\u03bd\u03c5\u03c3\u03c4",26,1],["\u03bd\u03b1\u03c5",-1,1],["\u03b1\u03c6",-1,1],["\u03be\u03b5\u03c6",-1,1],["\u03b1\u03b4\u03b7\u03c6",-1,1],["\u03c0\u03b1\u03bc\u03c6", +-1,1],["\u03c0\u03bf\u03bb\u03c5\u03c6",-1,1]],ra=[["\u03b1\u03b3\u03b1",-1,1],["\u03b1\u03b3\u03b5",-1,1],["\u03b1\u03b3\u03b5\u03c3",-1,1]],sa=[["\u03b7\u03c3\u03b1",-1,1],["\u03b7\u03c3\u03b5",-1,1],["\u03b7\u03c3\u03bf\u03c5",-1,1]],ta=[["\u03bd",-1,1],["\u03b4\u03c9\u03b4\u03b5\u03ba\u03b1\u03bd",0,1],["\u03b5\u03c0\u03c4\u03b1\u03bd",0,1],["\u03bc\u03b5\u03b3\u03b1\u03bb\u03bf\u03bd",0,1],["\u03b5\u03c1\u03b7\u03bc\u03bf\u03bd",0,1],["\u03c7\u03b5\u03c1\u03c3\u03bf\u03bd",0,1]],ua=[["\u03b7\u03c3\u03c4\u03b5", +-1,1]],va=[["\u03c3\u03b2",-1,1],["\u03b1\u03c3\u03b2",0,1],["\u03b1\u03c0\u03bb",-1,1],["\u03b1\u03b5\u03b9\u03bc\u03bd",-1,1],["\u03c7\u03c1",-1,1],["\u03b1\u03c7\u03c1",4,1],["\u03ba\u03bf\u03b9\u03bd\u03bf\u03c7\u03c1",4,1],["\u03b4\u03c5\u03c3\u03c7\u03c1",4,1],["\u03b5\u03c5\u03c7\u03c1",4,1],["\u03c0\u03b1\u03bb\u03b9\u03bc\u03c8",-1,1]],wa=[["\u03bf\u03c5\u03bd\u03b5",-1,1],["\u03b7\u03b8\u03bf\u03c5\u03bd\u03b5",0,1],["\u03b7\u03c3\u03bf\u03c5\u03bd\u03b5",0,1]],xa=[["\u03c3\u03c0\u03b9", 
+-1,1],["\u03bd",-1,1],["\u03b5\u03be\u03c9\u03bd",1,1],["\u03c1",-1,1],["\u03c3\u03c4\u03c1\u03b1\u03b2\u03bf\u03bc\u03bf\u03c5\u03c4\u03c3",-1,1],["\u03ba\u03b1\u03ba\u03bf\u03bc\u03bf\u03c5\u03c4\u03c3",-1,1]],ya=[["\u03bf\u03c5\u03bc\u03b5",-1,1],["\u03b7\u03b8\u03bf\u03c5\u03bc\u03b5",0,1],["\u03b7\u03c3\u03bf\u03c5\u03bc\u03b5",0,1]],za=[["\u03b1\u03b6",-1,1],["\u03c9\u03c1\u03b9\u03bf\u03c0\u03bb",-1,1],["\u03b1\u03c3\u03bf\u03c5\u03c3",-1,1],["\u03c0\u03b1\u03c1\u03b1\u03c3\u03bf\u03c5\u03c3", +2,1],["\u03b1\u03bb\u03bb\u03bf\u03c3\u03bf\u03c5\u03c3",-1,1],["\u03c6",-1,1],["\u03c7",-1,1]],Aa=[["\u03bc\u03b1\u03c4\u03b1",-1,1],["\u03bc\u03b1\u03c4\u03c9\u03bd",-1,1],["\u03bc\u03b1\u03c4\u03bf\u03c3",-1,1]],Ba=[["\u03b1",-1,1],["\u03b9\u03bf\u03c5\u03bc\u03b1",0,1],["\u03bf\u03bc\u03bf\u03c5\u03bd\u03b1",0,1],["\u03b9\u03bf\u03bc\u03bf\u03c5\u03bd\u03b1",2,1],["\u03bf\u03c3\u03bf\u03c5\u03bd\u03b1",0,1],["\u03b9\u03bf\u03c3\u03bf\u03c5\u03bd\u03b1",4,1],["\u03b5",-1,1],["\u03b1\u03b3\u03b1\u03c4\u03b5", +6,1],["\u03b7\u03ba\u03b1\u03c4\u03b5",6,1],["\u03b7\u03b8\u03b7\u03ba\u03b1\u03c4\u03b5",8,1],["\u03b7\u03c3\u03b1\u03c4\u03b5",6,1],["\u03bf\u03c5\u03c3\u03b1\u03c4\u03b5",6,1],["\u03b5\u03b9\u03c4\u03b5",6,1],["\u03b7\u03b8\u03b5\u03b9\u03c4\u03b5",12,1],["\u03b9\u03b5\u03bc\u03b1\u03c3\u03c4\u03b5",6,1],["\u03bf\u03c5\u03bc\u03b1\u03c3\u03c4\u03b5",6,1],["\u03b9\u03bf\u03c5\u03bc\u03b1\u03c3\u03c4\u03b5",15,1],["\u03b9\u03b5\u03c3\u03b1\u03c3\u03c4\u03b5",6,1],["\u03bf\u03c3\u03b1\u03c3\u03c4\u03b5", 
+6,1],["\u03b9\u03bf\u03c3\u03b1\u03c3\u03c4\u03b5",18,1],["\u03b7",-1,1],["\u03b9",-1,1],["\u03b1\u03bc\u03b1\u03b9",21,1],["\u03b9\u03b5\u03bc\u03b1\u03b9",21,1],["\u03bf\u03bc\u03b1\u03b9",21,1],["\u03bf\u03c5\u03bc\u03b1\u03b9",21,1],["\u03b1\u03c3\u03b1\u03b9",21,1],["\u03b5\u03c3\u03b1\u03b9",21,1],["\u03b9\u03b5\u03c3\u03b1\u03b9",27,1],["\u03b1\u03c4\u03b1\u03b9",21,1],["\u03b5\u03c4\u03b1\u03b9",21,1],["\u03b9\u03b5\u03c4\u03b1\u03b9",30,1],["\u03bf\u03bd\u03c4\u03b1\u03b9",21,1],["\u03bf\u03c5\u03bd\u03c4\u03b1\u03b9", +21,1],["\u03b9\u03bf\u03c5\u03bd\u03c4\u03b1\u03b9",33,1],["\u03b5\u03b9",21,1],["\u03b1\u03b5\u03b9",35,1],["\u03b7\u03b8\u03b5\u03b9",35,1],["\u03b7\u03c3\u03b5\u03b9",35,1],["\u03bf\u03b9",21,1],["\u03b1\u03bd",-1,1],["\u03b1\u03b3\u03b1\u03bd",40,1],["\u03b7\u03ba\u03b1\u03bd",40,1],["\u03b7\u03b8\u03b7\u03ba\u03b1\u03bd",42,1],["\u03b7\u03c3\u03b1\u03bd",40,1],["\u03bf\u03c5\u03c3\u03b1\u03bd",40,1],["\u03bf\u03bd\u03c4\u03bf\u03c5\u03c3\u03b1\u03bd",45,1],["\u03b9\u03bf\u03bd\u03c4\u03bf\u03c5\u03c3\u03b1\u03bd", +46,1],["\u03bf\u03bd\u03c4\u03b1\u03bd",40,1],["\u03b9\u03bf\u03bd\u03c4\u03b1\u03bd",48,1],["\u03bf\u03c5\u03bd\u03c4\u03b1\u03bd",40,1],["\u03b9\u03bf\u03c5\u03bd\u03c4\u03b1\u03bd",50,1],["\u03bf\u03c4\u03b1\u03bd",40,1],["\u03b9\u03bf\u03c4\u03b1\u03bd",52,1],["\u03bf\u03bc\u03b1\u03c3\u03c4\u03b1\u03bd",40,1],["\u03b9\u03bf\u03bc\u03b1\u03c3\u03c4\u03b1\u03bd",54,1],["\u03bf\u03c3\u03b1\u03c3\u03c4\u03b1\u03bd",40,1],["\u03b9\u03bf\u03c3\u03b1\u03c3\u03c4\u03b1\u03bd",56,1],["\u03bf\u03c5\u03bd", 
+-1,1],["\u03b7\u03b8\u03bf\u03c5\u03bd",58,1],["\u03bf\u03bc\u03bf\u03c5\u03bd",58,1],["\u03b9\u03bf\u03bc\u03bf\u03c5\u03bd",60,1],["\u03b7\u03c3\u03bf\u03c5\u03bd",58,1],["\u03bf\u03c3\u03bf\u03c5\u03bd",58,1],["\u03b9\u03bf\u03c3\u03bf\u03c5\u03bd",63,1],["\u03c9\u03bd",-1,1],["\u03b7\u03b4\u03c9\u03bd",65,1],["\u03bf",-1,1],["\u03b1\u03c3",-1,1],["\u03b5\u03c3",-1,1],["\u03b7\u03b4\u03b5\u03c3",69,1],["\u03b7\u03c3\u03b5\u03c3",69,1],["\u03b7\u03c3",-1,1],["\u03b5\u03b9\u03c3",-1,1],["\u03b7\u03b8\u03b5\u03b9\u03c3", +73,1],["\u03bf\u03c3",-1,1],["\u03c5\u03c3",-1,1],["\u03bf\u03c5\u03c3",76,1],["\u03c5",-1,1],["\u03bf\u03c5",78,1],["\u03c9",-1,1],["\u03b1\u03c9",80,1],["\u03b7\u03b8\u03c9",80,1],["\u03b7\u03c3\u03c9",80,1]],Ca=[["\u03bf\u03c4\u03b5\u03c1",-1,1],["\u03b5\u03c3\u03c4\u03b5\u03c1",-1,1],["\u03c5\u03c4\u03b5\u03c1",-1,1],["\u03c9\u03c4\u03b5\u03c1",-1,1],["\u03bf\u03c4\u03b1\u03c4",-1,1],["\u03b5\u03c3\u03c4\u03b1\u03c4",-1,1],["\u03c5\u03c4\u03b1\u03c4",-1,1],["\u03c9\u03c4\u03b1\u03c4",-1,1]],ia= +[81,65,16,1],ha=[81,65,0,1],B=q;this.m=function(){b.f=b.cursor;b.cursor=b.a;var l=b.a-b.cursor;g();b.cursor=b.a-l;if(!(3<=b.j.length))return q;B=f;l=b.a-b.cursor;d();b.cursor=b.a-l;var l=b.a-b.cursor,n;b.d=b.cursor;if(0!=b.g(h)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,n=b.g(e),!(0==n||b.cursor>b.f))))switch(n){case 1:if(!b.b("\u03b9"))break;break;case 2:b.b("\u03b9\u03b6")}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(t)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(u)|| +b.cursor>b.f||b.b("\u03c9\u03bd")));b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(r)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;if(b.h("\u03b9\u03c3\u03b1")&&!(b.cursor>b.f)){if(!b.b("\u03b9\u03c3"))break a;break b}b.cursor=b.a-n;b.d=b.cursor;b.c=b.cursor;n=b.g(s);if(!(0==n||b.cursor>b.f))switch(n){case 1:if(!b.b("\u03b9"))break;break;case 
2:b.b("\u03b9\u03c3")}}}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(k)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(p)||b.cursor> +b.f||b.b("\u03b9")));b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;if(0!=b.g(w)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,n=b.g(v),!(0==n||b.cursor>b.f))))switch(n){case 1:if(!b.b("\u03b9"))break;break;case 2:b.b("\u03b9\u03c3\u03c4")}b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(c)&&(b.c=b.cursor,b.e())){B=q;b:{var F=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;n=b.g(y);if(0!=n&&!(b.cursor>b.f)){switch(n){case 1:if(!b.b("\u03b9\u03c3\u03bc"))break a;break;case 2:if(!b.b("\u03b9"))break a}break b}b.cursor= +b.a-F;b.d=b.cursor;n=b.g(z);if(0!=n)switch(b.c=b.cursor,n){case 1:if(!b.b("\u03b1\u03b3\u03bd\u03c9\u03c3\u03c4"))break;break;case 2:if(!b.b("\u03b1\u03c4\u03bf\u03bc"))break;break;case 3:if(!b.b("\u03b3\u03bd\u03c9\u03c3\u03c4"))break;break;case 4:if(!b.b("\u03b5\u03b8\u03bd"))break;break;case 5:if(!b.b("\u03b5\u03ba\u03bb\u03b5\u03ba\u03c4"))break;break;case 6:if(!b.b("\u03c3\u03ba\u03b5\u03c0\u03c4"))break;break;case 7:if(!b.b("\u03c4\u03bf\u03c0"))break;break;case 8:if(!b.b("\u03b1\u03bb\u03b5\u03be\u03b1\u03bd\u03b4\u03c1"))break; +break;case 9:if(!b.b("\u03b2\u03c5\u03b6\u03b1\u03bd\u03c4"))break;break;case 10:b.b("\u03b8\u03b5\u03b1\u03c4\u03c1")}}}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(D)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(A)||b.cursor>b.f||b.b("\u03b1\u03c1\u03b1\u03ba")));b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(G)&&(b.c=b.cursor,b.e())){B=q;b:{F=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;n=b.g(x);if(0!=n&&!(b.cursor>b.f)){switch(n){case 1:if(!b.b("\u03b1\u03ba"))break a;break; +case 2:if(!b.b("\u03b9\u03c4\u03c3"))break a}break 
b}b.cursor=b.a-F;b.d=b.cursor;b.c=b.cursor;!b.h("\u03ba\u03bf\u03c1")||b.b("\u03b9\u03c4\u03c3")}}b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(O)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(0!=b.g(E)&&!(b.cursor>b.f)){if(!b.b("\u03b9\u03b4"))break a;break b}b.cursor=b.a-n;b.d=b.cursor;b.c=b.cursor;0==b.g(J)||b.b("\u03b9\u03b4")}}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(P)&&(b.c=b.cursor,b.e()&&(B=q, +b.d=b.cursor,b.c=b.cursor,0==b.g(N)||b.cursor>b.f||b.b("\u03b9\u03c3\u03ba")));b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(Q)&&(b.c=b.cursor,b.e())){n=b.a-b.cursor;if(0!=b.g(T))break a;b.cursor=b.a-n;n=b.cursor;b.r(b.cursor,b.cursor,"\u03b1\u03b4");b.cursor=n}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(U)&&(b.c=b.cursor,b.e()&&(b.d=b.cursor,b.c=b.cursor,0==b.g(R)||b.b("\u03b5\u03b4")));b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(S)&&(b.c=b.cursor,b.e()&&(b.d=b.cursor,b.c=b.cursor, +0==b.g(V)||b.b("\u03bf\u03c5\u03b4")));b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(M)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(K)||b.cursor>b.f||b.b("\u03b5")));b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(H)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,!b.l(ia,945,969)||b.b("\u03b9")));b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(I)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(b.l(ia,945,969)){if(!b.b("\u03b9\u03ba"))break a; +break b}b.cursor=b.a-n;b.d=b.cursor}b.c=b.cursor;0==b.g(L)||b.cursor>b.f||b.b("\u03b9\u03ba")}b.cursor=b.a-l;l=b.a-b.cursor;a:{n=b.a-b.cursor;if(b.h("\u03b1\u03b3\u03b1\u03bc\u03b5")&&!(b.cursor>b.f)&&!b.b("\u03b1\u03b3\u03b1\u03bc"))break a;b.cursor=b.a-n;n=b.a-b.cursor;b.d=b.cursor;if(0!=b.g($)){b.c=b.cursor;if(!b.e())break a;B=q}b.cursor=b.a-n;b.d=b.cursor;b.h("\u03b1\u03bc\u03b5")&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(aa)||b.cursor>b.f||b.b("\u03b1\u03bc")))}b.cursor=b.a- 
+l;l=b.a-b.cursor;m();b.cursor=b.a-l;l=b.a-b.cursor;a();b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(fa)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(b.h("\u03b1\u03c1\u03c7")&&!(b.cursor>b.f)){if(!b.b("\u03bf\u03bd\u03c4"))break a;break b}b.cursor=b.a-n;b.d=b.cursor;b.c=b.cursor;!b.h("\u03ba\u03c1\u03b5")||b.b("\u03c9\u03bd\u03c4")}}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(ga)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,!b.h("\u03bf\u03bd")|| +b.cursor>b.f||b.b("\u03bf\u03bc\u03b1\u03c3\u03c4")));b.cursor=b.a-l;l=b.a-b.cursor;a:{n=b.a-b.cursor;b.d=b.cursor;if(b.h("\u03b9\u03b5\u03c3\u03c4\u03b5")){b.c=b.cursor;if(!b.e())break a;B=q;b.d=b.cursor;b.c=b.cursor;if(0!=b.g(Y)&&!(b.cursor>b.f)&&!b.b("\u03b9\u03b5\u03c3\u03c4"))break a}b.cursor=b.a-n;b.d=b.cursor;b.h("\u03b5\u03c3\u03c4\u03b5")&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(Z)||b.cursor>b.f||b.b("\u03b9\u03b5\u03c3\u03c4")))}b.cursor=b.a-l;l=b.a-b.cursor;a:{n=b.a- +b.cursor;b.d=b.cursor;if(0!=b.g(X)){b.c=b.cursor;if(!b.e())break a;B=q}b.cursor=b.a-n;b.d=b.cursor;if(0!=b.g(la)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(0!=b.g(ja)){if(!b.b("\u03b7\u03ba"))break a;break b}b.cursor=b.a-n;b.d=b.cursor;b.c=b.cursor;0==b.g(ka)||b.cursor>b.f||b.b("\u03b7\u03ba")}}}b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(oa)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(0!=b.g(ma)){if(!b.b("\u03bf\u03c5\u03c3"))break a; +break b}b.cursor=b.a-n;b.d=b.cursor;b.c=b.cursor;0==b.g(na)||b.cursor>b.f||b.b("\u03bf\u03c5\u03c3")}}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(sa)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(ta)||b.cursor>b.f||b.b("\u03b7\u03c3")));b.cursor=b.a-l;l=b.a-b.cursor;a:if(b.d=b.cursor,0!=b.g(ra)&&(b.c=b.cursor,b.e())){B=q;b:{n=b.a-b.cursor;b.d=b.cursor;b.c=b.cursor;if(b.h("\u03ba\u03bf\u03bb\u03bb")){if(!b.b("\u03b1\u03b3"))break a;break 
b}b.cursor=b.a-n;c:{F=b.a-b.cursor;b.d=b.cursor; +b.c=b.cursor;n=b.g(pa);if(0!=n){switch(n){case 1:if(!b.b("\u03b1\u03b3"))break a}break c}b.cursor=b.a-F;b.d=b.cursor;b.c=b.cursor;0==b.g(qa)||b.cursor>b.f||b.b("\u03b1\u03b3")}}}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(ua)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(va)||b.cursor>b.f||b.b("\u03b7\u03c3\u03c4")));b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(wa)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(xa)||b.cursor>b.f||b.b("\u03bf\u03c5\u03bd")));b.cursor= +b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(ya)&&(b.c=b.cursor,b.e()&&(B=q,b.d=b.cursor,b.c=b.cursor,0==b.g(za)||b.cursor>b.f||b.b("\u03bf\u03c5\u03bc")));b.cursor=b.a-l;l=b.a-b.cursor;a:{n=b.a-b.cursor;b.d=b.cursor;if(0!=b.g(Aa)&&(b.c=b.cursor,!b.b("\u03bc\u03b1")))break a;b.cursor=b.a-n;B&&(b.d=b.cursor,0!=b.g(Ba)&&(b.c=b.cursor,b.e()))}b.cursor=b.a-l;l=b.a-b.cursor;b.d=b.cursor;0!=b.g(Ca)&&(b.c=b.cursor,b.e());b.cursor=b.a-l;b.cursor=b.f;return f};this.stemWord=function(a){b.p(a);this.m();return b.j}};window.HindiStemmer=function(){function g(){return!d.l(a,2325,2399)?q:f}var d=new C,m=[["\u0906\u0901",-1,-1],["\u093e\u0901",-1,-1],["\u0907\u092f\u093e\u0901",1,-1],["\u0906\u0907\u092f\u093e\u0901",2,-1],["\u093e\u0907\u092f\u093e\u0901",2,-1],["\u093f\u092f\u093e\u0901",1,-1],["\u0906\u0902",-1,-1],["\u0909\u0906\u0902",6,-1],["\u0941\u0906\u0902",6,-1],["\u0908\u0902",-1,-1],["\u0906\u0908\u0902",9,-1],["\u093e\u0908\u0902",9,-1],["\u090f\u0902",-1,-1],["\u0906\u090f\u0902",12,-1],["\u0909\u090f\u0902", 
+12,-1],["\u093e\u090f\u0902",12,-1],["\u0924\u093e\u090f\u0902",15,-1,g],["\u0905\u0924\u093e\u090f\u0902",16,-1],["\u0928\u093e\u090f\u0902",15,-1,g],["\u0905\u0928\u093e\u090f\u0902",18,-1],["\u0941\u090f\u0902",12,-1],["\u0913\u0902",-1,-1],["\u0906\u0913\u0902",21,-1],["\u0909\u0913\u0902",21,-1],["\u093e\u0913\u0902",21,-1],["\u0924\u093e\u0913\u0902",24,-1,g],["\u0905\u0924\u093e\u0913\u0902",25,-1],["\u0928\u093e\u0913\u0902",24,-1,g],["\u0905\u0928\u093e\u0913\u0902",27,-1],["\u0941\u0913\u0902", +21,-1],["\u093e\u0902",-1,-1],["\u0907\u092f\u093e\u0902",30,-1],["\u0906\u0907\u092f\u093e\u0902",31,-1],["\u093e\u0907\u092f\u093e\u0902",31,-1],["\u093f\u092f\u093e\u0902",30,-1],["\u0940\u0902",-1,-1],["\u0924\u0940\u0902",35,-1,g],["\u0905\u0924\u0940\u0902",36,-1],["\u0906\u0924\u0940\u0902",36,-1],["\u093e\u0924\u0940\u0902",36,-1],["\u0947\u0902",-1,-1],["\u094b\u0902",-1,-1],["\u0907\u092f\u094b\u0902",41,-1],["\u0906\u0907\u092f\u094b\u0902",42,-1],["\u093e\u0907\u092f\u094b\u0902",42,-1], +["\u093f\u092f\u094b\u0902",41,-1],["\u0905",-1,-1],["\u0906",-1,-1],["\u0907",-1,-1],["\u0908",-1,-1],["\u0906\u0908",49,-1],["\u093e\u0908",49,-1],["\u0909",-1,-1],["\u090a",-1,-1],["\u090f",-1,-1],["\u0906\u090f",54,-1],["\u0907\u090f",54,-1],["\u0906\u0907\u090f",56,-1],["\u093e\u0907\u090f",56,-1],["\u093e\u090f",54,-1],["\u093f\u090f",54,-1],["\u0913",-1,-1],["\u0906\u0913",61,-1],["\u093e\u0913",61,-1],["\u0915\u0930",-1,-1,g],["\u0905\u0915\u0930",64,-1],["\u0906\u0915\u0930",64,-1],["\u093e\u0915\u0930", 
+64,-1],["\u093e",-1,-1],["\u090a\u0902\u0917\u093e",68,-1],["\u0906\u090a\u0902\u0917\u093e",69,-1],["\u093e\u090a\u0902\u0917\u093e",69,-1],["\u0942\u0902\u0917\u093e",68,-1],["\u090f\u0917\u093e",68,-1],["\u0906\u090f\u0917\u093e",73,-1],["\u093e\u090f\u0917\u093e",73,-1],["\u0947\u0917\u093e",68,-1],["\u0924\u093e",68,-1,g],["\u0905\u0924\u093e",77,-1],["\u0906\u0924\u093e",77,-1],["\u093e\u0924\u093e",77,-1],["\u0928\u093e",68,-1,g],["\u0905\u0928\u093e",81,-1],["\u0906\u0928\u093e",81,-1],["\u093e\u0928\u093e", +81,-1],["\u0906\u092f\u093e",68,-1],["\u093e\u092f\u093e",68,-1],["\u093f",-1,-1],["\u0940",-1,-1],["\u090a\u0902\u0917\u0940",88,-1],["\u0906\u090a\u0902\u0917\u0940",89,-1],["\u093e\u090a\u0902\u0917\u0940",89,-1],["\u090f\u0902\u0917\u0940",88,-1],["\u0906\u090f\u0902\u0917\u0940",92,-1],["\u093e\u090f\u0902\u0917\u0940",92,-1],["\u0942\u0902\u0917\u0940",88,-1],["\u0947\u0902\u0917\u0940",88,-1],["\u090f\u0917\u0940",88,-1],["\u0906\u090f\u0917\u0940",97,-1],["\u093e\u090f\u0917\u0940",97,-1], +["\u0913\u0917\u0940",88,-1],["\u0906\u0913\u0917\u0940",100,-1],["\u093e\u0913\u0917\u0940",100,-1],["\u0947\u0917\u0940",88,-1],["\u094b\u0917\u0940",88,-1],["\u0924\u0940",88,-1,g],["\u0905\u0924\u0940",105,-1],["\u0906\u0924\u0940",105,-1],["\u093e\u0924\u0940",105,-1],["\u0928\u0940",88,-1,g],["\u0905\u0928\u0940",109,-1],["\u0941",-1,-1],["\u0942",-1,-1],["\u0947",-1,-1],["\u090f\u0902\u0917\u0947",113,-1],["\u0906\u090f\u0902\u0917\u0947",114,-1],["\u093e\u090f\u0902\u0917\u0947",114,-1],["\u0947\u0902\u0917\u0947", 
+113,-1],["\u0913\u0917\u0947",113,-1],["\u0906\u0913\u0917\u0947",118,-1],["\u093e\u0913\u0917\u0947",118,-1],["\u094b\u0917\u0947",113,-1],["\u0924\u0947",113,-1,g],["\u0905\u0924\u0947",122,-1],["\u0906\u0924\u0947",122,-1],["\u093e\u0924\u0947",122,-1],["\u0928\u0947",113,-1,g],["\u0905\u0928\u0947",126,-1],["\u0906\u0928\u0947",126,-1],["\u093e\u0928\u0947",126,-1],["\u094b",-1,-1],["\u094d",-1,-1]],a=[255,255,255,255,159,0,0,0,248,7];this.m=function(){if(d.cursor>=d.a)return q;d.cursor++;d.f= +d.cursor;d.cursor=d.a;d.d=d.cursor;if(0==d.g(m))return q;d.c=d.cursor;if(!d.e())return q;d.cursor=d.f;return f};this.stemWord=function(a){d.p(a);this.m();return d.j}};window.HungarianStemmer=function(){function g(){return y<=a.cursor}function d(){var b=a.a-a.cursor;if(0==a.g(n))return q;a.cursor=a.a-b;return f}function m(){if(!(a.cursor<=a.f)){a.cursor--;a.d=a.cursor;var b=a.cursor-1;b=a.a)break c;a.cursor++}d:{n=a.cursor;if(0!=a.o(b))break d;a.cursor=n;if(a.cursor>=a.a)break c;a.cursor++}y=a.cursor;break b}a.cursor=c;if(a.k(w,97,369)){c:for(;;){if(a.i(w,97,369))break c; +if(a.cursor>=a.a)break a;a.cursor++}y=a.cursor}}}a.cursor=z;a.f=a.cursor;a.cursor=a.a;z=a.a-a.cursor;a.d=a.cursor;0!=a.g(e)&&(a.c=a.cursor,!g()||!d()||!a.e()||m());a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;if(0!=a.g(h)&&(a.c=a.cursor,g()&&a.e()&&(a.d=a.cursor,c=a.g(l),0!=c&&(a.c=a.cursor,g()))))switch(c){case 1:if(!a.b("a"))break;break;case 2:a.b("e")}a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;c=a.g(u);if(0!=c&&(a.c=a.cursor,g()))switch(c){case 1:if(!a.b("e"))break;break;case 2:a.b("a")}a.cursor= +a.a-z;z=a.a-a.cursor;a.d=a.cursor;c=a.g(t);if(0!=c&&(a.c=a.cursor,g()))switch(c){case 1:if(!a.e())break;break;case 2:if(!a.b("a"))break;break;case 3:a.b("e")}a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;0!=a.g(s)&&(a.c=a.cursor,!g()||!d()||!a.e()||m());a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;c=a.g(p);if(0!=c&&(a.c=a.cursor,g()))switch(c){case 1:if(!a.e())break;break;case 2:if(!a.b("e"))break;break;case 
3:a.b("a")}a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;c=a.g(k);if(0!=c&&(a.c=a.cursor,g()))switch(c){case 1:if(!a.e())break; +break;case 2:if(!a.b("a"))break;break;case 3:a.b("e")}a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;c=a.g(v);if(0!=c&&(a.c=a.cursor,g()))switch(c){case 1:if(!a.e())break;break;case 2:if(!a.b("a"))break;break;case 3:a.b("e")}a.cursor=a.a-z;z=a.a-a.cursor;a.d=a.cursor;c=a.g(r);if(0!=c&&(a.c=a.cursor,g()))switch(c){case 1:if(!a.b("a"))break;break;case 2:if(!a.b("e"))break;break;case 3:a.e()}a.cursor=a.a-z;a.cursor=a.f;return f};this.stemWord=function(b){a.p(b);this.m();return a.j}};window.IndonesianStemmer=function(){function g(){b.d=b.cursor;if(0==b.g(e))return q;b.c=b.cursor;if(!b.e())return q;r-=1;return f}function d(){return!b.i(t,97,117)?q:f}function m(){var a;b.c=b.cursor;a=b.o(h);if(0==a)return q;b.d=b.cursor;switch(a){case 1:if(!b.e())return q;s=1;r-=1;break;case 2:if(!b.e())return q;s=3;r-=1;break;case 3:s=1;if(!b.b("s"))return q;r-=1;break;case 4:s=3;if(!b.b("s"))return q;r-=1;break;case 5:s=1;r-=1;a:{a=b.cursor;var d=b.cursor;if(b.i(t,97,117)){b.cursor=d;if(!b.b("p"))return q; +break a}b.cursor=a;if(!b.e())return q}break;case 6:s=3;r-=1;a:{a=b.cursor;d=b.cursor;if(b.i(t,97,117)){b.cursor=d;if(!b.b("p"))return q;break a}b.cursor=a;if(!b.e())return q}}return f}function a(){var a;b.c=b.cursor;a=b.o(u);if(0!=a)switch(b.d=b.cursor,a){case 1:if(!b.e())break;s=2;r-=1;break;case 2:if(!b.b("ajar"))break;r-=1;break;case 3:if(!b.e())break;s=4;r-=1;break;case 4:if(!b.b("ajar"))break;s=4;r-=1}}var b=new C,l=[["kah",-1,1],["lah",-1,1],["pun",-1,1]],n=[["nya",-1,1],["ku",-1,1],["mu",-1, +1]],e=[["i",-1,1,function(){if(2=b.a)break b;b.cursor++}r+=1;continue}b.cursor=e;break}b.cursor=d;if(2>=r)return q;s=0;b.f=b.cursor;b.cursor=b.a;d=b.a-b.cursor;b.d=b.cursor;0!=b.g(l)&&(b.c=b.cursor,b.e()&&(r-=1));b.cursor=b.a-d;if(2>=r)return q;d=b.a-b.cursor;b.d=b.cursor;0!=b.g(n)&&(b.c=b.cursor,b.e()&&(r-=1));b.cursor=b.a-d;b.cursor=b.f;if(2>=r)return 
q;a:{e=b.cursor;d=b.cursor;if(m()){e=b.cursor;var h=b.cursor;2>= +r||(b.f=b.cursor,b.cursor=b.a,g()&&(b.cursor=b.f,b.cursor=h,2>=r||a()));b.cursor=e;b.cursor=d;break a}b.cursor=e;d=b.cursor;a();b.cursor=d;d=b.cursor;2>=r||(b.f=b.cursor,b.cursor=b.a,g()&&(b.cursor=b.f));b.cursor=d}return f};this.stemWord=function(a){b.p(a);this.m();return b.j}};window.IrishStemmer=function(){var g=new C,d=[["b'",-1,1],["bh",-1,4],["bhf",1,2],["bp",-1,8],["ch",-1,5],["d'",-1,1],["d'fh",5,2],["dh",-1,6],["dt",-1,9],["fh",-1,2],["gc",-1,5],["gh",-1,7],["h-",-1,1],["m'",-1,1],["mb",-1,4],["mh",-1,10],["n-",-1,1],["nd",-1,6],["ng",-1,7],["ph",-1,8],["sh",-1,3],["t-",-1,1],["th",-1,9],["ts",-1,3]],m=[["\u00edochta",-1,1],["a\u00edochta",0,1],["ire",-1,2],["aire",2,2],["abh",-1,1],["eabh",4,1],["ibh",-1,1],["aibh",6,1],["amh",-1,1],["eamh",8,1],["imh",-1,1],["aimh", +10,1],["\u00edocht",-1,1],["a\u00edocht",12,1],["ir\u00ed",-1,2],["air\u00ed",14,2]],a=[["\u00f3ideacha",-1,6],["patacha",-1,5],["achta",-1,1],["arcachta",2,2],["eachta",2,1],["grafa\u00edochta",-1,4],["paite",-1,5],["ach",-1,1],["each",7,1],["\u00f3ideach",8,6],["gineach",8,3],["patach",7,5],["grafa\u00edoch",-1,4],["pataigh",-1,5],["\u00f3idigh",-1,6],["acht\u00fail",-1,1],["eacht\u00fail",15,1],["gineas",-1,3],["ginis",-1,3],["acht",-1,1],["arcacht",19,2],["eacht",19,1],["grafa\u00edocht",-1,4], +["arcachta\u00ed",-1,2],["grafa\u00edochta\u00ed",-1,4]],b=[["imid",-1,1],["aimid",0,1],["\u00edmid",-1,1],["a\u00edmid",2,1],["adh",-1,2],["eadh",4,2],["faidh",-1,1],["fidh",-1,1],["\u00e1il",-1,2],["ain",-1,2],["tear",-1,2],["tar",-1,2]],l=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,1,17,4,2],n=0,e=0,h=0;this.m=function(){var u=g.cursor,t;g.c=g.cursor;t=g.o(d);if(0!=t)switch(g.d=g.cursor,t){case 1:if(!g.e())break;break;case 2:if(!g.b("f"))break;break;case 3:if(!g.b("s"))break;break;case 4:if(!g.b("b"))break; +break;case 5:if(!g.b("c"))break;break;case 6:if(!g.b("d"))break;break;case 7:if(!g.b("g"))break;break;case 
8:if(!g.b("p"))break;break;case 9:if(!g.b("t"))break;break;case 10:g.b("m")}g.cursor=u;n=e=h=g.a;u=g.cursor;a:{b:for(;;){if(g.i(l,97,250))break b;if(g.cursor>=g.a)break a;g.cursor++}h=g.cursor;b:for(;;){if(g.k(l,97,250))break b;if(g.cursor>=g.a)break a;g.cursor++}e=g.cursor;b:for(;;){if(g.i(l,97,250))break b;if(g.cursor>=g.a)break a;g.cursor++}b:for(;;){if(g.k(l,97,250))break b;if(g.cursor>= +g.a)break a;g.cursor++}n=g.cursor}g.cursor=u;g.f=g.cursor;g.cursor=g.a;u=g.a-g.cursor;g.d=g.cursor;t=g.g(m);if(0!=t)switch(g.c=g.cursor,t){case 1:if(!(e<=g.cursor)||!g.e())break;break;case 2:!(n<=g.cursor)||g.e()}g.cursor=g.a-u;u=g.a-g.cursor;g.d=g.cursor;t=g.g(a);if(0!=t)switch(g.c=g.cursor,t){case 1:if(!(n<=g.cursor)||!g.e())break;break;case 2:if(!g.b("arc"))break;break;case 3:if(!g.b("gin"))break;break;case 4:if(!g.b("graf"))break;break;case 5:if(!g.b("paite"))break;break;case 6:g.b("\u00f3id")}g.cursor= +g.a-u;u=g.a-g.cursor;g.d=g.cursor;t=g.g(b);if(0!=t)switch(g.c=g.cursor,t){case 1:if(!(h<=g.cursor)||!g.e())break;break;case 2:!(e<=g.cursor)||g.e()}g.cursor=g.a-u;g.cursor=g.f;return f};this.stemWord=function(a){g.p(a);this.m();return g.j}};window.ItalianStemmer=function(){function g(){for(var d,c=a.cursor;;){var e=a.cursor;a:{a.c=a.cursor;d=a.o(b);a.d=a.cursor;switch(d){case 1:if(!a.b("\u00e0"))return;break;case 2:if(!a.b("\u00e8"))return;break;case 3:if(!a.b("\u00ec"))return;break;case 4:if(!a.b("\u00f2"))return;break;case 5:if(!a.b("\u00f9"))return;break;case 6:if(!a.b("qU"))return;break;case 7:if(a.cursor>=a.a)break a;a.cursor++}continue}a.cursor=e;break}for(a.cursor=c;;){d=a.cursor;a:{b:for(;;){c=a.cursor;c:if(a.i(r,97,249)){a.c= +a.cursor;d:{e=a.cursor;if(a.n("u")&&(a.d=a.cursor,a.i(r,97,249))){if(!a.b("U"))return;break d}a.cursor=e;if(!a.n("i"))break c;a.d=a.cursor;if(!a.i(r,97,249))break c;if(!a.b("I"))return}a.cursor=c;break b}a.cursor=c;if(a.cursor>=a.a)break a;a.cursor++}continue}a.cursor=d;break}}function d(){return v<=a.cursor}function m(){var 
b;a.d=a.cursor;b=a.g(t);if(0==b)return q;a.c=a.cursor;switch(b){case 1:if(!d()||!a.e())return q;break;case 2:if(!d()||!a.e())return q;b=a.a-a.cursor;a.d=a.cursor;if(a.h("ic"))if(a.c= +a.cursor,d()){if(!a.e())return q}else a.cursor=a.a-b;else a.cursor=a.a-b;break;case 3:if(!d()||!a.b("log"))return q;break;case 4:if(!d()||!a.b("u"))return q;break;case 5:if(!d()||!a.b("ente"))return q;break;case 6:if(!(y<=a.cursor)||!a.e())return q;break;case 7:if(!(w<=a.cursor)||!a.e())return q;var c=a.a-a.cursor;a:if(a.d=a.cursor,b=a.g(h),0==b)a.cursor=a.a-c;else if(a.c=a.cursor,d()){if(!a.e())return q;switch(b){case 1:a.d=a.cursor;if(!a.h("at")){a.cursor=a.a-c;break a}a.c=a.cursor;if(!d()){a.cursor= +a.a-c;break a}if(!a.e())return q}}else a.cursor=a.a-c;break;case 8:if(!d()||!a.e())return q;b=a.a-a.cursor;a.d=a.cursor;if(0==a.g(u))a.cursor=a.a-b;else if(a.c=a.cursor,d()){if(!a.e())return q}else a.cursor=a.a-b;break;case 9:if(!d()||!a.e())return q;b=a.a-a.cursor;a.d=a.cursor;if(a.h("at"))if(a.c=a.cursor,d()){if(!a.e())return q;a.d=a.cursor;if(a.h("ic"))if(a.c=a.cursor,d()){if(!a.e())return q}else a.cursor=a.a-b;else a.cursor=a.a-b}else a.cursor=a.a-b;else a.cursor=a.a-b}return f}var a=new C,b= +[["",-1,7],["qu",0,6],["\u00e1",0,1],["\u00e9",0,2],["\u00ed",0,3],["\u00f3",0,4],["\u00fa",0,5]],l=[["",-1,3],["I",0,1],["U",0,2]],n=[["la",-1,-1],["cela",0,-1],["gliela",0,-1],["mela",0,-1],["tela",0,-1],["vela",0,-1],["le",-1,-1],["cele",6,-1],["gliele",6,-1],["mele",6,-1],["tele",6,-1],["vele",6,-1],["ne",-1,-1],["cene",12,-1],["gliene",12,-1],["mene",12,-1],["sene",12,-1],["tene",12,-1],["vene",12,-1],["ci",-1,-1],["li",-1,-1],["celi",20,-1],["glieli",20,-1],["meli",20,-1],["teli",20,-1],["veli", 
+20,-1],["gli",20,-1],["mi",-1,-1],["si",-1,-1],["ti",-1,-1],["vi",-1,-1],["lo",-1,-1],["celo",31,-1],["glielo",31,-1],["melo",31,-1],["telo",31,-1],["velo",31,-1]],e=[["ando",-1,1],["endo",-1,1],["ar",-1,2],["er",-1,2],["ir",-1,2]],h=[["ic",-1,-1],["abil",-1,-1],["os",-1,-1],["iv",-1,1]],u=[["ic",-1,1],["abil",-1,1],["iv",-1,1]],t=[["ica",-1,1],["logia",-1,3],["osa",-1,1],["ista",-1,1],["iva",-1,9],["anza",-1,1],["enza",-1,5],["ice",-1,1],["atrice",7,1],["iche",-1,1],["logie",-1,3],["abile",-1,1], +["ibile",-1,1],["usione",-1,4],["azione",-1,2],["uzione",-1,4],["atore",-1,2],["ose",-1,1],["ante",-1,1],["mente",-1,1],["amente",19,7],["iste",-1,1],["ive",-1,9],["anze",-1,1],["enze",-1,5],["ici",-1,1],["atrici",25,1],["ichi",-1,1],["abili",-1,1],["ibili",-1,1],["ismi",-1,1],["usioni",-1,4],["azioni",-1,2],["uzioni",-1,4],["atori",-1,2],["osi",-1,1],["anti",-1,1],["amenti",-1,6],["imenti",-1,6],["isti",-1,1],["ivi",-1,9],["ico",-1,1],["ismo",-1,1],["oso",-1,1],["amento",-1,6],["imento",-1,6],["ivo", +-1,9],["it\u00e0",-1,8],["ist\u00e0",-1,1],["ist\u00e8",-1,1],["ist\u00ec",-1,1]],s=[["isca",-1,1],["enda",-1,1],["ata",-1,1],["ita",-1,1],["uta",-1,1],["ava",-1,1],["eva",-1,1],["iva",-1,1],["erebbe",-1,1],["irebbe",-1,1],["isce",-1,1],["ende",-1,1],["are",-1,1],["ere",-1,1],["ire",-1,1],["asse",-1,1],["ate",-1,1],["avate",16,1],["evate",16,1],["ivate",16,1],["ete",-1,1],["erete",20,1],["irete",20,1],["ite",-1,1],["ereste",-1,1],["ireste",-1,1],["ute",-1,1],["erai",-1,1],["irai",-1,1],["isci",-1, 
+1],["endi",-1,1],["erei",-1,1],["irei",-1,1],["assi",-1,1],["ati",-1,1],["iti",-1,1],["eresti",-1,1],["iresti",-1,1],["uti",-1,1],["avi",-1,1],["evi",-1,1],["ivi",-1,1],["isco",-1,1],["ando",-1,1],["endo",-1,1],["Yamo",-1,1],["iamo",-1,1],["avamo",-1,1],["evamo",-1,1],["ivamo",-1,1],["eremo",-1,1],["iremo",-1,1],["assimo",-1,1],["ammo",-1,1],["emmo",-1,1],["eremmo",54,1],["iremmo",54,1],["immo",-1,1],["ano",-1,1],["iscano",58,1],["avano",58,1],["evano",58,1],["ivano",58,1],["eranno",-1,1],["iranno", +-1,1],["ono",-1,1],["iscono",65,1],["arono",65,1],["erono",65,1],["irono",65,1],["erebbero",-1,1],["irebbero",-1,1],["assero",-1,1],["essero",-1,1],["issero",-1,1],["ato",-1,1],["ito",-1,1],["uto",-1,1],["avo",-1,1],["evo",-1,1],["ivo",-1,1],["ar",-1,1],["ir",-1,1],["er\u00e0",-1,1],["ir\u00e0",-1,1],["er\u00f2",-1,1],["ir\u00f2",-1,1]],r=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,128,128,8,2,1],p=[17,65,0,0,0,0,0,0,0,0,0,0,0,0,0,128,128,8,2],k=[17],v=0,w=0,y=0;this.m=function(){a:{var b=a.cursor;var c;a.c= +a.cursor;!a.n("divano")||a.cursor=a.a)break f;a.cursor++}break e}a.cursor=d;if(!a.i(r,97,249))break d;f:for(;;){if(a.k(r,97,249))break f;if(a.cursor>=a.a)break d;a.cursor++}}break c}a.cursor=c;if(!a.k(r,97,249))break b;d:{c=a.cursor;e:if(a.k(r,97,249)){f:for(;;){if(a.i(r, +97,249))break f;if(a.cursor>=a.a)break e;a.cursor++}break d}a.cursor=c;if(!a.i(r,97,249))break b;if(a.cursor>=a.a)break b;a.cursor++}}y=a.cursor}a.cursor=b;b=a.cursor;b:{c:for(;;){if(a.i(r,97,249))break c;if(a.cursor>=a.a)break b;a.cursor++}c:for(;;){if(a.k(r,97,249))break c;if(a.cursor>=a.a)break b;a.cursor++}w=a.cursor;c:for(;;){if(a.i(r,97,249))break c;if(a.cursor>=a.a)break b;a.cursor++}c:for(;;){if(a.k(r,97,249))break c;if(a.cursor>=a.a)break b;a.cursor++}v=a.cursor}a.cursor=b;a.f=a.cursor;a.cursor= +a.a;b=a.a-a.cursor;a.d=a.cursor;if(0!=a.g(n)&&(a.c=a.cursor,c=a.g(e),0!=c&&y<=a.cursor))switch(c){case 1:if(!a.e())break;break;case 
2:a.b("e")}a.cursor=a.a-b;b=a.a-a.cursor;c:{c=a.a-a.cursor;if(m())break c;a.cursor=a.a-c;a.cursor=a.a)break c;a.cursor++}continue}a.cursor=d;break}a.cursor=b}return f};this.stemWord=function(b){a.p(b);this.m(); +return a.j}};window.LithuanianStemmer=function(){function g(){var a;d.d=d.cursor;a=d.g(l);if(0!=a)switch(d.c=d.cursor,a){case 1:if(!d.b("t"))break;break;case 2:d.b("d")}}var d=new C,m=[["a",-1,-1],["ia",0,-1],["eria",1,-1],["osna",0,-1],["iosna",3,-1],["uosna",3,-1],["iuosna",5,-1],["ysna",0,-1],["\u0117sna",0,-1],["e",-1,-1],["ie",9,-1],["enie",10,-1],["erie",10,-1],["oje",9,-1],["ioje",13,-1],["uje",9,-1],["iuje",15,-1],["yje",9,-1],["enyje",17,-1],["eryje",17,-1],["\u0117je",9,-1],["ame",9,-1],["iame",21,-1], +["sime",9,-1],["ome",9,-1],["\u0117me",9,-1],["tum\u0117me",25,-1],["ose",9,-1],["iose",27,-1],["uose",27,-1],["iuose",29,-1],["yse",9,-1],["enyse",31,-1],["eryse",31,-1],["\u0117se",9,-1],["ate",9,-1],["iate",35,-1],["ite",9,-1],["kite",37,-1],["site",37,-1],["ote",9,-1],["tute",9,-1],["\u0117te",9,-1],["tum\u0117te",42,-1],["i",-1,-1],["ai",44,-1],["iai",45,-1],["eriai",46,-1],["ei",44,-1],["tumei",48,-1],["ki",44,-1],["imi",44,-1],["erimi",51,-1],["umi",44,-1],["iumi",53,-1],["si",44,-1],["asi", +55,-1],["iasi",56,-1],["esi",55,-1],["iesi",58,-1],["siesi",59,-1],["isi",55,-1],["aisi",61,-1],["eisi",61,-1],["tumeisi",63,-1],["uisi",61,-1],["osi",55,-1],["\u0117josi",66,-1],["uosi",66,-1],["iuosi",68,-1],["siuosi",69,-1],["usi",55,-1],["ausi",71,-1],["\u010diausi",72,-1],["\u0105si",55,-1],["\u0117si",55,-1],["\u0173si",55,-1],["t\u0173si",76,-1],["ti",44,-1],["enti",78,-1],["inti",78,-1],["oti",78,-1],["ioti",81,-1],["uoti",81,-1],["iuoti",83,-1],["auti",78,-1],["iauti",85,-1],["yti",78,-1], 
+["\u0117ti",78,-1],["tel\u0117ti",88,-1],["in\u0117ti",88,-1],["ter\u0117ti",88,-1],["ui",44,-1],["iui",92,-1],["eniui",93,-1],["oj",-1,-1],["\u0117j",-1,-1],["k",-1,-1],["am",-1,-1],["iam",98,-1],["iem",-1,-1],["im",-1,-1],["sim",101,-1],["om",-1,-1],["tum",-1,-1],["\u0117m",-1,-1],["tum\u0117m",105,-1],["an",-1,-1],["on",-1,-1],["ion",108,-1],["un",-1,-1],["iun",110,-1],["\u0117n",-1,-1],["o",-1,-1],["io",113,-1],["enio",114,-1],["\u0117jo",113,-1],["uo",113,-1],["s",-1,-1],["as",118,-1],["ias", +119,-1],["es",118,-1],["ies",121,-1],["is",118,-1],["ais",123,-1],["iais",124,-1],["tumeis",123,-1],["imis",123,-1],["enimis",127,-1],["omis",123,-1],["iomis",129,-1],["umis",123,-1],["\u0117mis",123,-1],["enis",123,-1],["asis",123,-1],["ysis",123,-1],["ams",118,-1],["iams",136,-1],["iems",118,-1],["ims",118,-1],["enims",139,-1],["erims",139,-1],["oms",118,-1],["ioms",142,-1],["ums",118,-1],["\u0117ms",118,-1],["ens",118,-1],["os",118,-1],["ios",147,-1],["uos",147,-1],["iuos",149,-1],["ers",118,-1], +["us",118,-1],["aus",152,-1],["iaus",153,-1],["ius",152,-1],["ys",118,-1],["enys",156,-1],["erys",156,-1],["\u0105s",118,-1],["i\u0105s",159,-1],["\u0117s",118,-1],["am\u0117s",161,-1],["iam\u0117s",162,-1],["im\u0117s",161,-1],["kim\u0117s",164,-1],["sim\u0117s",164,-1],["om\u0117s",161,-1],["\u0117m\u0117s",161,-1],["tum\u0117m\u0117s",168,-1],["at\u0117s",161,-1],["iat\u0117s",170,-1],["sit\u0117s",161,-1],["ot\u0117s",161,-1],["\u0117t\u0117s",161,-1],["tum\u0117t\u0117s",174,-1],["\u012fs",118, 
+-1],["\u016bs",118,-1],["t\u0173s",118,-1],["at",-1,-1],["iat",179,-1],["it",-1,-1],["sit",181,-1],["ot",-1,-1],["\u0117t",-1,-1],["tum\u0117t",184,-1],["u",-1,-1],["au",186,-1],["iau",187,-1],["\u010diau",188,-1],["iu",186,-1],["eniu",190,-1],["siu",190,-1],["y",-1,-1],["\u0105",-1,-1],["i\u0105",194,-1],["\u0117",-1,-1],["\u0119",-1,-1],["\u012f",-1,-1],["en\u012f",198,-1],["er\u012f",198,-1],["\u0173",-1,-1],["i\u0173",201,-1],["er\u0173",201,-1]],a=[["ing",-1,-1],["aj",-1,-1],["iaj",1,-1],["iej", +-1,-1],["oj",-1,-1],["ioj",4,-1],["uoj",4,-1],["iuoj",6,-1],["auj",-1,-1],["\u0105j",-1,-1],["i\u0105j",9,-1],["\u0117j",-1,-1],["\u0173j",-1,-1],["i\u0173j",12,-1],["ok",-1,-1],["iok",14,-1],["iuk",-1,-1],["uliuk",16,-1],["u\u010diuk",16,-1],["i\u0161k",-1,-1],["iul",-1,-1],["yl",-1,-1],["\u0117l",-1,-1],["am",-1,-1],["dam",23,-1],["jam",23,-1],["zgan",-1,-1],["ain",-1,-1],["esn",-1,-1],["op",-1,-1],["iop",29,-1],["ias",-1,-1],["ies",-1,-1],["ais",-1,-1],["iais",33,-1],["os",-1,-1],["ios",35,-1], +["uos",35,-1],["iuos",37,-1],["aus",-1,-1],["iaus",39,-1],["\u0105s",-1,-1],["i\u0105s",41,-1],["\u0119s",-1,-1],["ut\u0117ait",-1,-1],["ant",-1,-1],["iant",45,-1],["siant",46,-1],["int",-1,-1],["ot",-1,-1],["uot",49,-1],["iuot",50,-1],["yt",-1,-1],["\u0117t",-1,-1],["yk\u0161t",-1,-1],["iau",-1,-1],["dav",-1,-1],["sv",-1,-1],["\u0161v",-1,-1],["yk\u0161\u010d",-1,-1],["\u0119",-1,-1],["\u0117j\u0119",60,-1]],b=[["ojime",-1,7],["\u0117jime",-1,3],["avime",-1,6],["okate",-1,8],["aite",-1,1],["uote", +-1,2],["asius",-1,5],["okat\u0117s",-1,8],["ait\u0117s",-1,1],["uot\u0117s",-1,2],["esiu",-1,4]],l=[["\u010d",-1,1],["d\u017e",-1,2]],n=[["gd",-1,1]],e=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,64,1,0,64,0,0,0,0,0,0,0,4,4],h=0;this.m=function(){h=d.a;var l=d.cursor;a:{var t=d.cursor;var s=d.cursor;d.n("a")?(d.cursor=s,6>=d.j.length?d.cursor=t:(s=d.cursor+1,d.cursor=s>d.a?t:s)):d.cursor=t;b:for(;;){if(d.i(e,97,371))break b;if(d.cursor>=d.a)break 
a;d.cursor++}b:for(;;){if(d.k(e,97,371))break b; +if(d.cursor>=d.a)break a;d.cursor++}h=d.cursor}d.cursor=l;d.f=d.cursor;d.cursor=d.a;l=d.a-d.cursor;d.d=d.cursor;t=d.g(b);if(0!=t)switch(d.c=d.cursor,t){case 1:if(!d.b("ait\u0117"))break;break;case 2:if(!d.b("uot\u0117"))break;break;case 3:if(!d.b("\u0117jimas"))break;break;case 4:if(!d.b("esys"))break;break;case 5:if(!d.b("asys"))break;break;case 6:if(!d.b("avimas"))break;break;case 7:if(!d.b("ojimas"))break;break;case 8:d.b("okat\u0117")}d.cursor=d.a-l;l=d.a-d.cursor;d.cursorg.a)){g.cursor=t;n=g.cursor;g.cursor=u;b:for(;;){u=g.cursor;if(g.i(b,97,248)){g.cursor=u;break b}g.cursor=u;if(g.cursor>=g.a)break a;g.cursor++}b:for(;;){if(g.k(b,97,248))break b;if(g.cursor>=g.a)break a;g.cursor++}e=g.cursor;e>=n||(e=n)}}g.cursor=h;g.f= +g.cursor;g.cursor=g.a;h=g.a-g.cursor;if(!(g.cursor=d.a)break b;d.cursor++}if(!d.b("Y"))return q;t=f;continue}d.cursor=k;break}d.cursor=p;s=r=d.a;p=d.cursor;a:{b:for(;;){if(d.i(h,97,121))break b;if(d.cursor>=d.a)break a; +d.cursor++}b:for(;;){if(d.k(h,97,121))break b;if(d.cursor>=d.a)break a;d.cursor++}r=d.cursor;b:for(;;){if(d.i(h,97,121))break b;if(d.cursor>=d.a)break a;d.cursor++}b:for(;;){if(d.k(h,97,121))break b;if(d.cursor>=d.a)break a;d.cursor++}s=d.cursor}d.cursor=p;d.f=d.cursor;d.cursor=d.a;p=d.a-d.cursor;d.d=d.cursor;k=d.g(m);if(0!=k)switch(d.c=d.cursor,k){case 1:if(!d.b("ss"))break;break;case 2:if(!d.b("i"))break;break;case 3:d.e()}d.cursor=d.a-p;p=d.a-d.cursor;a:if(d.d=d.cursor,k=d.g(b),0!=k)switch(d.c= +d.cursor,k){case 1:if(!(r<=d.cursor)||!d.b("ee"))break;break;case 2:k=d.a-d.cursor;b:for(;;){if(d.l(h,97,121))break b;if(d.cursor<=d.f)break a;d.cursor--}d.cursor=d.a-k;if(!d.e())break;v=d.a-d.cursor;k=d.g(a);d.cursor=d.a-v;switch(k){case 1:k=d.cursor;d.r(d.cursor,d.cursor,"e");d.cursor=k;break;case 2:d.d=d.cursor;if(d.cursor<=d.f)break;d.cursor--;d.c=d.cursor;if(!d.e())break;break;case 
3:if(d.cursor!=r)break;k=d.a-d.cursor;if(!g())break;d.cursor=d.a-k;k=d.cursor;d.r(d.cursor,d.cursor,"e");d.cursor= +k}}d.cursor=d.a-p;p=d.a-d.cursor;a:{d.d=d.cursor;b:{k=d.a-d.cursor;if(d.h("y"))break b;d.cursor=d.a-k;if(!d.h("Y"))break a}d.c=d.cursor;b:for(;;){if(d.l(h,97,121))break b;if(d.cursor<=d.f)break a;d.cursor--}d.b("i")}d.cursor=d.a-p;p=d.a-d.cursor;d.d=d.cursor;k=d.g(l);if(0!=k&&(d.c=d.cursor,r<=d.cursor))switch(k){case 1:if(!d.b("tion"))break;break;case 2:if(!d.b("ence"))break;break;case 3:if(!d.b("ance"))break;break;case 4:if(!d.b("able"))break;break;case 5:if(!d.b("ent"))break;break;case 6:if(!d.b("e"))break; +break;case 7:if(!d.b("ize"))break;break;case 8:if(!d.b("ate"))break;break;case 9:if(!d.b("al"))break;break;case 10:if(!d.b("ful"))break;break;case 11:if(!d.b("ous"))break;break;case 12:if(!d.b("ive"))break;break;case 13:d.b("ble")}d.cursor=d.a-p;p=d.a-d.cursor;d.d=d.cursor;k=d.g(n);if(0!=k&&(d.c=d.cursor,r<=d.cursor))switch(k){case 1:if(!d.b("al"))break;break;case 2:if(!d.b("ic"))break;break;case 3:d.e()}d.cursor=d.a-p;p=d.a-d.cursor;d.d=d.cursor;k=d.g(e);if(0!=k&&(d.c=d.cursor,s<=d.cursor))switch(k){case 1:if(!d.e())break; +break;case 2:a:{k=d.a-d.cursor;if(d.h("s"))break a;d.cursor=d.a-k;if(!d.h("t"))break}d.e()}d.cursor=d.a-p;p=d.a-d.cursor;a:if(d.d=d.cursor,d.h("e")){d.c=d.cursor;b:{if(s<=d.cursor)break b;d.cursor=d.a-(d.a-d.cursor);if(!(r<=d.cursor))break a;k=d.a-d.cursor;if(g())break a;d.cursor=d.a-k}d.e()}d.cursor=d.a-p;p=d.a-d.cursor;d.d=d.cursor;d.h("l")&&(d.c=d.cursor,!(s<=d.cursor)||!d.h("l")||d.e());d.cursor=d.a-p;d.cursor=d.f;p=d.cursor;if(t)for(;;){k=d.cursor;b:{c:for(;;){v=d.cursor;d.c=d.cursor;if(d.n("Y")){d.d= +d.cursor;d.cursor=v;break c}d.cursor=v;if(d.cursor>=d.a)break b;d.cursor++}if(!d.b("y"))return q;continue}d.cursor=k;break}d.cursor=p;return f};this.stemWord=function(a){d.p(a);this.m();return d.j}};window.PortugueseStemmer=function(){function g(){return k<=a.cursor}function d(){var b;a.d=a.cursor;b=a.g(u);if(0==b)return 
q;a.c=a.cursor;switch(b){case 1:if(!g()||!a.e())return q;break;case 2:if(!g()||!a.b("log"))return q;break;case 3:if(!g()||!a.b("u"))return q;break;case 4:if(!g()||!a.b("ente"))return q;break;case 5:if(!(v<=a.cursor)||!a.e())return q;var d=a.a-a.cursor;a:if(a.d=a.cursor,b=a.g(n),0==b)a.cursor=a.a-d;else if(a.c=a.cursor,g()){if(!a.e())return q;switch(b){case 1:a.d=a.cursor;if(!a.h("at")){a.cursor= +a.a-d;break a}a.c=a.cursor;if(!g()){a.cursor=a.a-d;break a}if(!a.e())return q}}else a.cursor=a.a-d;break;case 6:if(!g()||!a.e())return q;b=a.a-a.cursor;a.d=a.cursor;if(0==a.g(e))a.cursor=a.a-b;else if(a.c=a.cursor,g()){if(!a.e())return q}else a.cursor=a.a-b;break;case 7:if(!g()||!a.e())return q;b=a.a-a.cursor;a.d=a.cursor;if(0==a.g(h))a.cursor=a.a-b;else if(a.c=a.cursor,g()){if(!a.e())return q}else a.cursor=a.a-b;break;case 8:if(!g()||!a.e())return q;b=a.a-a.cursor;a.d=a.cursor;if(a.h("at"))if(a.c= +a.cursor,g()){if(!a.e())return q}else a.cursor=a.a-b;else a.cursor=a.a-b;break;case 9:if(!(w<=a.cursor)||!a.h("e")||!a.b("ir"))return q}return f}function m(){if(a.cursor=a.a)break b; +a.cursor++}continue}a.cursor=c;break}a.cursor=e;k=v=w=a.a;e=a.cursor;a:{b:{g=a.cursor;c:if(a.i(p,97,250)){d:{c=a.cursor;e:if(a.k(p,97,250)){f:for(;;){if(a.i(p,97,250))break f;if(a.cursor>=a.a)break e;a.cursor++}break d}a.cursor=c;if(!a.i(p,97,250))break c;e:for(;;){if(a.k(p,97,250))break e;if(a.cursor>=a.a)break c;a.cursor++}}break b}a.cursor=g;if(!a.k(p,97,250))break a;c:{g=a.cursor;d:if(a.k(p,97,250)){e:for(;;){if(a.i(p,97,250))break e;if(a.cursor>=a.a)break d;a.cursor++}break c}a.cursor=g;if(!a.i(p, +97,250))break a;if(a.cursor>=a.a)break a;a.cursor++}}w=a.cursor}a.cursor=e;e=a.cursor;a:{b:for(;;){if(a.i(p,97,250))break b;if(a.cursor>=a.a)break a;a.cursor++}b:for(;;){if(a.k(p,97,250))break b;if(a.cursor>=a.a)break a;a.cursor++}v=a.cursor;b:for(;;){if(a.i(p,97,250))break b;if(a.cursor>=a.a)break a;a.cursor++}b:for(;;){if(a.k(p,97,250))break b;if(a.cursor>=a.a)break 
a;a.cursor++}k=a.cursor}a.cursor=e;a.f=a.cursor;a.cursor=a.a;e=a.a-a.cursor;b:{g=a.a-a.cursor;c:{c=a.a-a.cursor;d:{var h=a.a-a.cursor; +if(d())break d;a.cursor=a.a-h;if(!m())break c}a.cursor=a.a-c;g=a.a-a.cursor;a.d=a.cursor;if(a.h("i")&&(a.c=a.cursor,c=a.a-a.cursor,a.h("c")&&(a.cursor=a.a-c,w<=a.cursor&&!a.e())))return q;a.cursor=a.a-g;break b}a.cursor=a.a-g;a.d=a.cursor;0!=a.g(s)&&(a.c=a.cursor,!(w<=a.cursor)||a.e())}a.cursor=a.a-e;e=a.a-a.cursor;a.d=a.cursor;g=a.g(r);if(0!=g)switch(a.c=a.cursor,g){case 1:if(!(w<=a.cursor)||!a.e())break;a.d=a.cursor;a:{g=a.a-a.cursor;if(a.h("u")&&(a.c=a.cursor,c=a.a-a.cursor,a.h("g"))){a.cursor= +a.a-c;break a}a.cursor=a.a-g;if(!a.h("i"))break;a.c=a.cursor;g=a.a-a.cursor;if(!a.h("c"))break;a.cursor=a.a-g}if(!(w<=a.cursor)||!a.e())break;break;case 2:a.b("c")}a.cursor=a.a-e;a.cursor=a.f;e=a.cursor;a:for(;;){c=a.cursor;b:{a.c=a.cursor;g=a.o(l);a.d=a.cursor;switch(g){case 1:if(!a.b("\u00e3"))break a;break;case 2:if(!a.b("\u00f5"))break a;break;case 3:if(a.cursor>=a.a)break b;a.cursor++}continue}a.cursor=c;break}a.cursor=e;return f};this.stemWord=function(b){a.p(b);this.m();return a.j}};window.RomanianStemmer=function(){function g(){var b,d=a.a-a.cursor;a.d=a.cursor;b=a.g(e);if(0==b)return q;a.c=a.cursor;if(!(k<=a.cursor))return q;switch(b){case 1:if(!a.b("abil"))return q;break;case 2:if(!a.b("ibil"))return q;break;case 3:if(!a.b("iv"))return q;break;case 4:if(!a.b("ic"))return q;break;case 5:if(!a.b("at"))return q;break;case 6:if(!a.b("it"))return q}r=f;a.cursor=a.a-d;return f}function d(){var b;for(r=q;;){b=a.a-a.cursor;if(g())continue;a.cursor=a.a-b;break}a.d=a.cursor;b=a.g(h); +if(0!=b&&(a.c=a.cursor,p<=a.cursor)){switch(b){case 1:if(!a.e())return;break;case 2:if(!a.h("\u021b"))return;a.c=a.cursor;if(!a.b("t"))return;break;case 3:if(!a.b("ist"))return}r=f}}function m(){var b;if(!(a.cursor=a.a)break 
c;a.cursor++}continue}a.cursor=h;break}a.cursor=g}e=a.cursor;a:for(;;){g=a.cursor;b:{c:for(;;){h=a.cursor;d:if(a.i(s,97,259)){a.c=a.cursor;e:{c=a.cursor;if(a.n("u")&&(a.d=a.cursor,a.i(s,97,259))){if(!a.b("U"))break a;break e}a.cursor=c;if(!a.n("i"))break d;a.d=a.cursor;if(!a.i(s,97,259))break d;if(!a.b("I"))break a}a.cursor=h;break c}a.cursor=h;if(a.cursor>=a.a)break b;a.cursor++}continue}a.cursor=g;break}a.cursor=e;p=k=v=a.a;e=a.cursor;a:{b:{g=a.cursor; +c:if(a.i(s,97,259)){d:{h=a.cursor;e:if(a.k(s,97,259)){f:for(;;){if(a.i(s,97,259))break f;if(a.cursor>=a.a)break e;a.cursor++}break d}a.cursor=h;if(!a.i(s,97,259))break c;e:for(;;){if(a.k(s,97,259))break e;if(a.cursor>=a.a)break c;a.cursor++}}break b}a.cursor=g;if(!a.k(s,97,259))break a;c:{g=a.cursor;d:if(a.k(s,97,259)){e:for(;;){if(a.i(s,97,259))break e;if(a.cursor>=a.a)break d;a.cursor++}break c}a.cursor=g;if(!a.i(s,97,259))break a;if(a.cursor>=a.a)break a;a.cursor++}}v=a.cursor}a.cursor=e;e=a.cursor; +a:{b:for(;;){if(a.i(s,97,259))break b;if(a.cursor>=a.a)break a;a.cursor++}b:for(;;){if(a.k(s,97,259))break b;if(a.cursor>=a.a)break a;a.cursor++}k=a.cursor;b:for(;;){if(a.i(s,97,259))break b;if(a.cursor>=a.a)break a;a.cursor++}b:for(;;){if(a.k(s,97,259))break b;if(a.cursor>=a.a)break a;a.cursor++}p=a.cursor}a.cursor=e;a.f=a.cursor;a.cursor=a.a;e=a.a-a.cursor;a.d=a.cursor;g=a.g(n);if(0!=g&&(a.c=a.cursor,k<=a.cursor))switch(g){case 1:if(!a.e())break;break;case 2:if(!a.b("a"))break;break;case 3:if(!a.b("e"))break; +break;case 4:if(!a.b("i"))break;break;case 5:g=a.a-a.cursor;if(a.h("ab"))break;a.cursor=a.a-g;if(!a.b("i"))break;break;case 6:if(!a.b("at"))break;break;case 7:a.b("a\u021bi")}a.cursor=a.a-e;e=a.a-a.cursor;d();a.cursor=a.a-e;e=a.a-a.cursor;b:{if(r)break b;a.cursor=a.a-(a.a-a.cursor);m()}a.cursor=a.a-e;e=a.a-a.cursor;a.d=a.cursor;0!=a.g(t)&&(a.c=a.cursor,!(v<=a.cursor)||a.e());a.cursor=a.a-e;a.cursor=a.f;e=a.cursor;a:for(;;){h=a.cursor;b:{a.c=a.cursor;g=a.o(l);a.d=a.cursor;switch(g){case 1:if(!a.b("i"))break 
a; +break;case 2:if(!a.b("u"))break a;break;case 3:if(a.cursor>=a.a)break b;a.cursor++}continue}a.cursor=h;break}a.cursor=e;return f};this.stemWord=function(b){a.p(b);this.m();return a.j}};window.RussianStemmer=function(){function g(){var d;a.d=a.cursor;d=a.g(b);if(0==d)return q;a.c=a.cursor;switch(d){case 1:a:{d=a.a-a.cursor;if(a.h("\u0430"))break a;a.cursor=a.a-d;if(!a.h("\u044f"))return q}if(!a.e())return q;break;case 2:if(!a.e())return q}return f}function d(){var b,d;a.d=a.cursor;0==a.g(l)?d=q:(a.c=a.cursor,d=!a.e()?q:f);if(!d)return q;d=a.a-a.cursor;a:if(a.d=a.cursor,b=a.g(n),0==b)a.cursor=a.a-d;else switch(a.c=a.cursor,b){case 1:b:{b=a.a-a.cursor;if(a.h("\u0430"))break b;a.cursor= +a.a-b;if(!a.h("\u044f")){a.cursor=a.a-d;break a}}if(!a.e())return q;break;case 2:if(!a.e())return q}return f}function m(){var b;a.d=a.cursor;b=a.g(h);if(0==b)return q;a.c=a.cursor;switch(b){case 1:a:{b=a.a-a.cursor;if(a.h("\u0430"))break a;a.cursor=a.a-b;if(!a.h("\u044f"))return q}if(!a.e())return q;break;case 2:if(!a.e())return q}return f}var a=new C,b=[["\u0432",-1,1],["\u0438\u0432",0,2],["\u044b\u0432",0,2],["\u0432\u0448\u0438",-1,1],["\u0438\u0432\u0448\u0438",3,2],["\u044b\u0432\u0448\u0438", +3,2],["\u0432\u0448\u0438\u0441\u044c",-1,1],["\u0438\u0432\u0448\u0438\u0441\u044c",6,2],["\u044b\u0432\u0448\u0438\u0441\u044c",6,2]],l=[["\u0435\u0435",-1,1],["\u0438\u0435",-1,1],["\u043e\u0435",-1,1],["\u044b\u0435",-1,1],["\u0438\u043c\u0438",-1,1],["\u044b\u043c\u0438",-1,1],["\u0435\u0439",-1,1],["\u0438\u0439",-1,1],["\u043e\u0439",-1,1],["\u044b\u0439",-1,1],["\u0435\u043c",-1,1],["\u0438\u043c",-1,1],["\u043e\u043c",-1,1],["\u044b\u043c",-1,1],["\u0435\u0433\u043e",-1,1],["\u043e\u0433\u043e", 
+-1,1],["\u0435\u043c\u0443",-1,1],["\u043e\u043c\u0443",-1,1],["\u0438\u0445",-1,1],["\u044b\u0445",-1,1],["\u0435\u044e",-1,1],["\u043e\u044e",-1,1],["\u0443\u044e",-1,1],["\u044e\u044e",-1,1],["\u0430\u044f",-1,1],["\u044f\u044f",-1,1]],n=[["\u0435\u043c",-1,1],["\u043d\u043d",-1,1],["\u0432\u0448",-1,1],["\u0438\u0432\u0448",2,2],["\u044b\u0432\u0448",2,2],["\u0449",-1,1],["\u044e\u0449",5,1],["\u0443\u044e\u0449",6,2]],e=[["\u0441\u044c",-1,1],["\u0441\u044f",-1,1]],h=[["\u043b\u0430",-1,1],["\u0438\u043b\u0430", +0,2],["\u044b\u043b\u0430",0,2],["\u043d\u0430",-1,1],["\u0435\u043d\u0430",3,2],["\u0435\u0442\u0435",-1,1],["\u0438\u0442\u0435",-1,2],["\u0439\u0442\u0435",-1,1],["\u0435\u0439\u0442\u0435",7,2],["\u0443\u0439\u0442\u0435",7,2],["\u043b\u0438",-1,1],["\u0438\u043b\u0438",10,2],["\u044b\u043b\u0438",10,2],["\u0439",-1,1],["\u0435\u0439",13,2],["\u0443\u0439",13,2],["\u043b",-1,1],["\u0438\u043b",16,2],["\u044b\u043b",16,2],["\u0435\u043c",-1,1],["\u0438\u043c",-1,2],["\u044b\u043c",-1,2],["\u043d", +-1,1],["\u0435\u043d",22,2],["\u043b\u043e",-1,1],["\u0438\u043b\u043e",24,2],["\u044b\u043b\u043e",24,2],["\u043d\u043e",-1,1],["\u0435\u043d\u043e",27,2],["\u043d\u043d\u043e",27,1],["\u0435\u0442",-1,1],["\u0443\u0435\u0442",30,2],["\u0438\u0442",-1,2],["\u044b\u0442",-1,2],["\u044e\u0442",-1,1],["\u0443\u044e\u0442",34,2],["\u044f\u0442",-1,2],["\u043d\u044b",-1,1],["\u0435\u043d\u044b",37,2],["\u0442\u044c",-1,1],["\u0438\u0442\u044c",39,2],["\u044b\u0442\u044c",39,2],["\u0435\u0448\u044c",-1, 
+1],["\u0438\u0448\u044c",-1,2],["\u044e",-1,2],["\u0443\u044e",44,2]],u=[["\u0430",-1,1],["\u0435\u0432",-1,1],["\u043e\u0432",-1,1],["\u0435",-1,1],["\u0438\u0435",3,1],["\u044c\u0435",3,1],["\u0438",-1,1],["\u0435\u0438",6,1],["\u0438\u0438",6,1],["\u0430\u043c\u0438",6,1],["\u044f\u043c\u0438",6,1],["\u0438\u044f\u043c\u0438",10,1],["\u0439",-1,1],["\u0435\u0439",12,1],["\u0438\u0435\u0439",13,1],["\u0438\u0439",12,1],["\u043e\u0439",12,1],["\u0430\u043c",-1,1],["\u0435\u043c",-1,1],["\u0438\u0435\u043c", +18,1],["\u043e\u043c",-1,1],["\u044f\u043c",-1,1],["\u0438\u044f\u043c",21,1],["\u043e",-1,1],["\u0443",-1,1],["\u0430\u0445",-1,1],["\u044f\u0445",-1,1],["\u0438\u044f\u0445",26,1],["\u044b",-1,1],["\u044c",-1,1],["\u044e",-1,1],["\u0438\u044e",30,1],["\u044c\u044e",30,1],["\u044f",-1,1],["\u0438\u044f",33,1],["\u044c\u044f",33,1]],t=[["\u043e\u0441\u0442",-1,1],["\u043e\u0441\u0442\u044c",-1,1]],s=[["\u0435\u0439\u0448\u0435",-1,1],["\u043d",-1,2],["\u0435\u0439\u0448",-1,1],["\u044c",-1,3]],r= +[33,65,8,232],p=0,k=0;this.m=function(){var b=a.cursor;for(;;){var h=a.cursor;b:{c:for(;;){var l=a.cursor;a.c=a.cursor;if(a.n("\u0451")){a.d=a.cursor;a.cursor=l;break c}a.cursor=l;if(a.cursor>=a.a)break b;a.cursor++}if(!a.b("\u0435"))return q;continue}a.cursor=h;break}a.cursor=b;p=k=a.a;b=a.cursor;a:{b:for(;;){if(a.i(r,1072,1103))break b;if(a.cursor>=a.a)break a;a.cursor++}k=a.cursor;b:for(;;){if(a.k(r,1072,1103))break b;if(a.cursor>=a.a)break a;a.cursor++}b:for(;;){if(a.i(r,1072,1103))break b;if(a.cursor>= +a.a)break a;a.cursor++}b:for(;;){if(a.k(r,1072,1103))break b;if(a.cursor>=a.a)break a;a.cursor++}p=a.cursor}a.cursor=b;a.f=a.cursor;a.cursor=a.a;if(a.cursor=a.a)break b;a.cursor++}continue}a.cursor=g;break}a.cursor=e}function d(){var b=a.cursor;for(;;){var d=a.cursor;b:{c:for(;;){var e=a.cursor;if(a.i(t,98,382)&&(a.c=a.cursor,a.n("ije")&&(a.d=a.cursor,a.i(t,98,382)))){if(!a.b("e"))return;a.cursor=e;break c}a.cursor=e;if(a.cursor>= +a.a)break 
b;a.cursor++}continue}a.cursor=d;break}a.cursor=b;b=a.cursor;for(;;){d=a.cursor;b:{c:for(;;){e=a.cursor;if(a.i(t,98,382)&&(a.c=a.cursor,a.n("je")&&(a.d=a.cursor,a.i(t,98,382)))){if(!a.b("e"))return;a.cursor=e;break c}a.cursor=e;if(a.cursor>=a.a)break b;a.cursor++}continue}a.cursor=d;break}a.cursor=b;b=a.cursor;for(;;){d=a.cursor;b:{c:for(;;){e=a.cursor;a.c=a.cursor;if(a.n("dj")){a.d=a.cursor;if(!a.b("\u0111"))return;a.cursor=e;break c}a.cursor=e;if(a.cursor>=a.a)break b;a.cursor++}continue}a.cursor= +d;break}a.cursor=b}function m(){var b;a.d=a.cursor;b=a.g(n);if(0==b)return q;a.c=a.cursor;if(!(r<=a.cursor))return q;switch(b){case 1:if(!a.b("sk"))return q;break;case 2:if(!a.b("\u0161k"))return q;break;case 3:if(!a.b("stv"))return q;break;case 4:if(!a.b("\u0161tv"))return q;break;case 5:if(!a.b("tanij"))return q;break;case 6:if(!a.b("manij"))return q;break;case 7:if(!a.b("panij"))return q;break;case 8:if(!a.b("ranij"))return q;break;case 9:if(!a.b("ganij"))return q;break;case 10:if(!a.b("an"))return q; +break;case 11:if(!a.b("in"))return q;break;case 12:if(!a.b("on"))return q;break;case 13:if(!a.b("n"))return q;break;case 14:if(!a.b("a\u0107"))return q;break;case 15:if(!a.b("e\u0107"))return q;break;case 16:if(!a.b("u\u0107"))return q;break;case 17:if(!a.b("ugov"))return q;break;case 18:if(!a.b("ug"))return q;break;case 19:if(!a.b("log"))return q;break;case 20:if(!a.b("g"))return q;break;case 21:if(!a.b("rari"))return q;break;case 22:if(!a.b("oti"))return q;break;case 23:if(!a.b("si"))return q;break; +case 24:if(!a.b("li"))return q;break;case 25:if(!a.b("uj"))return q;break;case 26:if(!a.b("caj"))return q;break;case 27:if(!a.b("\u010daj"))return q;break;case 28:if(!a.b("\u0107aj"))return q;break;case 29:if(!a.b("\u0111aj"))return q;break;case 30:if(!a.b("laj"))return q;break;case 31:if(!a.b("raj"))return q;break;case 32:if(!a.b("bij"))return q;break;case 33:if(!a.b("cij"))return q;break;case 34:if(!a.b("dij"))return q;break;case 35:if(!a.b("lij"))return q;break;case 
36:if(!a.b("nij"))return q; +break;case 37:if(!a.b("mij"))return q;break;case 38:if(!a.b("\u017eij"))return q;break;case 39:if(!a.b("gij"))return q;break;case 40:if(!a.b("fij"))return q;break;case 41:if(!a.b("pij"))return q;break;case 42:if(!a.b("rij"))return q;break;case 43:if(!a.b("sij"))return q;break;case 44:if(!a.b("tij"))return q;break;case 45:if(!a.b("zij"))return q;break;case 46:if(!a.b("nal"))return q;break;case 47:if(!a.b("ijal"))return q;break;case 48:if(!a.b("ozil"))return q;break;case 49:if(!a.b("olov"))return q; +break;case 50:if(!a.b("ol"))return q;break;case 51:if(!a.b("lem"))return q;break;case 52:if(!a.b("ram"))return q;break;case 53:if(!a.b("ar"))return q;break;case 54:if(!a.b("dr"))return q;break;case 55:if(!a.b("er"))return q;break;case 56:if(!a.b("or"))return q;break;case 57:if(!a.b("es"))return q;break;case 58:if(!a.b("is"))return q;break;case 59:if(!a.b("ta\u0161"))return q;break;case 60:if(!a.b("na\u0161"))return q;break;case 61:if(!a.b("ja\u0161"))return q;break;case 62:if(!a.b("ka\u0161"))return q; +break;case 63:if(!a.b("ba\u0161"))return q;break;case 64:if(!a.b("ga\u0161"))return q;break;case 65:if(!a.b("va\u0161"))return q;break;case 66:if(!a.b("e\u0161"))return q;break;case 67:if(!a.b("i\u0161"))return q;break;case 68:if(!a.b("ikat"))return q;break;case 69:if(!a.b("lat"))return q;break;case 70:if(!a.b("et"))return q;break;case 71:if(!a.b("est"))return q;break;case 72:if(!a.b("ist"))return q;break;case 73:if(!a.b("kst"))return q;break;case 74:if(!a.b("ost"))return q;break;case 75:if(!a.b("i\u0161t"))return q; +break;case 76:if(!a.b("ova"))return q;break;case 77:if(!a.b("av"))return q;break;case 78:if(!a.b("ev"))return q;break;case 79:if(!a.b("iv"))return q;break;case 80:if(!a.b("ov"))return q;break;case 81:if(!a.b("mov"))return q;break;case 82:if(!a.b("lov"))return q;break;case 83:if(!a.b("el"))return q;break;case 84:if(!a.b("anj"))return q;break;case 85:if(!a.b("enj"))return q;break;case 86:if(!a.b("\u0161nj"))return q;break;case 
87:if(!a.b("en"))return q;break;case 88:if(!a.b("\u0161n"))return q;break; +case 89:if(!a.b("\u010din"))return q;break;case 90:if(!a.b("ro\u0161i"))return q;break;case 91:if(!a.b("o\u0161"))return q;break;case 92:if(!a.b("evit"))return q;break;case 93:if(!a.b("ovit"))return q;break;case 94:if(!a.b("ast"))return q;break;case 95:if(!a.b("k"))return q;break;case 96:if(!a.b("eva"))return q;break;case 97:if(!a.b("ava"))return q;break;case 98:if(!a.b("iva"))return q;break;case 99:if(!a.b("uva"))return q;break;case 100:if(!a.b("ir"))return q;break;case 101:if(!a.b("a\u010d"))return q; +break;case 102:if(!a.b("a\u010da"))return q;break;case 103:if(!a.b("ni"))return q;break;case 104:if(!a.b("a"))return q;break;case 105:if(!a.b("ur"))return q;break;case 106:if(!a.b("astaj"))return q;break;case 107:if(!a.b("istaj"))return q;break;case 108:if(!a.b("ostaj"))return q;break;case 109:if(!a.b("aj"))return q;break;case 110:if(!a.b("asta"))return q;break;case 111:if(!a.b("ista"))return q;break;case 112:if(!a.b("osta"))return q;break;case 113:if(!a.b("ta"))return q;break;case 114:if(!a.b("inj"))return q; +break;case 115:if(!a.b("as"))return q;break;case 116:if(!a.b("i"))return q;break;case 117:if(!a.b("lu\u010d"))return q;break;case 118:if(!a.b("jeti"))return q;break;case 119:if(!a.b("e"))return q;break;case 120:if(!a.b("at"))return q;break;case 121:if(!p||!a.b("luc"))return q;break;case 122:if(!p||!a.b("snj"))return q;break;case 123:if(!p||!a.b("os"))return q;break;case 124:if(!p||!a.b("ac"))return q;break;case 125:if(!p||!a.b("ec"))return q;break;case 126:if(!p||!a.b("uc"))return q;break;case 127:if(!p|| +!a.b("rosi"))return q;break;case 128:if(!p||!a.b("aca"))return q;break;case 129:if(!p||!a.b("jas"))return q;break;case 130:if(!p||!a.b("tas"))return q;break;case 131:if(!p||!a.b("gas"))return q;break;case 132:if(!p||!a.b("nas"))return q;break;case 133:if(!p||!a.b("kas"))return q;break;case 134:if(!p||!a.b("vas"))return q;break;case 135:if(!p||!a.b("bas"))return q;break;case 
136:if(!p||!a.b("as"))return q;break;case 137:if(!p||!a.b("cin"))return q;break;case 138:if(!p||!a.b("astaj"))return q;break; +case 139:if(!p||!a.b("istaj"))return q;break;case 140:if(!p||!a.b("ostaj"))return q;break;case 141:if(!p||!a.b("asta"))return q;break;case 142:if(!p||!a.b("ista"))return q;break;case 143:if(!p||!a.b("osta"))return q;break;case 144:if(!p||!a.b("ava"))return q;break;case 145:if(!p||!a.b("eva"))return q;break;case 146:if(!p||!a.b("iva"))return q;break;case 147:if(!p||!a.b("uva"))return q;break;case 148:if(!p||!a.b("ova"))return q;break;case 149:if(!p||!a.b("jeti"))return q;break;case 150:if(!p||!a.b("inj"))return q; +break;case 151:if(!p||!a.b("ist"))return q;break;case 152:if(!p||!a.b("es"))return q;break;case 153:if(!p||!a.b("et"))return q;break;case 154:if(!p||!a.b("is"))return q;break;case 155:if(!p||!a.b("ir"))return q;break;case 156:if(!p||!a.b("ur"))return q;break;case 157:if(!p||!a.b("uj"))return q;break;case 158:if(!p||!a.b("ni"))return q;break;case 159:if(!p||!a.b("sn"))return q;break;case 160:if(!p||!a.b("ta"))return q;break;case 161:if(!p||!a.b("a"))return q;break;case 162:if(!p||!a.b("i"))return q; +break;case 163:if(!p||!a.b("e"))return q;break;case 164:if(!p||!a.b("n"))return q}return f}var a=new C,b=[["\u0430",-1,1],["\u0431",-1,2],["\u0432",-1,3],["\u0433",-1,4],["\u0434",-1,5],["\u0435",-1,7],["\u0436",-1,8],["\u0437",-1,9],["\u0438",-1,10],["\u043a",-1,12],["\u043b",-1,13],["\u043c",-1,15],["\u043d",-1,16],["\u043e",-1,18],["\u043f",-1,19],["\u0440",-1,20],["\u0441",-1,21],["\u0442",-1,22],["\u0443",-1,24],["\u0444",-1,25],["\u0445",-1,26],["\u0446",-1,27],["\u0447",-1,28],["\u0448",-1, 
+30],["\u0452",-1,6],["\u0458",-1,11],["\u0459",-1,14],["\u045a",-1,17],["\u045b",-1,23],["\u045f",-1,29]],l=[["daba",-1,73],["ajaca",-1,12],["ejaca",-1,14],["ljaca",-1,13],["njaca",-1,85],["ojaca",-1,15],["alaca",-1,82],["elaca",-1,83],["olaca",-1,84],["maca",-1,75],["naca",-1,76],["raca",-1,81],["saca",-1,80],["vaca",-1,79],["\u0161aca",-1,18],["aoca",-1,82],["acaka",-1,55],["ajaka",-1,16],["ojaka",-1,17],["anaka",-1,78],["ataka",-1,58],["etaka",-1,59],["itaka",-1,60],["otaka",-1,61],["utaka",-1, +62],["a\u010daka",-1,54],["esama",-1,67],["izama",-1,87],["jacima",-1,5],["nicima",-1,23],["ticima",-1,24],["teticima",30,21],["zicima",-1,25],["atcima",-1,58],["utcima",-1,62],["\u010dcima",-1,74],["pesima",-1,2],["inzima",-1,19],["lozima",-1,1],["metara",-1,68],["centara",-1,69],["istara",-1,70],["ekata",-1,86],["anata",-1,53],["nstava",-1,22],["kustava",-1,29],["ajac",-1,12],["ejac",-1,14],["ljac",-1,13],["njac",-1,85],["anjac",49,11],["ojac",-1,15],["alac",-1,82],["elac",-1,83],["olac",-1,84], +["mac",-1,75],["nac",-1,76],["rac",-1,81],["sac",-1,80],["vac",-1,79],["\u0161ac",-1,18],["jebe",-1,88],["olce",-1,84],["kuse",-1,27],["rave",-1,42],["save",-1,52],["\u0161ave",-1,51],["baci",-1,89],["jaci",-1,5],["tvenici",-1,20],["snici",-1,26],["tetici",-1,21],["bojci",-1,4],["vojci",-1,3],["ojsci",-1,66],["atci",-1,58],["itci",-1,60],["utci",-1,62],["\u010dci",-1,74],["pesi",-1,2],["inzi",-1,19],["lozi",-1,1],["acak",-1,55],["usak",-1,57],["atak",-1,58],["etak",-1,59],["itak",-1,60],["otak",-1, 
+61],["utak",-1,62],["a\u010dak",-1,54],["u\u0161ak",-1,56],["izam",-1,87],["tican",-1,65],["cajan",-1,7],["\u010dajan",-1,6],["voljan",-1,77],["eskan",-1,63],["alan",-1,40],["bilan",-1,33],["gilan",-1,37],["nilan",-1,39],["rilan",-1,38],["silan",-1,36],["tilan",-1,34],["avilan",-1,35],["laran",-1,9],["eran",-1,8],["asan",-1,91],["esan",-1,10],["dusan",-1,31],["kusan",-1,28],["atan",-1,47],["pletan",-1,50],["tetan",-1,49],["antan",-1,32],["pravan",-1,44],["stavan",-1,43],["sivan",-1,46],["tivan",-1, +45],["ozan",-1,41],["ti\u010dan",-1,64],["a\u0161an",-1,90],["du\u0161an",-1,30],["metar",-1,68],["centar",-1,69],["istar",-1,70],["ekat",-1,86],["enat",-1,48],["oscu",-1,72],["o\u0161\u0107u",-1,71]],n=[["aca",-1,124],["eca",-1,125],["uca",-1,126],["ga",-1,20],["acega",3,124],["ecega",3,125],["ucega",3,126],["anjijega",3,84],["enjijega",3,85],["snjijega",3,122],["\u0161njijega",3,86],["kijega",3,95],["skijega",11,1],["\u0161kijega",11,2],["elijega",3,83],["nijega",3,13],["osijega",3,123],["atijega", +3,120],["evitijega",3,92],["ovitijega",3,93],["astijega",3,94],["avijega",3,77],["evijega",3,78],["ivijega",3,79],["ovijega",3,80],["o\u0161ijega",3,91],["anjega",3,84],["enjega",3,85],["snjega",3,122],["\u0161njega",3,86],["kega",3,95],["skega",30,1],["\u0161kega",30,2],["elega",3,83],["nega",3,13],["anega",34,10],["enega",34,87],["snega",34,159],["\u0161nega",34,88],["osega",3,123],["atega",3,120],["evitega",3,92],["ovitega",3,93],["astega",3,94],["avega",3,77],["evega",3,78],["ivega",3,79],["ovega", 
+3,80],["a\u0107ega",3,14],["e\u0107ega",3,15],["u\u0107ega",3,16],["o\u0161ega",3,91],["acoga",3,124],["ecoga",3,125],["ucoga",3,126],["anjoga",3,84],["enjoga",3,85],["snjoga",3,122],["\u0161njoga",3,86],["koga",3,95],["skoga",59,1],["\u0161koga",59,2],["loga",3,19],["eloga",62,83],["noga",3,13],["cinoga",64,137],["\u010dinoga",64,89],["osoga",3,123],["atoga",3,120],["evitoga",3,92],["ovitoga",3,93],["astoga",3,94],["avoga",3,77],["evoga",3,78],["ivoga",3,79],["ovoga",3,80],["a\u0107oga",3,14],["e\u0107oga", +3,15],["u\u0107oga",3,16],["o\u0161oga",3,91],["uga",3,18],["aja",-1,109],["caja",81,26],["laja",81,30],["raja",81,31],["\u0107aja",81,28],["\u010daja",81,27],["\u0111aja",81,29],["bija",-1,32],["cija",-1,33],["dija",-1,34],["fija",-1,40],["gija",-1,39],["anjija",-1,84],["enjija",-1,85],["snjija",-1,122],["\u0161njija",-1,86],["kija",-1,95],["skija",97,1],["\u0161kija",97,2],["lija",-1,24],["elija",100,83],["mija",-1,37],["nija",-1,13],["ganija",103,9],["manija",103,6],["panija",103,7],["ranija", +103,8],["tanija",103,5],["pija",-1,41],["rija",-1,42],["rarija",110,21],["sija",-1,23],["osija",112,123],["tija",-1,44],["atija",114,120],["evitija",114,92],["ovitija",114,93],["otija",114,22],["astija",114,94],["avija",-1,77],["evija",-1,78],["ivija",-1,79],["ovija",-1,80],["zija",-1,45],["o\u0161ija",-1,91],["\u017eija",-1,38],["anja",-1,84],["enja",-1,85],["snja",-1,122],["\u0161nja",-1,86],["ka",-1,95],["ska",131,1],["\u0161ka",131,2],["ala",-1,104],["acala",134,128],["astajala",134,106],["istajala", 
+134,107],["ostajala",134,108],["ijala",134,47],["injala",134,114],["nala",134,46],["irala",134,100],["urala",134,105],["tala",134,113],["astala",144,110],["istala",144,111],["ostala",144,112],["avala",134,97],["evala",134,96],["ivala",134,98],["ovala",134,76],["uvala",134,99],["a\u010dala",134,102],["ela",-1,83],["ila",-1,116],["acila",155,124],["lucila",155,121],["nila",155,103],["astanila",158,110],["istanila",158,111],["ostanila",158,112],["rosila",155,127],["jetila",155,118],["ozila",155,48], +["a\u010dila",155,101],["lu\u010dila",155,117],["ro\u0161ila",155,90],["ola",-1,50],["asla",-1,115],["nula",-1,13],["gama",-1,20],["logama",171,19],["ugama",171,18],["ajama",-1,109],["cajama",174,26],["lajama",174,30],["rajama",174,31],["\u0107ajama",174,28],["\u010dajama",174,27],["\u0111ajama",174,29],["bijama",-1,32],["cijama",-1,33],["dijama",-1,34],["fijama",-1,40],["gijama",-1,39],["lijama",-1,35],["mijama",-1,37],["nijama",-1,36],["ganijama",188,9],["manijama",188,6],["panijama",188,7],["ranijama", +188,8],["tanijama",188,5],["pijama",-1,41],["rijama",-1,42],["sijama",-1,43],["tijama",-1,44],["zijama",-1,45],["\u017eijama",-1,38],["alama",-1,104],["ijalama",200,47],["nalama",200,46],["elama",-1,119],["ilama",-1,116],["ramama",-1,52],["lemama",-1,51],["inama",-1,11],["cinama",207,137],["\u010dinama",207,89],["rama",-1,52],["arama",210,53],["drama",210,54],["erama",210,55],["orama",210,56],["basama",-1,135],["gasama",-1,131],["jasama",-1,129],["kasama",-1,133],["nasama",-1,132],["tasama",-1,130], 
+["vasama",-1,134],["esama",-1,152],["isama",-1,154],["etama",-1,70],["estama",-1,71],["istama",-1,72],["kstama",-1,73],["ostama",-1,74],["avama",-1,77],["evama",-1,78],["ivama",-1,79],["ba\u0161ama",-1,63],["ga\u0161ama",-1,64],["ja\u0161ama",-1,61],["ka\u0161ama",-1,62],["na\u0161ama",-1,60],["ta\u0161ama",-1,59],["va\u0161ama",-1,65],["e\u0161ama",-1,66],["i\u0161ama",-1,67],["lema",-1,51],["acima",-1,124],["ecima",-1,125],["ucima",-1,126],["ajima",-1,109],["cajima",245,26],["lajima",245,30],["rajima", +245,31],["\u0107ajima",245,28],["\u010dajima",245,27],["\u0111ajima",245,29],["bijima",-1,32],["cijima",-1,33],["dijima",-1,34],["fijima",-1,40],["gijima",-1,39],["anjijima",-1,84],["enjijima",-1,85],["snjijima",-1,122],["\u0161njijima",-1,86],["kijima",-1,95],["skijima",261,1],["\u0161kijima",261,2],["lijima",-1,35],["elijima",264,83],["mijima",-1,37],["nijima",-1,13],["ganijima",267,9],["manijima",267,6],["panijima",267,7],["ranijima",267,8],["tanijima",267,5],["pijima",-1,41],["rijima",-1,42], +["sijima",-1,43],["osijima",275,123],["tijima",-1,44],["atijima",277,120],["evitijima",277,92],["ovitijima",277,93],["astijima",277,94],["avijima",-1,77],["evijima",-1,78],["ivijima",-1,79],["ovijima",-1,80],["zijima",-1,45],["o\u0161ijima",-1,91],["\u017eijima",-1,38],["anjima",-1,84],["enjima",-1,85],["snjima",-1,122],["\u0161njima",-1,86],["kima",-1,95],["skima",293,1],["\u0161kima",293,2],["alima",-1,104],["ijalima",296,47],["nalima",296,46],["elima",-1,83],["ilima",-1,116],["ozilima",300,48], 
+["olima",-1,50],["lemima",-1,51],["nima",-1,13],["anima",304,10],["inima",304,11],["cinima",306,137],["\u010dinima",306,89],["onima",304,12],["arima",-1,53],["drima",-1,54],["erima",-1,55],["orima",-1,56],["basima",-1,135],["gasima",-1,131],["jasima",-1,129],["kasima",-1,133],["nasima",-1,132],["tasima",-1,130],["vasima",-1,134],["esima",-1,57],["isima",-1,58],["osima",-1,123],["atima",-1,120],["ikatima",324,68],["latima",324,69],["etima",-1,70],["evitima",-1,92],["ovitima",-1,93],["astima",-1,94], +["estima",-1,71],["istima",-1,72],["kstima",-1,73],["ostima",-1,74],["i\u0161tima",-1,75],["avima",-1,77],["evima",-1,78],["ajevima",337,109],["cajevima",338,26],["lajevima",338,30],["rajevima",338,31],["\u0107ajevima",338,28],["\u010dajevima",338,27],["\u0111ajevima",338,29],["ivima",-1,79],["ovima",-1,80],["govima",346,20],["ugovima",347,17],["lovima",346,82],["olovima",349,49],["movima",346,81],["onovima",346,12],["stvima",-1,3],["\u0161tvima",-1,4],["a\u0107ima",-1,14],["e\u0107ima",-1,15],["u\u0107ima", +-1,16],["ba\u0161ima",-1,63],["ga\u0161ima",-1,64],["ja\u0161ima",-1,61],["ka\u0161ima",-1,62],["na\u0161ima",-1,60],["ta\u0161ima",-1,59],["va\u0161ima",-1,65],["e\u0161ima",-1,66],["i\u0161ima",-1,67],["o\u0161ima",-1,91],["na",-1,13],["ana",368,10],["acana",369,128],["urana",369,105],["tana",369,113],["avana",369,97],["evana",369,96],["ivana",369,98],["uvana",369,99],["a\u010dana",369,102],["acena",368,124],["lucena",368,121],["a\u010dena",368,101],["lu\u010dena",368,117],["ina",368,11],["cina", 
+382,137],["anina",382,10],["\u010dina",382,89],["ona",368,12],["ara",-1,53],["dra",-1,54],["era",-1,55],["ora",-1,56],["basa",-1,135],["gasa",-1,131],["jasa",-1,129],["kasa",-1,133],["nasa",-1,132],["tasa",-1,130],["vasa",-1,134],["esa",-1,57],["isa",-1,58],["osa",-1,123],["ata",-1,120],["ikata",401,68],["lata",401,69],["eta",-1,70],["evita",-1,92],["ovita",-1,93],["asta",-1,94],["esta",-1,71],["ista",-1,72],["ksta",-1,73],["osta",-1,74],["nuta",-1,13],["i\u0161ta",-1,75],["ava",-1,77],["eva",-1, +78],["ajeva",415,109],["cajeva",416,26],["lajeva",416,30],["rajeva",416,31],["\u0107ajeva",416,28],["\u010dajeva",416,27],["\u0111ajeva",416,29],["iva",-1,79],["ova",-1,80],["gova",424,20],["ugova",425,17],["lova",424,82],["olova",427,49],["mova",424,81],["onova",424,12],["stva",-1,3],["\u0161tva",-1,4],["a\u0107a",-1,14],["e\u0107a",-1,15],["u\u0107a",-1,16],["ba\u0161a",-1,63],["ga\u0161a",-1,64],["ja\u0161a",-1,61],["ka\u0161a",-1,62],["na\u0161a",-1,60],["ta\u0161a",-1,59],["va\u0161a",-1,65], +["e\u0161a",-1,66],["i\u0161a",-1,67],["o\u0161a",-1,91],["ace",-1,124],["ece",-1,125],["uce",-1,126],["luce",448,121],["astade",-1,110],["istade",-1,111],["ostade",-1,112],["ge",-1,20],["loge",453,19],["uge",453,18],["aje",-1,104],["caje",456,26],["laje",456,30],["raje",456,31],["astaje",456,106],["istaje",456,107],["ostaje",456,108],["\u0107aje",456,28],["\u010daje",456,27],["\u0111aje",456,29],["ije",-1,116],["bije",466,32],["cije",466,33],["dije",466,34],["fije",466,40],["gije",466,39],["anjije", 
+466,84],["enjije",466,85],["snjije",466,122],["\u0161njije",466,86],["kije",466,95],["skije",476,1],["\u0161kije",476,2],["lije",466,35],["elije",479,83],["mije",466,37],["nije",466,13],["ganije",482,9],["manije",482,6],["panije",482,7],["ranije",482,8],["tanije",482,5],["pije",466,41],["rije",466,42],["sije",466,43],["osije",490,123],["tije",466,44],["atije",492,120],["evitije",492,92],["ovitije",492,93],["astije",492,94],["avije",466,77],["evije",466,78],["ivije",466,79],["ovije",466,80],["zije", +466,45],["o\u0161ije",466,91],["\u017eije",466,38],["anje",-1,84],["enje",-1,85],["snje",-1,122],["\u0161nje",-1,86],["uje",-1,25],["lucuje",508,121],["iruje",508,100],["lu\u010duje",508,117],["ke",-1,95],["ske",512,1],["\u0161ke",512,2],["ale",-1,104],["acale",515,128],["astajale",515,106],["istajale",515,107],["ostajale",515,108],["ijale",515,47],["injale",515,114],["nale",515,46],["irale",515,100],["urale",515,105],["tale",515,113],["astale",525,110],["istale",525,111],["ostale",525,112],["avale", +515,97],["evale",515,96],["ivale",515,98],["ovale",515,76],["uvale",515,99],["a\u010dale",515,102],["ele",-1,83],["ile",-1,116],["acile",536,124],["lucile",536,121],["nile",536,103],["rosile",536,127],["jetile",536,118],["ozile",536,48],["a\u010dile",536,101],["lu\u010dile",536,117],["ro\u0161ile",536,90],["ole",-1,50],["asle",-1,115],["nule",-1,13],["rame",-1,52],["leme",-1,51],["acome",-1,124],["ecome",-1,125],["ucome",-1,126],["anjome",-1,84],["enjome",-1,85],["snjome",-1,122],["\u0161njome",-1, 
+86],["kome",-1,95],["skome",558,1],["\u0161kome",558,2],["elome",-1,83],["nome",-1,13],["cinome",562,137],["\u010dinome",562,89],["osome",-1,123],["atome",-1,120],["evitome",-1,92],["ovitome",-1,93],["astome",-1,94],["avome",-1,77],["evome",-1,78],["ivome",-1,79],["ovome",-1,80],["a\u0107ome",-1,14],["e\u0107ome",-1,15],["u\u0107ome",-1,16],["o\u0161ome",-1,91],["ne",-1,13],["ane",578,10],["acane",579,128],["urane",579,105],["tane",579,113],["astane",582,110],["istane",582,111],["ostane",582,112], +["avane",579,97],["evane",579,96],["ivane",579,98],["uvane",579,99],["a\u010dane",579,102],["acene",578,124],["lucene",578,121],["a\u010dene",578,101],["lu\u010dene",578,117],["ine",578,11],["cine",595,137],["anine",595,10],["\u010dine",595,89],["one",578,12],["are",-1,53],["dre",-1,54],["ere",-1,55],["ore",-1,56],["ase",-1,161],["base",604,135],["acase",604,128],["gase",604,131],["jase",604,129],["astajase",608,138],["istajase",608,139],["ostajase",608,140],["injase",608,150],["kase",604,133],["nase", +604,132],["irase",604,155],["urase",604,156],["tase",604,130],["vase",604,134],["avase",618,144],["evase",618,145],["ivase",618,146],["ovase",618,148],["uvase",618,147],["ese",-1,57],["ise",-1,58],["acise",625,124],["lucise",625,121],["rosise",625,127],["jetise",625,149],["ose",-1,123],["astadose",630,141],["istadose",630,142],["ostadose",630,143],["ate",-1,104],["acate",634,128],["ikate",634,68],["late",634,69],["irate",634,100],["urate",634,105],["tate",634,113],["avate",634,97],["evate",634,96], 
+["ivate",634,98],["uvate",634,99],["a\u010date",634,102],["ete",-1,70],["astadete",646,110],["istadete",646,111],["ostadete",646,112],["astajete",646,106],["istajete",646,107],["ostajete",646,108],["ijete",646,116],["injete",646,114],["ujete",646,25],["lucujete",655,121],["irujete",655,100],["lu\u010dujete",655,117],["nete",646,13],["astanete",659,110],["istanete",659,111],["ostanete",659,112],["astete",646,115],["ite",-1,116],["acite",664,124],["lucite",664,121],["nite",664,13],["astanite",667,110], +["istanite",667,111],["ostanite",667,112],["rosite",664,127],["jetite",664,118],["astite",664,115],["evite",664,92],["ovite",664,93],["a\u010dite",664,101],["lu\u010dite",664,117],["ro\u0161ite",664,90],["ajte",-1,104],["urajte",679,105],["tajte",679,113],["astajte",681,106],["istajte",681,107],["ostajte",681,108],["avajte",679,97],["evajte",679,96],["ivajte",679,98],["uvajte",679,99],["ijte",-1,116],["lucujte",-1,121],["irujte",-1,100],["lu\u010dujte",-1,117],["aste",-1,94],["acaste",693,128],["astajaste", +693,106],["istajaste",693,107],["ostajaste",693,108],["injaste",693,114],["iraste",693,100],["uraste",693,105],["taste",693,113],["avaste",693,97],["evaste",693,96],["ivaste",693,98],["ovaste",693,76],["uvaste",693,99],["a\u010daste",693,102],["este",-1,71],["iste",-1,72],["aciste",709,124],["luciste",709,121],["niste",709,103],["rosiste",709,127],["jetiste",709,118],["a\u010diste",709,101],["lu\u010diste",709,117],["ro\u0161iste",709,90],["kste",-1,73],["oste",-1,74],["astadoste",719,110],["istadoste", 
+719,111],["ostadoste",719,112],["nuste",-1,13],["i\u0161te",-1,75],["ave",-1,77],["eve",-1,78],["ajeve",726,109],["cajeve",727,26],["lajeve",727,30],["rajeve",727,31],["\u0107ajeve",727,28],["\u010dajeve",727,27],["\u0111ajeve",727,29],["ive",-1,79],["ove",-1,80],["gove",735,20],["ugove",736,17],["love",735,82],["olove",738,49],["move",735,81],["onove",735,12],["a\u0107e",-1,14],["e\u0107e",-1,15],["u\u0107e",-1,16],["a\u010de",-1,101],["lu\u010de",-1,117],["a\u0161e",-1,104],["ba\u0161e",747,63], +["ga\u0161e",747,64],["ja\u0161e",747,61],["astaja\u0161e",750,106],["istaja\u0161e",750,107],["ostaja\u0161e",750,108],["inja\u0161e",750,114],["ka\u0161e",747,62],["na\u0161e",747,60],["ira\u0161e",747,100],["ura\u0161e",747,105],["ta\u0161e",747,59],["va\u0161e",747,65],["ava\u0161e",760,97],["eva\u0161e",760,96],["iva\u0161e",760,98],["ova\u0161e",760,76],["uva\u0161e",760,99],["a\u010da\u0161e",747,102],["e\u0161e",-1,66],["i\u0161e",-1,67],["jeti\u0161e",768,118],["a\u010di\u0161e",768,101], +["lu\u010di\u0161e",768,117],["ro\u0161i\u0161e",768,90],["o\u0161e",-1,91],["astado\u0161e",773,110],["istado\u0161e",773,111],["ostado\u0161e",773,112],["aceg",-1,124],["eceg",-1,125],["uceg",-1,126],["anjijeg",-1,84],["enjijeg",-1,85],["snjijeg",-1,122],["\u0161njijeg",-1,86],["kijeg",-1,95],["skijeg",784,1],["\u0161kijeg",784,2],["elijeg",-1,83],["nijeg",-1,13],["osijeg",-1,123],["atijeg",-1,120],["evitijeg",-1,92],["ovitijeg",-1,93],["astijeg",-1,94],["avijeg",-1,77],["evijeg",-1,78],["ivijeg", 
+-1,79],["ovijeg",-1,80],["o\u0161ijeg",-1,91],["anjeg",-1,84],["enjeg",-1,85],["snjeg",-1,122],["\u0161njeg",-1,86],["keg",-1,95],["eleg",-1,83],["neg",-1,13],["aneg",805,10],["eneg",805,87],["sneg",805,159],["\u0161neg",805,88],["oseg",-1,123],["ateg",-1,120],["aveg",-1,77],["eveg",-1,78],["iveg",-1,79],["oveg",-1,80],["a\u0107eg",-1,14],["e\u0107eg",-1,15],["u\u0107eg",-1,16],["o\u0161eg",-1,91],["acog",-1,124],["ecog",-1,125],["ucog",-1,126],["anjog",-1,84],["enjog",-1,85],["snjog",-1,122],["\u0161njog", +-1,86],["kog",-1,95],["skog",827,1],["\u0161kog",827,2],["elog",-1,83],["nog",-1,13],["cinog",831,137],["\u010dinog",831,89],["osog",-1,123],["atog",-1,120],["evitog",-1,92],["ovitog",-1,93],["astog",-1,94],["avog",-1,77],["evog",-1,78],["ivog",-1,79],["ovog",-1,80],["a\u0107og",-1,14],["e\u0107og",-1,15],["u\u0107og",-1,16],["o\u0161og",-1,91],["ah",-1,104],["acah",847,128],["astajah",847,106],["istajah",847,107],["ostajah",847,108],["injah",847,114],["irah",847,100],["urah",847,105],["tah",847, +113],["avah",847,97],["evah",847,96],["ivah",847,98],["ovah",847,76],["uvah",847,99],["a\u010dah",847,102],["ih",-1,116],["acih",862,124],["ecih",862,125],["ucih",862,126],["lucih",865,121],["anjijih",862,84],["enjijih",862,85],["snjijih",862,122],["\u0161njijih",862,86],["kijih",862,95],["skijih",871,1],["\u0161kijih",871,2],["elijih",862,83],["nijih",862,13],["osijih",862,123],["atijih",862,120],["evitijih",862,92],["ovitijih",862,93],["astijih",862,94],["avijih",862,77],["evijih",862,78],["ivijih", 
+862,79],["ovijih",862,80],["o\u0161ijih",862,91],["anjih",862,84],["enjih",862,85],["snjih",862,122],["\u0161njih",862,86],["kih",862,95],["skih",890,1],["\u0161kih",890,2],["elih",862,83],["nih",862,13],["cinih",894,137],["\u010dinih",894,89],["osih",862,123],["rosih",897,127],["atih",862,120],["jetih",862,118],["evitih",862,92],["ovitih",862,93],["astih",862,94],["avih",862,77],["evih",862,78],["ivih",862,79],["ovih",862,80],["a\u0107ih",862,14],["e\u0107ih",862,15],["u\u0107ih",862,16],["a\u010dih", +862,101],["lu\u010dih",862,117],["o\u0161ih",862,91],["ro\u0161ih",913,90],["astadoh",-1,110],["istadoh",-1,111],["ostadoh",-1,112],["acuh",-1,124],["ecuh",-1,125],["ucuh",-1,126],["a\u0107uh",-1,14],["e\u0107uh",-1,15],["u\u0107uh",-1,16],["aci",-1,124],["aceci",-1,124],["ieci",-1,162],["ajuci",-1,161],["irajuci",927,155],["urajuci",927,156],["astajuci",927,138],["istajuci",927,139],["ostajuci",927,140],["avajuci",927,144],["evajuci",927,145],["ivajuci",927,146],["uvajuci",927,147],["ujuci",-1,157], +["lucujuci",937,121],["irujuci",937,155],["luci",-1,121],["nuci",-1,164],["etuci",-1,153],["astuci",-1,136],["gi",-1,20],["ugi",944,18],["aji",-1,109],["caji",946,26],["laji",946,30],["raji",946,31],["\u0107aji",946,28],["\u010daji",946,27],["\u0111aji",946,29],["biji",-1,32],["ciji",-1,33],["diji",-1,34],["fiji",-1,40],["giji",-1,39],["anjiji",-1,84],["enjiji",-1,85],["snjiji",-1,122],["\u0161njiji",-1,86],["kiji",-1,95],["skiji",962,1],["\u0161kiji",962,2],["liji",-1,35],["eliji",965,83],["miji", 
+-1,37],["niji",-1,13],["ganiji",968,9],["maniji",968,6],["paniji",968,7],["raniji",968,8],["taniji",968,5],["piji",-1,41],["riji",-1,42],["siji",-1,43],["osiji",976,123],["tiji",-1,44],["atiji",978,120],["evitiji",978,92],["ovitiji",978,93],["astiji",978,94],["aviji",-1,77],["eviji",-1,78],["iviji",-1,79],["oviji",-1,80],["ziji",-1,45],["o\u0161iji",-1,91],["\u017eiji",-1,38],["anji",-1,84],["enji",-1,85],["snji",-1,122],["\u0161nji",-1,86],["ki",-1,95],["ski",994,1],["\u0161ki",994,2],["ali",-1, +104],["acali",997,128],["astajali",997,106],["istajali",997,107],["ostajali",997,108],["ijali",997,47],["injali",997,114],["nali",997,46],["irali",997,100],["urali",997,105],["tali",997,113],["astali",1007,110],["istali",1007,111],["ostali",1007,112],["avali",997,97],["evali",997,96],["ivali",997,98],["ovali",997,76],["uvali",997,99],["a\u010dali",997,102],["eli",-1,83],["ili",-1,116],["acili",1018,124],["lucili",1018,121],["nili",1018,103],["rosili",1018,127],["jetili",1018,118],["ozili",1018,48], +["a\u010dili",1018,101],["lu\u010dili",1018,117],["ro\u0161ili",1018,90],["oli",-1,50],["asli",-1,115],["nuli",-1,13],["rami",-1,52],["lemi",-1,51],["ni",-1,13],["ani",1033,10],["acani",1034,128],["urani",1034,105],["tani",1034,113],["avani",1034,97],["evani",1034,96],["ivani",1034,98],["uvani",1034,99],["a\u010dani",1034,102],["aceni",1033,124],["luceni",1033,121],["a\u010deni",1033,101],["lu\u010deni",1033,117],["ini",1033,11],["cini",1047,137],["\u010dini",1047,89],["oni",1033,12],["ari",-1,53], 
+["dri",-1,54],["eri",-1,55],["ori",-1,56],["basi",-1,135],["gasi",-1,131],["jasi",-1,129],["kasi",-1,133],["nasi",-1,132],["tasi",-1,130],["vasi",-1,134],["esi",-1,152],["isi",-1,154],["osi",-1,123],["avsi",-1,161],["acavsi",1065,128],["iravsi",1065,155],["tavsi",1065,160],["etavsi",1068,153],["astavsi",1068,141],["istavsi",1068,142],["ostavsi",1068,143],["ivsi",-1,162],["nivsi",1073,158],["rosivsi",1073,127],["nuvsi",-1,164],["ati",-1,104],["acati",1077,128],["astajati",1077,106],["istajati",1077, +107],["ostajati",1077,108],["injati",1077,114],["ikati",1077,68],["lati",1077,69],["irati",1077,100],["urati",1077,105],["tati",1077,113],["astati",1087,110],["istati",1087,111],["ostati",1087,112],["avati",1077,97],["evati",1077,96],["ivati",1077,98],["ovati",1077,76],["uvati",1077,99],["a\u010dati",1077,102],["eti",-1,70],["iti",-1,116],["aciti",1098,124],["luciti",1098,121],["niti",1098,103],["rositi",1098,127],["jetiti",1098,118],["eviti",1098,92],["oviti",1098,93],["a\u010diti",1098,101],["lu\u010diti", +1098,117],["ro\u0161iti",1098,90],["asti",-1,94],["esti",-1,71],["isti",-1,72],["ksti",-1,73],["osti",-1,74],["nuti",-1,13],["avi",-1,77],["evi",-1,78],["ajevi",1116,109],["cajevi",1117,26],["lajevi",1117,30],["rajevi",1117,31],["\u0107ajevi",1117,28],["\u010dajevi",1117,27],["\u0111ajevi",1117,29],["ivi",-1,79],["ovi",-1,80],["govi",1125,20],["ugovi",1126,17],["lovi",1125,82],["olovi",1128,49],["movi",1125,81],["onovi",1125,12],["ie\u0107i",-1,116],["a\u010de\u0107i",-1,101],["aju\u0107i",-1,104], 
+["iraju\u0107i",1134,100],["uraju\u0107i",1134,105],["astaju\u0107i",1134,106],["istaju\u0107i",1134,107],["ostaju\u0107i",1134,108],["avaju\u0107i",1134,97],["evaju\u0107i",1134,96],["ivaju\u0107i",1134,98],["uvaju\u0107i",1134,99],["uju\u0107i",-1,25],["iruju\u0107i",1144,100],["lu\u010duju\u0107i",1144,117],["nu\u0107i",-1,13],["etu\u0107i",-1,70],["astu\u0107i",-1,115],["a\u010di",-1,101],["lu\u010di",-1,117],["ba\u0161i",-1,63],["ga\u0161i",-1,64],["ja\u0161i",-1,61],["ka\u0161i",-1,62],["na\u0161i", +-1,60],["ta\u0161i",-1,59],["va\u0161i",-1,65],["e\u0161i",-1,66],["i\u0161i",-1,67],["o\u0161i",-1,91],["av\u0161i",-1,104],["irav\u0161i",1162,100],["tav\u0161i",1162,113],["etav\u0161i",1164,70],["astav\u0161i",1164,110],["istav\u0161i",1164,111],["ostav\u0161i",1164,112],["a\u010dav\u0161i",1162,102],["iv\u0161i",-1,116],["niv\u0161i",1170,103],["ro\u0161iv\u0161i",1170,90],["nuv\u0161i",-1,13],["aj",-1,104],["uraj",1174,105],["taj",1174,113],["avaj",1174,97],["evaj",1174,96],["ivaj",1174,98], +["uvaj",1174,99],["ij",-1,116],["acoj",-1,124],["ecoj",-1,125],["ucoj",-1,126],["anjijoj",-1,84],["enjijoj",-1,85],["snjijoj",-1,122],["\u0161njijoj",-1,86],["kijoj",-1,95],["skijoj",1189,1],["\u0161kijoj",1189,2],["elijoj",-1,83],["nijoj",-1,13],["osijoj",-1,123],["evitijoj",-1,92],["ovitijoj",-1,93],["astijoj",-1,94],["avijoj",-1,77],["evijoj",-1,78],["ivijoj",-1,79],["ovijoj",-1,80],["o\u0161ijoj",-1,91],["anjoj",-1,84],["enjoj",-1,85],["snjoj",-1,122],["\u0161njoj",-1,86],["koj",-1,95],["skoj", 
+1207,1],["\u0161koj",1207,2],["aloj",-1,104],["eloj",-1,83],["noj",-1,13],["cinoj",1212,137],["\u010dinoj",1212,89],["osoj",-1,123],["atoj",-1,120],["evitoj",-1,92],["ovitoj",-1,93],["astoj",-1,94],["avoj",-1,77],["evoj",-1,78],["ivoj",-1,79],["ovoj",-1,80],["a\u0107oj",-1,14],["e\u0107oj",-1,15],["u\u0107oj",-1,16],["o\u0161oj",-1,91],["lucuj",-1,121],["iruj",-1,100],["lu\u010duj",-1,117],["al",-1,104],["iral",1231,100],["ural",1231,105],["el",-1,119],["il",-1,116],["am",-1,104],["acam",1236,128], +["iram",1236,100],["uram",1236,105],["tam",1236,113],["avam",1236,97],["evam",1236,96],["ivam",1236,98],["uvam",1236,99],["a\u010dam",1236,102],["em",-1,119],["acem",1246,124],["ecem",1246,125],["ucem",1246,126],["astadem",1246,110],["istadem",1246,111],["ostadem",1246,112],["ajem",1246,104],["cajem",1253,26],["lajem",1253,30],["rajem",1253,31],["astajem",1253,106],["istajem",1253,107],["ostajem",1253,108],["\u0107ajem",1253,28],["\u010dajem",1253,27],["\u0111ajem",1253,29],["ijem",1246,116],["anjijem", +1263,84],["enjijem",1263,85],["snjijem",1263,123],["\u0161njijem",1263,86],["kijem",1263,95],["skijem",1268,1],["\u0161kijem",1268,2],["lijem",1263,24],["elijem",1271,83],["nijem",1263,13],["rarijem",1263,21],["sijem",1263,23],["osijem",1275,123],["atijem",1263,120],["evitijem",1263,92],["ovitijem",1263,93],["otijem",1263,22],["astijem",1263,94],["avijem",1263,77],["evijem",1263,78],["ivijem",1263,79],["ovijem",1263,80],["o\u0161ijem",1263,91],["anjem",1246,84],["enjem",1246,85],["injem",1246,114], 
+["snjem",1246,122],["\u0161njem",1246,86],["ujem",1246,25],["lucujem",1292,121],["irujem",1292,100],["lu\u010dujem",1292,117],["kem",1246,95],["skem",1296,1],["\u0161kem",1296,2],["elem",1246,83],["nem",1246,13],["anem",1300,10],["astanem",1301,110],["istanem",1301,111],["ostanem",1301,112],["enem",1300,87],["snem",1300,159],["\u0161nem",1300,88],["basem",1246,135],["gasem",1246,131],["jasem",1246,129],["kasem",1246,133],["nasem",1246,132],["tasem",1246,130],["vasem",1246,134],["esem",1246,152],["isem", +1246,154],["osem",1246,123],["atem",1246,120],["etem",1246,70],["evitem",1246,92],["ovitem",1246,93],["astem",1246,94],["istem",1246,151],["i\u0161tem",1246,75],["avem",1246,77],["evem",1246,78],["ivem",1246,79],["a\u0107em",1246,14],["e\u0107em",1246,15],["u\u0107em",1246,16],["ba\u0161em",1246,63],["ga\u0161em",1246,64],["ja\u0161em",1246,61],["ka\u0161em",1246,62],["na\u0161em",1246,60],["ta\u0161em",1246,59],["va\u0161em",1246,65],["e\u0161em",1246,66],["i\u0161em",1246,67],["o\u0161em",1246, +91],["im",-1,116],["acim",1341,124],["ecim",1341,125],["ucim",1341,126],["lucim",1344,121],["anjijim",1341,84],["enjijim",1341,85],["snjijim",1341,122],["\u0161njijim",1341,86],["kijim",1341,95],["skijim",1350,1],["\u0161kijim",1350,2],["elijim",1341,83],["nijim",1341,13],["osijim",1341,123],["atijim",1341,120],["evitijim",1341,92],["ovitijim",1341,93],["astijim",1341,94],["avijim",1341,77],["evijim",1341,78],["ivijim",1341,79],["ovijim",1341,80],["o\u0161ijim",1341,91],["anjim",1341,84],["enjim", 
+1341,85],["snjim",1341,122],["\u0161njim",1341,86],["kim",1341,95],["skim",1369,1],["\u0161kim",1369,2],["elim",1341,83],["nim",1341,13],["cinim",1373,137],["\u010dinim",1373,89],["osim",1341,123],["rosim",1376,127],["atim",1341,120],["jetim",1341,118],["evitim",1341,92],["ovitim",1341,93],["astim",1341,94],["avim",1341,77],["evim",1341,78],["ivim",1341,79],["ovim",1341,80],["a\u0107im",1341,14],["e\u0107im",1341,15],["u\u0107im",1341,16],["a\u010dim",1341,101],["lu\u010dim",1341,117],["o\u0161im", +1341,91],["ro\u0161im",1392,90],["acom",-1,124],["ecom",-1,125],["ucom",-1,126],["gom",-1,20],["logom",1397,19],["ugom",1397,18],["bijom",-1,32],["cijom",-1,33],["dijom",-1,34],["fijom",-1,40],["gijom",-1,39],["lijom",-1,35],["mijom",-1,37],["nijom",-1,36],["ganijom",1407,9],["manijom",1407,6],["panijom",1407,7],["ranijom",1407,8],["tanijom",1407,5],["pijom",-1,41],["rijom",-1,42],["sijom",-1,43],["tijom",-1,44],["zijom",-1,45],["\u017eijom",-1,38],["anjom",-1,84],["enjom",-1,85],["snjom",-1,122], +["\u0161njom",-1,86],["kom",-1,95],["skom",1423,1],["\u0161kom",1423,2],["alom",-1,104],["ijalom",1426,47],["nalom",1426,46],["elom",-1,83],["ilom",-1,116],["ozilom",1430,48],["olom",-1,50],["ramom",-1,52],["lemom",-1,51],["nom",-1,13],["anom",1435,10],["inom",1435,11],["cinom",1437,137],["aninom",1437,10],["\u010dinom",1437,89],["onom",1435,12],["arom",-1,53],["drom",-1,54],["erom",-1,55],["orom",-1,56],["basom",-1,135],["gasom",-1,131],["jasom",-1,129],["kasom",-1,133],["nasom",-1,132],["tasom", 
+-1,130],["vasom",-1,134],["esom",-1,57],["isom",-1,58],["osom",-1,123],["atom",-1,120],["ikatom",1456,68],["latom",1456,69],["etom",-1,70],["evitom",-1,92],["ovitom",-1,93],["astom",-1,94],["estom",-1,71],["istom",-1,72],["kstom",-1,73],["ostom",-1,74],["avom",-1,77],["evom",-1,78],["ivom",-1,79],["ovom",-1,80],["lovom",1470,82],["movom",1470,81],["stvom",-1,3],["\u0161tvom",-1,4],["a\u0107om",-1,14],["e\u0107om",-1,15],["u\u0107om",-1,16],["ba\u0161om",-1,63],["ga\u0161om",-1,64],["ja\u0161om",-1, +61],["ka\u0161om",-1,62],["na\u0161om",-1,60],["ta\u0161om",-1,59],["va\u0161om",-1,65],["e\u0161om",-1,66],["i\u0161om",-1,67],["o\u0161om",-1,91],["an",-1,104],["acan",1488,128],["iran",1488,100],["uran",1488,105],["tan",1488,113],["avan",1488,97],["evan",1488,96],["ivan",1488,98],["uvan",1488,99],["a\u010dan",1488,102],["acen",-1,124],["lucen",-1,121],["a\u010den",-1,101],["lu\u010den",-1,117],["anin",-1,10],["ao",-1,104],["acao",1503,128],["astajao",1503,106],["istajao",1503,107],["ostajao",1503, +108],["injao",1503,114],["irao",1503,100],["urao",1503,105],["tao",1503,113],["astao",1511,110],["istao",1511,111],["ostao",1511,112],["avao",1503,97],["evao",1503,96],["ivao",1503,98],["ovao",1503,76],["uvao",1503,99],["a\u010dao",1503,102],["go",-1,20],["ugo",1521,18],["io",-1,116],["acio",1523,124],["lucio",1523,121],["lio",1523,24],["nio",1523,103],["rario",1523,21],["sio",1523,23],["rosio",1529,127],["jetio",1523,118],["otio",1523,22],["a\u010dio",1523,101],["lu\u010dio",1523,117],["ro\u0161io", 
+1523,90],["bijo",-1,32],["cijo",-1,33],["dijo",-1,34],["fijo",-1,40],["gijo",-1,39],["lijo",-1,35],["mijo",-1,37],["nijo",-1,36],["pijo",-1,41],["rijo",-1,42],["sijo",-1,43],["tijo",-1,44],["zijo",-1,45],["\u017eijo",-1,38],["anjo",-1,84],["enjo",-1,85],["snjo",-1,122],["\u0161njo",-1,86],["ko",-1,95],["sko",1554,1],["\u0161ko",1554,2],["alo",-1,104],["acalo",1557,128],["astajalo",1557,106],["istajalo",1557,107],["ostajalo",1557,108],["ijalo",1557,47],["injalo",1557,114],["nalo",1557,46],["iralo", +1557,100],["uralo",1557,105],["talo",1557,113],["astalo",1567,110],["istalo",1567,111],["ostalo",1567,112],["avalo",1557,97],["evalo",1557,96],["ivalo",1557,98],["ovalo",1557,76],["uvalo",1557,99],["a\u010dalo",1557,102],["elo",-1,83],["ilo",-1,116],["acilo",1578,124],["lucilo",1578,121],["nilo",1578,103],["rosilo",1578,127],["jetilo",1578,118],["a\u010dilo",1578,101],["lu\u010dilo",1578,117],["ro\u0161ilo",1578,90],["aslo",-1,115],["nulo",-1,13],["amo",-1,104],["acamo",1589,128],["ramo",1589,52], +["iramo",1591,100],["uramo",1591,105],["tamo",1589,113],["avamo",1589,97],["evamo",1589,96],["ivamo",1589,98],["uvamo",1589,99],["a\u010damo",1589,102],["emo",-1,119],["astademo",1600,110],["istademo",1600,111],["ostademo",1600,112],["astajemo",1600,106],["istajemo",1600,107],["ostajemo",1600,108],["ijemo",1600,116],["injemo",1600,114],["ujemo",1600,25],["lucujemo",1609,121],["irujemo",1609,100],["lu\u010dujemo",1609,117],["lemo",1600,51],["nemo",1600,13],["astanemo",1614,110],["istanemo",1614,111], 
+["ostanemo",1614,112],["etemo",1600,70],["astemo",1600,115],["imo",-1,116],["acimo",1620,124],["lucimo",1620,121],["nimo",1620,13],["astanimo",1623,110],["istanimo",1623,111],["ostanimo",1623,112],["rosimo",1620,127],["etimo",1620,70],["jetimo",1628,118],["astimo",1620,115],["a\u010dimo",1620,101],["lu\u010dimo",1620,117],["ro\u0161imo",1620,90],["ajmo",-1,104],["urajmo",1634,105],["tajmo",1634,113],["astajmo",1636,106],["istajmo",1636,107],["ostajmo",1636,108],["avajmo",1634,97],["evajmo",1634,96], +["ivajmo",1634,98],["uvajmo",1634,99],["ijmo",-1,116],["ujmo",-1,25],["lucujmo",1645,121],["irujmo",1645,100],["lu\u010dujmo",1645,117],["asmo",-1,104],["acasmo",1649,128],["astajasmo",1649,106],["istajasmo",1649,107],["ostajasmo",1649,108],["injasmo",1649,114],["irasmo",1649,100],["urasmo",1649,105],["tasmo",1649,113],["avasmo",1649,97],["evasmo",1649,96],["ivasmo",1649,98],["ovasmo",1649,76],["uvasmo",1649,99],["a\u010dasmo",1649,102],["ismo",-1,116],["acismo",1664,124],["lucismo",1664,121],["nismo", +1664,103],["rosismo",1664,127],["jetismo",1664,118],["a\u010dismo",1664,101],["lu\u010dismo",1664,117],["ro\u0161ismo",1664,90],["astadosmo",-1,110],["istadosmo",-1,111],["ostadosmo",-1,112],["nusmo",-1,13],["no",-1,13],["ano",1677,104],["acano",1678,128],["urano",1678,105],["tano",1678,113],["avano",1678,97],["evano",1678,96],["ivano",1678,98],["uvano",1678,99],["a\u010dano",1678,102],["aceno",1677,124],["luceno",1677,121],["a\u010deno",1677,101],["lu\u010deno",1677,117],["ino",1677,11],["cino", 
+1691,137],["\u010dino",1691,89],["ato",-1,120],["ikato",1694,68],["lato",1694,69],["eto",-1,70],["evito",-1,92],["ovito",-1,93],["asto",-1,94],["esto",-1,71],["isto",-1,72],["ksto",-1,73],["osto",-1,74],["nuto",-1,13],["nuo",-1,13],["avo",-1,77],["evo",-1,78],["ivo",-1,79],["ovo",-1,80],["stvo",-1,3],["\u0161tvo",-1,4],["as",-1,161],["acas",1713,128],["iras",1713,155],["uras",1713,156],["tas",1713,160],["avas",1713,144],["evas",1713,145],["ivas",1713,146],["uvas",1713,147],["es",-1,163],["astades", +1722,141],["istades",1722,142],["ostades",1722,143],["astajes",1722,138],["istajes",1722,139],["ostajes",1722,140],["ijes",1722,162],["injes",1722,150],["ujes",1722,157],["lucujes",1731,121],["irujes",1731,155],["nes",1722,164],["astanes",1734,141],["istanes",1734,142],["ostanes",1734,143],["etes",1722,153],["astes",1722,136],["is",-1,162],["acis",1740,124],["lucis",1740,121],["nis",1740,158],["rosis",1740,127],["jetis",1740,149],["at",-1,104],["acat",1746,128],["astajat",1746,106],["istajat",1746, +107],["ostajat",1746,108],["injat",1746,114],["irat",1746,100],["urat",1746,105],["tat",1746,113],["astat",1754,110],["istat",1754,111],["ostat",1754,112],["avat",1746,97],["evat",1746,96],["ivat",1746,98],["irivat",1760,100],["ovat",1746,76],["uvat",1746,99],["a\u010dat",1746,102],["it",-1,116],["acit",1765,124],["lucit",1765,121],["rosit",1765,127],["jetit",1765,118],["a\u010dit",1765,101],["lu\u010dit",1765,117],["ro\u0161it",1765,90],["nut",-1,13],["astadu",-1,110],["istadu",-1,111],["ostadu", 
+-1,112],["gu",-1,20],["logu",1777,19],["ugu",1777,18],["ahu",-1,104],["acahu",1780,128],["astajahu",1780,106],["istajahu",1780,107],["ostajahu",1780,108],["injahu",1780,114],["irahu",1780,100],["urahu",1780,105],["avahu",1780,97],["evahu",1780,96],["ivahu",1780,98],["ovahu",1780,76],["uvahu",1780,99],["a\u010dahu",1780,102],["aju",-1,104],["caju",1794,26],["acaju",1795,128],["laju",1794,30],["raju",1794,31],["iraju",1798,100],["uraju",1798,105],["taju",1794,113],["astaju",1801,106],["istaju",1801, +107],["ostaju",1801,108],["avaju",1794,97],["evaju",1794,96],["ivaju",1794,98],["uvaju",1794,99],["\u0107aju",1794,28],["\u010daju",1794,27],["a\u010daju",1810,102],["\u0111aju",1794,29],["iju",-1,116],["biju",1813,32],["ciju",1813,33],["diju",1813,34],["fiju",1813,40],["giju",1813,39],["anjiju",1813,84],["enjiju",1813,85],["snjiju",1813,122],["\u0161njiju",1813,86],["kiju",1813,95],["liju",1813,24],["eliju",1824,83],["miju",1813,37],["niju",1813,13],["ganiju",1827,9],["maniju",1827,6],["paniju", +1827,7],["raniju",1827,8],["taniju",1827,5],["piju",1813,41],["riju",1813,42],["rariju",1834,21],["siju",1813,23],["osiju",1836,123],["tiju",1813,44],["atiju",1838,120],["otiju",1838,22],["aviju",1813,77],["eviju",1813,78],["iviju",1813,79],["oviju",1813,80],["ziju",1813,45],["o\u0161iju",1813,91],["\u017eiju",1813,38],["anju",-1,84],["enju",-1,85],["snju",-1,122],["\u0161nju",-1,86],["uju",-1,25],["lucuju",1852,121],["iruju",1852,100],["lu\u010duju",1852,117],["ku",-1,95],["sku",1856,1],["\u0161ku", 
+1856,2],["alu",-1,104],["ijalu",1859,47],["nalu",1859,46],["elu",-1,83],["ilu",-1,116],["ozilu",1863,48],["olu",-1,50],["ramu",-1,52],["acemu",-1,124],["ecemu",-1,125],["ucemu",-1,126],["anjijemu",-1,84],["enjijemu",-1,85],["snjijemu",-1,122],["\u0161njijemu",-1,86],["kijemu",-1,95],["skijemu",1874,1],["\u0161kijemu",1874,2],["elijemu",-1,83],["nijemu",-1,13],["osijemu",-1,123],["atijemu",-1,120],["evitijemu",-1,92],["ovitijemu",-1,93],["astijemu",-1,94],["avijemu",-1,77],["evijemu",-1,78],["ivijemu", +-1,79],["ovijemu",-1,80],["o\u0161ijemu",-1,91],["anjemu",-1,84],["enjemu",-1,85],["snjemu",-1,122],["\u0161njemu",-1,86],["kemu",-1,95],["skemu",1893,1],["\u0161kemu",1893,2],["lemu",-1,51],["elemu",1896,83],["nemu",-1,13],["anemu",1898,10],["enemu",1898,87],["snemu",1898,159],["\u0161nemu",1898,88],["osemu",-1,123],["atemu",-1,120],["evitemu",-1,92],["ovitemu",-1,93],["astemu",-1,94],["avemu",-1,77],["evemu",-1,78],["ivemu",-1,79],["ovemu",-1,80],["a\u0107emu",-1,14],["e\u0107emu",-1,15],["u\u0107emu", +-1,16],["o\u0161emu",-1,91],["acomu",-1,124],["ecomu",-1,125],["ucomu",-1,126],["anjomu",-1,84],["enjomu",-1,85],["snjomu",-1,122],["\u0161njomu",-1,86],["komu",-1,95],["skomu",1923,1],["\u0161komu",1923,2],["elomu",-1,83],["nomu",-1,13],["cinomu",1927,137],["\u010dinomu",1927,89],["osomu",-1,123],["atomu",-1,120],["evitomu",-1,92],["ovitomu",-1,93],["astomu",-1,94],["avomu",-1,77],["evomu",-1,78],["ivomu",-1,79],["ovomu",-1,80],["a\u0107omu",-1,14],["e\u0107omu",-1,15],["u\u0107omu",-1,16],["o\u0161omu", 
+-1,91],["nu",-1,13],["anu",1943,10],["astanu",1944,110],["istanu",1944,111],["ostanu",1944,112],["inu",1943,11],["cinu",1948,137],["aninu",1948,10],["\u010dinu",1948,89],["onu",1943,12],["aru",-1,53],["dru",-1,54],["eru",-1,55],["oru",-1,56],["basu",-1,135],["gasu",-1,131],["jasu",-1,129],["kasu",-1,133],["nasu",-1,132],["tasu",-1,130],["vasu",-1,134],["esu",-1,57],["isu",-1,58],["osu",-1,123],["atu",-1,120],["ikatu",1967,68],["latu",1967,69],["etu",-1,70],["evitu",-1,92],["ovitu",-1,93],["astu", +-1,94],["estu",-1,71],["istu",-1,72],["kstu",-1,73],["ostu",-1,74],["i\u0161tu",-1,75],["avu",-1,77],["evu",-1,78],["ivu",-1,79],["ovu",-1,80],["lovu",1982,82],["movu",1982,81],["stvu",-1,3],["\u0161tvu",-1,4],["ba\u0161u",-1,63],["ga\u0161u",-1,64],["ja\u0161u",-1,61],["ka\u0161u",-1,62],["na\u0161u",-1,60],["ta\u0161u",-1,59],["va\u0161u",-1,65],["e\u0161u",-1,66],["i\u0161u",-1,67],["o\u0161u",-1,91],["avav",-1,97],["evav",-1,96],["ivav",-1,98],["uvav",-1,99],["kov",-1,95],["a\u0161",-1,104],["ira\u0161", +2002,100],["ura\u0161",2002,105],["ta\u0161",2002,113],["ava\u0161",2002,97],["eva\u0161",2002,96],["iva\u0161",2002,98],["uva\u0161",2002,99],["a\u010da\u0161",2002,102],["e\u0161",-1,119],["astade\u0161",2011,110],["istade\u0161",2011,111],["ostade\u0161",2011,112],["astaje\u0161",2011,106],["istaje\u0161",2011,107],["ostaje\u0161",2011,108],["ije\u0161",2011,116],["inje\u0161",2011,114],["uje\u0161",2011,25],["iruje\u0161",2020,100],["lu\u010duje\u0161",2020,117],["ne\u0161",2011,13],["astane\u0161", 
+2023,110],["istane\u0161",2023,111],["ostane\u0161",2023,112],["ete\u0161",2011,70],["aste\u0161",2011,115],["i\u0161",-1,116],["ni\u0161",2029,103],["jeti\u0161",2029,118],["a\u010di\u0161",2029,101],["lu\u010di\u0161",2029,117],["ro\u0161i\u0161",2029,90]],e=[["a",-1,1],["oga",0,1],["ama",0,1],["ima",0,1],["ena",0,1],["e",-1,1],["og",-1,1],["anog",6,1],["enog",6,1],["anih",-1,1],["enih",-1,1],["i",-1,1],["ani",11,1],["eni",11,1],["anoj",-1,1],["enoj",-1,1],["anim",-1,1],["enim",-1,1],["om",-1,1], +["enom",18,1],["o",-1,1],["ano",20,1],["eno",20,1],["ost",-1,1],["u",-1,1],["enu",24,1]],h=[17,65,16],u=[65,4,0,0,0,0,0,0,0,0,0,4,0,0,128],t=[119,95,23,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,136,0,0,0,0,0,0,0,0,0,128,0,0,0,16],s=[1],r=0,p=q;this.m=function(){g();d();p=f;var b=a.cursor;a:{b:for(;;){if(a.i(u,263,382))break b;if(a.cursor>=a.a)break a;a.cursor++}p=q}a.cursor=b;r=a.a;b=a.cursor;a:{b:for(;;){if(a.i(h,97,117))break b;if(a.cursor>=a.a)break a;a.cursor++}r=a.cursor;if(!(2<=r)){b:for(;;){if(a.k(h, +97,117))break b;if(a.cursor>=a.a)break a;a.cursor++}r=a.cursor}}a.cursor=b;b=a.cursor;a:{b:for(;;){if(a.n("r"))break b;if(a.cursor>=a.a)break a;a.cursor++}b:{if(!(2>a.cursor))break b;a.cursor=a.cursor;c:for(;;){if(a.k(s,114,114))break c;if(a.cursor>=a.a)break a;a.cursor++}}1>=r-a.cursor||(r=a.cursor)}a.cursor=b;a.f=a.cursor;a.cursor=a.a;var b=a.a-a.cursor,n;a.d=a.cursor;n=a.g(l);if(0!=n)switch(a.c=a.cursor,n){case 1:if(!a.b("loga"))break;break;case 2:if(!a.b("peh"))break;break;case 3:if(!a.b("vojka"))break; +break;case 4:if(!a.b("bojka"))break;break;case 5:if(!a.b("jak"))break;break;case 6:if(!a.b("\u010dajni"))break;break;case 7:if(!p||!a.b("cajni"))break;break;case 8:if(!a.b("erni"))break;break;case 9:if(!a.b("larni"))break;break;case 10:if(!a.b("esni"))break;break;case 11:if(!a.b("anjca"))break;break;case 12:if(!a.b("ajca"))break;break;case 13:if(!a.b("ljca"))break;break;case 14:if(!a.b("ejca"))break;break;case 15:if(!a.b("ojca"))break;break;case 
16:if(!a.b("ajka"))break;break;case 17:if(!a.b("ojka"))break; +break;case 18:if(!a.b("\u0161ca"))break;break;case 19:if(!a.b("ing"))break;break;case 20:if(!a.b("tvenik"))break;break;case 21:if(!a.b("tetika"))break;break;case 22:if(!a.b("nstva"))break;break;case 23:if(!a.b("nik"))break;break;case 24:if(!a.b("tik"))break;break;case 25:if(!a.b("zik"))break;break;case 26:if(!a.b("snik"))break;break;case 27:if(!a.b("kusi"))break;break;case 28:if(!a.b("kusni"))break;break;case 29:if(!a.b("kustva"))break;break;case 30:if(!a.b("du\u0161ni"))break;break;case 31:if(!p|| +!a.b("dusni"))break;break;case 32:if(!a.b("antni"))break;break;case 33:if(!a.b("bilni"))break;break;case 34:if(!a.b("tilni"))break;break;case 35:if(!a.b("avilni"))break;break;case 36:if(!a.b("silni"))break;break;case 37:if(!a.b("gilni"))break;break;case 38:if(!a.b("rilni"))break;break;case 39:if(!a.b("nilni"))break;break;case 40:if(!a.b("alni"))break;break;case 41:if(!a.b("ozni"))break;break;case 42:if(!a.b("ravi"))break;break;case 43:if(!a.b("stavni"))break;break;case 44:if(!a.b("pravni"))break; +break;case 45:if(!a.b("tivni"))break;break;case 46:if(!a.b("sivni"))break;break;case 47:if(!a.b("atni"))break;break;case 48:if(!a.b("enta"))break;break;case 49:if(!a.b("tetni"))break;break;case 50:if(!a.b("pletni"))break;break;case 51:if(!a.b("\u0161avi"))break;break;case 52:if(!p||!a.b("savi"))break;break;case 53:if(!a.b("anta"))break;break;case 54:if(!a.b("a\u010dka"))break;break;case 55:if(!p||!a.b("acka"))break;break;case 56:if(!a.b("u\u0161ka"))break;break;case 57:if(!p||!a.b("uska"))break;break; +case 58:if(!a.b("atka"))break;break;case 59:if(!a.b("etka"))break;break;case 60:if(!a.b("itka"))break;break;case 61:if(!a.b("otka"))break;break;case 62:if(!a.b("utka"))break;break;case 63:if(!a.b("eskna"))break;break;case 64:if(!a.b("ti\u010dni"))break;break;case 65:if(!p||!a.b("ticni"))break;break;case 66:if(!a.b("ojska"))break;break;case 67:if(!a.b("esma"))break;break;case 68:if(!a.b("metra"))break;break;case 
69:if(!a.b("centra"))break;break;case 70:if(!a.b("istra"))break;break;case 71:if(!a.b("osti"))break; +break;case 72:if(!p||!a.b("osti"))break;break;case 73:if(!a.b("dba"))break;break;case 74:if(!a.b("\u010dka"))break;break;case 75:if(!a.b("mca"))break;break;case 76:if(!a.b("nca"))break;break;case 77:if(!a.b("voljni"))break;break;case 78:if(!a.b("anki"))break;break;case 79:if(!a.b("vca"))break;break;case 80:if(!a.b("sca"))break;break;case 81:if(!a.b("rca"))break;break;case 82:if(!a.b("alca"))break;break;case 83:if(!a.b("elca"))break;break;case 84:if(!a.b("olca"))break;break;case 85:if(!a.b("njca"))break; +break;case 86:if(!a.b("ekta"))break;break;case 87:if(!a.b("izma"))break;break;case 88:if(!a.b("jebi"))break;break;case 89:if(!a.b("baci"))break;break;case 90:if(!a.b("a\u0161ni"))break;break;case 91:!p||a.b("asni")}a.cursor=a.a-b;b=a.a-a.cursor;b:{n=a.a-a.cursor;if(m())break b;a.cursor=a.a-n;a.d=a.cursor;0!=a.g(e)&&(a.c=a.cursor,!(r<=a.cursor)||a.b(""))}a.cursor=a.a-b;a.cursor=a.f;return f};this.stemWord=function(b){a.p(b);this.m();return a.j}};window.SpanishStemmer=function(){function g(){for(var a;;){var d=b.cursor;a:{b.c=b.cursor;a=b.o(l);b.d=b.cursor;switch(a){case 1:if(!b.b("a"))return;break;case 2:if(!b.b("e"))return;break;case 3:if(!b.b("i"))return;break;case 4:if(!b.b("o"))return;break;case 5:if(!b.b("u"))return;break;case 6:if(b.cursor>=b.a)break a;b.cursor++}continue}b.cursor=d;break}}function d(){return w<=b.cursor}function m(){var a;b.d=b.cursor;a=b.g(s);if(0==a)return q;b.c=b.cursor;switch(a){case 1:if(!d()||!b.e())return q; +break;case 2:if(!d()||!b.e())return q;a=b.a-b.cursor;b.d=b.cursor;if(b.h("ic"))if(b.c=b.cursor,d()){if(!b.e())return q}else b.cursor=b.a-a;else b.cursor=b.a-a;break;case 3:if(!d()||!b.b("log"))return q;break;case 4:if(!d()||!b.b("u"))return q;break;case 5:if(!d()||!b.b("ente"))return q;break;case 6:if(!(y<=b.cursor)||!b.e())return q;var e=b.a-b.cursor;a:if(b.d=b.cursor,a=b.g(h),0==a)b.cursor=b.a-e;else 
if(b.c=b.cursor,d()){if(!b.e())return q;switch(a){case 1:b.d=b.cursor;if(!b.h("at")){b.cursor=b.a- +e;break a}b.c=b.cursor;if(!d()){b.cursor=b.a-e;break a}if(!b.e())return q}}else b.cursor=b.a-e;break;case 7:if(!d()||!b.e())return q;a=b.a-b.cursor;b.d=b.cursor;if(0==b.g(u))b.cursor=b.a-a;else if(b.c=b.cursor,d()){if(!b.e())return q}else b.cursor=b.a-a;break;case 8:if(!d()||!b.e())return q;a=b.a-b.cursor;b.d=b.cursor;if(0==b.g(t))b.cursor=b.a-a;else if(b.c=b.cursor,d()){if(!b.e())return q}else b.cursor=b.a-a;break;case 9:if(!d()||!b.e())return q;a=b.a-b.cursor;b.d=b.cursor;if(b.h("at"))if(b.c=b.cursor, +d()){if(!b.e())return q}else b.cursor=b.a-a;else b.cursor=b.a-a}return f}function a(){if(b.cursor=b.a)break e;b.cursor++}break d}b.cursor=h;if(!b.i(v,97,252))break c;e:for(;;){if(b.k(v,97,252))break e;if(b.cursor>=b.a)break c;b.cursor++}}break b}b.cursor=d;if(!b.k(v,97,252))break a;c:{d=b.cursor;d:if(b.k(v,97,252)){e:for(;;){if(b.i(v,97,252))break e;if(b.cursor>=b.a)break d;b.cursor++}break c}b.cursor=d;if(!b.i(v,97,252))break a;if(b.cursor>=b.a)break a;b.cursor++}}z=b.cursor}b.cursor=c;c=b.cursor;a:{b:for(;;){if(b.i(v, +97,252))break b;if(b.cursor>=b.a)break a;b.cursor++}b:for(;;){if(b.k(v,97,252))break b;if(b.cursor>=b.a)break a;b.cursor++}y=b.cursor;b:for(;;){if(b.i(v,97,252))break b;if(b.cursor>=b.a)break a;b.cursor++}b:for(;;){if(b.k(v,97,252))break b;if(b.cursor>=b.a)break a;b.cursor++}w=b.cursor}b.cursor=c;b.f=b.cursor;b.cursor=b.a;c=b.a-b.cursor;b.d=b.cursor;if(0!=b.g(n)&&(b.c=b.cursor,d=b.g(e),0!=d&&z<=b.cursor))switch(d){case 1:b.c=b.cursor;if(!b.b("iendo"))break;break;case 2:b.c=b.cursor;if(!b.b("ando"))break; +break;case 3:b.c=b.cursor;if(!b.b("ar"))break;break;case 4:b.c=b.cursor;if(!b.b("er"))break;break;case 5:b.c=b.cursor;if(!b.b("ir"))break;break;case 6:if(!b.e())break;break;case 7:!b.h("u")||b.e()}b.cursor=b.a-c;c=b.a-b.cursor;b:{d=b.a-b.cursor;if(m())break b;b.cursor=b.a-d;if(a())break 
b;b.cursor=b.a-d;if(!(b.cursord.a)){d.cursor=s;e=d.cursor;d.cursor=t;b:for(;;){t=d.cursor;if(d.i(l,97,246)){d.cursor=t;break b}d.cursor=t;if(d.cursor>=d.a)break a;d.cursor++}b:for(;;){if(d.k(l,97,246))break b;if(d.cursor>=d.a)break a;d.cursor++}h=d.cursor;h>=e||(h=e)}}d.cursor=b;d.f=d.cursor;d.cursor=d.a;b=d.a-d.cursor;if(!(d.cursor=h.j.length)return q;h.f=h.cursor;h.cursor=h.a;a:{var b=h.a-h.cursor;b:if(h.d=h.cursor,a=h.g(k),0!=a){h.c=h.cursor; +switch(a){case 1:if(!h.e())return q;break;case 2:a=h.a-h.cursor;if(0==h.g(s))break b;h.cursor=h.a-a;if(!h.e())return q;break;case 3:if(!h.b("\u0bb3\u0bcd"))return q;break;case 4:if(!h.b("\u0bb2\u0bcd"))return q;break;case 5:if(!h.b("\u0b9f\u0bc1"))return q;break;case 6:if(!K)break b;a=h.a-h.cursor;if(h.h("\u0bc8"))break b;h.cursor=h.a-a;if(!h.b("\u0bae\u0bcd"))return q;break;case 7:if(!h.b("\u0bcd"))return q;break;case 8:a=h.a-h.cursor;if(0!=h.g(r))break b;h.cursor=h.a-a;if(!h.e())return q;break; +case 9:switch(a=h.g(p),a){case 1:if(!h.e())return q;break;case 2:if(!h.b("\u0bae\u0bcd"))return q}}break a}h.cursor=h.a-b;h.d=h.cursor;if(!h.h("\u0bcd"))return q;b:{b=h.a-h.cursor;if(0!=h.g(v)){b=h.a-h.cursor;h.h("\u0bcd")?0==h.g(w)&&(h.cursor=h.a-b):h.cursor=h.a-b;h.c=h.cursor;if(!h.e())return q;break b}h.cursor=h.a-b;if(0!=h.g(y)&&(h.c=h.cursor,h.h("\u0bcd"))){if(!h.e())return q;break b}h.cursor=h.a-b;b=h.a-h.cursor;if(0==h.g(z))return q;h.cursor=h.a-b;h.c=h.cursor;if(!h.e())return q}}h.cursor= +h.f;return f}function b(){var a;h.f=h.cursor;h.cursor=h.a;h.d=h.cursor;a=h.g(x);if(0!=a){h.c=h.cursor;switch(a){case 1:a:{a=h.a-h.cursor;if(0!=h.g(D)){if(!h.b("\u0bc1\u0b99\u0bcd"))return;break a}h.cursor=h.a-a;if(!h.b("\u0bcd"))return}break;case 2:if(!h.b("\u0bb2\u0bcd"))return;break;case 3:if(!h.b("\u0bb3\u0bcd"))return;break;case 4:if(!h.e())return}h.cursor=h.f}}function l(){var a;if(g()&&(h.f=h.cursor,h.cursor=h.a,h.d=h.cursor,a=h.g(O),0!=a)){h.c=h.cursor;switch(a){case 1:if(!h.b("\u0bcd"))return; +break;case 
2:a=h.a-h.cursor;if(0!=h.g(J))return;h.cursor=h.a-a;if(!h.b("\u0bcd"))return;break;case 3:if(!h.e())return}h.cursor=h.f;m()}}function n(){var a;K=q;if(g()){h.f=h.cursor;h.cursor=h.a;a:{var b=h.a-h.cursor;b:{var c=h.a-h.cursor;h.d=h.cursor;a=h.g(Q);if(0!=a){h.c=h.cursor;switch(a){case 1:if(!h.e())return;break;case 2:if(!h.b("\u0bcd"))return;break;case 3:a=h.a-h.cursor;if(h.h("\u0bae"))break b;h.cursor=h.a-a;if(!h.b("\u0bcd"))return;break;case 4:if(7>h.j.length)break b;if(!h.b("\u0bcd"))return; +break;case 5:a=h.a-h.cursor;if(0!=h.g(N))break b;h.cursor=h.a-a;if(!h.b("\u0bcd"))return;break;case 6:a=h.a-h.cursor;if(0!=h.g(P))break b;h.cursor=h.a-a;if(!h.e())return;break;case 7:if(!h.b("\u0bbf"))return}h.cursor=h.a-c;break a}}h.cursor=h.a-b;b=h.a-h.cursor;h.d=h.cursor;if(!h.h("\u0bc8"))return;b:{c=h.a-h.cursor;c:{a=h.a-h.cursor;if(0!=h.g(T))break c;h.cursor=h.a-a;break b}h.cursor=h.a-c;c=h.a-h.cursor;if(0==h.g(U)||!h.h("\u0bcd"))return;h.cursor=h.a-c}h.c=h.cursor;if(!h.b("\u0bcd"))return;h.cursor= +h.a-b}K=f;b=h.a-h.cursor;h.d=h.cursor;if(h.h("\u0bbf\u0ba9\u0bcd")&&(h.c=h.cursor,!h.b("\u0bcd")))return;h.cursor=h.a-b;h.cursor=h.f;m()}}function e(){var a;H=q;if(g()){h.f=h.cursor;h.cursor=h.a;var b=h.a-h.cursor;a:{var c=h.a-h.cursor;h.d=h.cursor;a=h.g(V);if(0!=a){h.c=h.cursor;switch(a){case 1:if(!h.e())return;break;case 2:a=h.a-h.cursor;if(0!=h.g(R))break a;h.cursor=h.a-a;if(!h.e())return;break;case 3:a=h.a-h.cursor;if(0!=h.g(S))break a;h.cursor=h.a-a;if(!h.e())return;break;case 4:a=h.a-h.cursor; +if(h.h("\u0b9a"))break a;h.cursor=h.a-a;if(!h.b("\u0bcd"))return;break;case 5:if(!h.b("\u0bcd"))return;break;case 6:a=h.a-h.cursor;if(!h.h("\u0bcd"))break a;h.cursor=h.a-a;if(!h.e())return}H=f;h.cursor=h.a-c}}h.cursor=h.a-b;b=h.a-h.cursor;h.d=h.cursor;if(0!=h.g(M)){h.c=h.cursor;if(!h.e())return;H=f}h.cursor=h.a-b;h.cursor=h.f;m()}}var h=new 
C,u=[["\u0bb5\u0bc1",-1,3],["\u0bb5\u0bc2",-1,4],["\u0bb5\u0bca",-1,2],["\u0bb5\u0bcb",-1,1]],t=[["\u0b95",-1,-1],["\u0b99",-1,-1],["\u0b9a",-1,-1],["\u0b9e", +-1,-1],["\u0ba4",-1,-1],["\u0ba8",-1,-1],["\u0baa",-1,-1],["\u0bae",-1,-1],["\u0baf",-1,-1],["\u0bb5",-1,-1]],s=[["\u0bbf",-1,-1],["\u0bc0",-1,-1],["\u0bc8",-1,-1]],r=[["\u0bbe",-1,-1],["\u0bbf",-1,-1],["\u0bc0",-1,-1],["\u0bc1",-1,-1],["\u0bc2",-1,-1],["\u0bc6",-1,-1],["\u0bc7",-1,-1],["\u0bc8",-1,-1]],p=[["",-1,2],["\u0bc8",0,1],["\u0bcd",0,1]],k=[["\u0ba8\u0bcd\u0ba4",-1,1],["\u0baf",-1,1],["\u0bb5",-1,1],["\u0ba9\u0bc1",-1,8],["\u0bc1\u0b95\u0bcd",-1,7],["\u0bc1\u0b95\u0bcd\u0b95\u0bcd",-1,7], +["\u0b9f\u0bcd\u0b95\u0bcd",-1,3],["\u0bb1\u0bcd\u0b95\u0bcd",-1,4],["\u0b99\u0bcd",-1,9],["\u0b9f\u0bcd\u0b9f\u0bcd",-1,5],["\u0ba4\u0bcd\u0ba4\u0bcd",-1,6],["\u0ba8\u0bcd\u0ba4\u0bcd",-1,1],["\u0ba8\u0bcd",-1,1],["\u0b9f\u0bcd\u0baa\u0bcd",-1,3],["\u0baf\u0bcd",-1,2],["\u0ba9\u0bcd\u0bb1\u0bcd",-1,4],["\u0bb5\u0bcd",-1,1]],v=[["\u0b95",-1,-1],["\u0b9a",-1,-1],["\u0b9f",-1,-1],["\u0ba4",-1,-1],["\u0baa",-1,-1],["\u0bb1",-1,-1]],w=[["\u0b95",-1,-1],["\u0b9a",-1,-1],["\u0b9f",-1,-1],["\u0ba4",-1,-1], +["\u0baa",-1,-1],["\u0bb1",-1,-1]],y=[["\u0b9e",-1,-1],["\u0ba3",-1,-1],["\u0ba8",-1,-1],["\u0ba9",-1,-1],["\u0bae",-1,-1],["\u0baf",-1,-1],["\u0bb0",-1,-1],["\u0bb2",-1,-1],["\u0bb3",-1,-1],["\u0bb4",-1,-1],["\u0bb5",-1,-1]],z=[["\u0bbe",-1,-1],["\u0bbf",-1,-1],["\u0bc0",-1,-1],["\u0bc1",-1,-1],["\u0bc2",-1,-1],["\u0bc6",-1,-1],["\u0bc7",-1,-1],["\u0bc8",-1,-1],["\u0bcd",-1,-1]],c=[["\u0b85",-1,-1],["\u0b87",-1,-1],["\u0b89",-1,-1]],A=[["\u0b95",-1,-1],["\u0b99",-1,-1],["\u0b9a",-1,-1],["\u0b9e", 
+-1,-1],["\u0ba4",-1,-1],["\u0ba8",-1,-1],["\u0baa",-1,-1],["\u0bae",-1,-1],["\u0baf",-1,-1],["\u0bb5",-1,-1]],D=[["\u0b95",-1,-1],["\u0b9a",-1,-1],["\u0b9f",-1,-1],["\u0ba4",-1,-1],["\u0baa",-1,-1],["\u0bb1",-1,-1]],x=[["\u0b95\u0bb3\u0bcd",-1,4],["\u0bc1\u0b99\u0bcd\u0b95\u0bb3\u0bcd",0,1],["\u0b9f\u0bcd\u0b95\u0bb3\u0bcd",0,3],["\u0bb1\u0bcd\u0b95\u0bb3\u0bcd",0,2]],G=[["\u0bbe",-1,-1],["\u0bc7",-1,-1],["\u0bcb",-1,-1]],E=[["\u0baa\u0bbf",-1,-1],["\u0bb5\u0bbf",-1,-1]],J=[["\u0bbe",-1,-1],["\u0bbf", +-1,-1],["\u0bc0",-1,-1],["\u0bc1",-1,-1],["\u0bc2",-1,-1],["\u0bc6",-1,-1],["\u0bc7",-1,-1],["\u0bc8",-1,-1]],O=[["\u0baa\u0b9f\u0bcd\u0b9f",-1,3],["\u0baa\u0b9f\u0bcd\u0b9f\u0ba3",-1,3],["\u0ba4\u0bbe\u0ba9",-1,3],["\u0baa\u0b9f\u0bbf\u0ba4\u0bbe\u0ba9",2,3],["\u0bc6\u0ba9",-1,1],["\u0bbe\u0b95\u0bbf\u0baf",-1,1],["\u0b95\u0bc1\u0bb0\u0bbf\u0baf",-1,3],["\u0bc1\u0b9f\u0bc8\u0baf",-1,1],["\u0bb2\u0bcd\u0bb2",-1,2],["\u0bc1\u0bb3\u0bcd\u0bb3",-1,1],["\u0bbe\u0b95\u0bbf",-1,1],["\u0baa\u0b9f\u0bbf", +-1,3],["\u0bbf\u0ba9\u0bcd\u0bb1\u0bbf",-1,1],["\u0baa\u0bb1\u0bcd\u0bb1\u0bbf",-1,3],["\u0baa\u0b9f\u0bc1",-1,3],["\u0bb5\u0bbf\u0b9f\u0bc1",-1,3],["\u0baa\u0b9f\u0bcd\u0b9f\u0bc1",-1,3],["\u0bb5\u0bbf\u0b9f\u0bcd\u0b9f\u0bc1",-1,3],["\u0baa\u0b9f\u0bcd\u0b9f\u0ba4\u0bc1",-1,3],["\u0bc6\u0ba9\u0bcd\u0bb1\u0bc1",-1,1],["\u0bc1\u0b9f\u0bc8",-1,1],["\u0bbf\u0bb2\u0bcd\u0bb2\u0bc8",-1,1],["\u0bc1\u0b9f\u0ba9\u0bcd",-1,1],["\u0bbf\u0b9f\u0bae\u0bcd",-1,1],["\u0bc6\u0bb2\u0bcd\u0bb2\u0bbe\u0bae\u0bcd", 
+-1,3],["\u0bc6\u0ba9\u0bc1\u0bae\u0bcd",-1,1]],N=[["\u0bbe",-1,-1],["\u0bbf",-1,-1],["\u0bc0",-1,-1],["\u0bc1",-1,-1],["\u0bc2",-1,-1],["\u0bc6",-1,-1],["\u0bc7",-1,-1],["\u0bc8",-1,-1]],P=[["\u0bbe",-1,-1],["\u0bbf",-1,-1],["\u0bc0",-1,-1],["\u0bc1",-1,-1],["\u0bc2",-1,-1],["\u0bc6",-1,-1],["\u0bc7",-1,-1],["\u0bc8",-1,-1]],Q=[["\u0bb5\u0bbf\u0b9f",-1,2],["\u0bc0",-1,7],["\u0bca\u0b9f\u0bc1",-1,2],["\u0bcb\u0b9f\u0bc1",-1,2],["\u0ba4\u0bc1",-1,6],["\u0bbf\u0bb0\u0bc1\u0ba8\u0bcd\u0ba4\u0bc1",4,2], +["\u0bbf\u0ba9\u0bcd\u0bb1\u0bc1",-1,2],["\u0bc1\u0b9f\u0bc8",-1,2],["\u0ba9\u0bc8",-1,1],["\u0b95\u0ba3\u0bcd",-1,1],["\u0bbf\u0ba9\u0bcd",-1,3],["\u0bae\u0bc1\u0ba9\u0bcd",-1,1],["\u0bbf\u0b9f\u0bae\u0bcd",-1,4],["\u0bbf\u0bb1\u0bcd",-1,2],["\u0bae\u0bc7\u0bb1\u0bcd",-1,1],["\u0bb2\u0bcd",-1,5],["\u0bbe\u0bae\u0bb2\u0bcd",15,2],["\u0bbe\u0bb2\u0bcd",15,2],["\u0bbf\u0bb2\u0bcd",15,2],["\u0bae\u0bc7\u0bb2\u0bcd",15,1],["\u0bc1\u0bb3\u0bcd",-1,2],["\u0b95\u0bc0\u0bb4\u0bcd",-1,1]],T=[["\u0b95",-1, +-1],["\u0b9a",-1,-1],["\u0b9f",-1,-1],["\u0ba4",-1,-1],["\u0baa",-1,-1],["\u0bb1",-1,-1]],U=[["\u0b95",-1,-1],["\u0b9a",-1,-1],["\u0b9f",-1,-1],["\u0ba4",-1,-1],["\u0baa",-1,-1],["\u0bb1",-1,-1]],R=[["\u0b85",-1,-1],["\u0b86",-1,-1],["\u0b87",-1,-1],["\u0b88",-1,-1],["\u0b89",-1,-1],["\u0b8a",-1,-1],["\u0b8e",-1,-1],["\u0b8f",-1,-1],["\u0b90",-1,-1],["\u0b92",-1,-1],["\u0b93",-1,-1],["\u0b94",-1,-1]],S=[["\u0bbe",-1,-1],["\u0bbf",-1,-1],["\u0bc0",-1,-1],["\u0bc1",-1,-1],["\u0bc2",-1,-1],["\u0bc6", 
+-1,-1],["\u0bc7",-1,-1],["\u0bc8",-1,-1]],V=[["\u0b95",-1,1],["\u0ba4",-1,1],["\u0ba9",-1,1],["\u0baa",-1,1],["\u0baf",-1,1],["\u0bbe",-1,5],["\u0b95\u0bc1",-1,6],["\u0baa\u0b9f\u0bc1",-1,1],["\u0ba4\u0bc1",-1,3],["\u0bbf\u0bb1\u0bcd\u0bb1\u0bc1",-1,1],["\u0ba9\u0bc8",-1,1],["\u0bb5\u0bc8",-1,1],["\u0ba9\u0ba9\u0bcd",-1,1],["\u0baa\u0ba9\u0bcd",-1,1],["\u0bb5\u0ba9\u0bcd",-1,2],["\u0bbe\u0ba9\u0bcd",-1,4],["\u0ba9\u0bbe\u0ba9\u0bcd",15,1],["\u0bae\u0bbf\u0ba9\u0bcd",-1,1],["\u0ba9\u0bc6\u0ba9\u0bcd", +-1,1],["\u0bc7\u0ba9\u0bcd",-1,5],["\u0ba9\u0bae\u0bcd",-1,1],["\u0baa\u0bae\u0bcd",-1,1],["\u0bbe\u0bae\u0bcd",-1,5],["\u0b95\u0bc1\u0bae\u0bcd",-1,1],["\u0b9f\u0bc1\u0bae\u0bcd",-1,5],["\u0ba4\u0bc1\u0bae\u0bcd",-1,1],["\u0bb1\u0bc1\u0bae\u0bcd",-1,1],["\u0bc6\u0bae\u0bcd",-1,5],["\u0bc7\u0bae\u0bcd",-1,5],["\u0bcb\u0bae\u0bcd",-1,5],["\u0bbe\u0baf\u0bcd",-1,5],["\u0ba9\u0bb0\u0bcd",-1,1],["\u0baa\u0bb0\u0bcd",-1,1],["\u0bc0\u0baf\u0bb0\u0bcd",-1,5],["\u0bb5\u0bb0\u0bcd",-1,1],["\u0bbe\u0bb0\u0bcd", +-1,5],["\u0ba9\u0bbe\u0bb0\u0bcd",35,1],["\u0bae\u0bbe\u0bb0\u0bcd",35,1],["\u0b95\u0bca\u0ba3\u0bcd\u0b9f\u0bbf\u0bb0\u0bcd",-1,1],["\u0ba9\u0bbf\u0bb0\u0bcd",-1,5],["\u0bc0\u0bb0\u0bcd",-1,5],["\u0ba9\u0bb3\u0bcd",-1,1],["\u0baa\u0bb3\u0bcd",-1,1],["\u0bb5\u0bb3\u0bcd",-1,1],["\u0bbe\u0bb3\u0bcd",-1,5],["\u0ba9\u0bbe\u0bb3\u0bcd",44,1]],M=[["\u0b95\u0bbf\u0bb1",-1,-1],["\u0b95\u0bbf\u0ba9\u0bcd\u0bb1",-1,-1],["\u0bbe\u0ba8\u0bbf\u0ba9\u0bcd\u0bb1",-1,-1],["\u0b95\u0bbf\u0bb1\u0bcd",-1,-1],["\u0b95\u0bbf\u0ba9\u0bcd\u0bb1\u0bcd", +-1,-1],["\u0bbe\u0ba8\u0bbf\u0ba9\u0bcd\u0bb1\u0bcd",-1,-1]],K=q,H=q;this.m=function(){K=q;var k=h.cursor;a();h.cursor=k;if(!g())return q;k=h.cursor;h.c=h.cursor;if(h.n("\u0b8e")&&0!=h.o(t)&&h.n("\u0bcd")&&(h.d=h.cursor,h.e())){var 
p=h.cursor;d();h.cursor=p}h.cursor=k;k=h.cursor;h.c=h.cursor;if(!(0==h.o(c)||0==h.o(A))&&h.n("\u0bcd"))h.d=h.cursor,h.e()&&(p=h.cursor,d(),h.cursor=p);h.cursor=k;k=h.cursor;a:if(g()){h.f=h.cursor;h.cursor=h.a;p=h.a-h.cursor;h.d=h.cursor;if(0!=h.g(G)&&(h.c=h.cursor,!h.b("\u0bcd")))break a; +h.cursor=h.a-p;h.cursor=h.f;m()}h.cursor=k;k=h.cursor;g()&&(h.f=h.cursor,h.cursor=h.a,h.d=h.cursor,h.h("\u0bc1\u0bae\u0bcd")&&(h.c=h.cursor,h.b("\u0bcd")&&(h.cursor=h.f,p=h.cursor,a(),h.cursor=p)));h.cursor=k;k=h.cursor;l();h.cursor=k;k=h.cursor;n();h.cursor=k;k=h.cursor;b();h.cursor=k;k=h.cursor;g()&&(h.f=h.cursor,h.cursor=h.a,h.d=h.cursor,0!=h.g(E)&&(h.c=h.cursor,h.e()&&(h.cursor=h.f)));h.cursor=k;k=h.cursor;for(H=f;;){p=h.cursor;if(H){p=h.cursor;e();h.cursor=p;continue}h.cursor=p;break}h.cursor= +k;return f};this.stemWord=function(a){h.p(a);this.m();return h.j}};window.TurkishStemmer=function(){function g(){var a=c.a-c.cursor;a:for(;;){var b=c.a-c.cursor;if(c.l(F,97,305)){c.cursor=c.a-b;break a}c.cursor=c.a-b;if(c.cursor<=c.f)return q;c.cursor--}a:{b=c.a-c.cursor;b:if(c.h("a")){c:for(;;){var d=c.a-c.cursor;if(c.l(da,97,305)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break b;c.cursor--}break a}c.cursor=c.a-b;b:if(c.h("e")){c:for(;;){d=c.a-c.cursor;if(c.l(ea,101,252)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break b;c.cursor--}break a}c.cursor= +c.a-b;b:if(c.h("\u0131")){c:for(;;){d=c.a-c.cursor;if(c.l(fa,97,305)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break b;c.cursor--}break a}c.cursor=c.a-b;b:if(c.h("i")){c:for(;;){d=c.a-c.cursor;if(c.l(ga,101,105)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break b;c.cursor--}break a}c.cursor=c.a-b;b:if(c.h("o")){c:for(;;){d=c.a-c.cursor;if(c.l(Y,111,117)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break b;c.cursor--}break a}c.cursor=c.a-b;b:if(c.h("\u00f6")){c:for(;;){d= +c.a-c.cursor;if(c.l(Z,246,252)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break 
b;c.cursor--}break a}c.cursor=c.a-b;b:if(c.h("u")){c:for(;;){d=c.a-c.cursor;if(c.l(Y,111,117)){c.cursor=c.a-d;break c}c.cursor=c.a-d;if(c.cursor<=c.f)break b;c.cursor--}break a}c.cursor=c.a-b;if(!c.h("\u00fc"))return q;b:for(;;){b=c.a-c.cursor;if(c.l(Z,246,252)){c.cursor=c.a-b;break b}c.cursor=c.a-b;if(c.cursor<=c.f)return q;c.cursor--}}c.cursor=c.a-a;return f}function d(){a:{var a=c.a-c.cursor;if(c.h("n")){var b= +c.a-c.cursor;if(c.l(F,97,305)){c.cursor=c.a-b;break a}}c.cursor=c.a-a;a=c.a-c.cursor;b=c.a-c.cursor;if(c.h("n"))return c.cursor=c.a-b,q;c.cursor=c.a-a;a=c.a-c.cursor;if(c.cursor<=c.f)return q;c.cursor--;if(!c.l(F,97,305))return q;c.cursor=c.a-a}return f}function m(){a:{var a=c.a-c.cursor;if(c.h("s")){var b=c.a-c.cursor;if(c.l(F,97,305)){c.cursor=c.a-b;break a}}c.cursor=c.a-a;a=c.a-c.cursor;b=c.a-c.cursor;if(c.h("s"))return c.cursor=c.a-b,q;c.cursor=c.a-a;a=c.a-c.cursor;if(c.cursor<=c.f)return q;c.cursor--; +if(!c.l(F,97,305))return q;c.cursor=c.a-a}return f}function a(){a:{var a=c.a-c.cursor;if(c.h("y")){var b=c.a-c.cursor;if(c.l(F,97,305)){c.cursor=c.a-b;break a}}c.cursor=c.a-a;a=c.a-c.cursor;b=c.a-c.cursor;if(c.h("y"))return c.cursor=c.a-b,q;c.cursor=c.a-a;a=c.a-c.cursor;if(c.cursor<=c.f)return q;c.cursor--;if(!c.l(F,97,305))return q;c.cursor=c.a-a}return f}function b(){a:{var a=c.a-c.cursor;if(c.l(W,105,305)){var b=c.a-c.cursor;if(c.q(F,97,305)){c.cursor=c.a-b;break a}}c.cursor=c.a-a;a=c.a-c.cursor; +b=c.a-c.cursor;if(c.l(W,105,305))return c.cursor=c.a-b,q;c.cursor=c.a-a;a=c.a-c.cursor;if(c.cursor<=c.f)return q;c.cursor--;if(!c.q(F,97,305))return q;c.cursor=c.a-a}return f}function l(){return 0==c.g(A)||!b()?q:f}function n(){return!g()||!c.l(W,105,305)||!m()?q:f}function e(){return 0==c.g(D)?q:f}function h(){return!g()||0==c.g(R)||!a()?q:f}function u(){return!g()||0==c.g(S)?q:f}function t(){return!g()||0==c.g(V)||!a()?q:f}function s(){return!g()||0==c.g(K)?q:f}function r(){return!g()||0==c.g($)|| +!a()?q:f}function p(){return 
0==c.g(aa)||!a()?q:f}function k(){return!g()||0==c.g(ba)||!a()?q:f}function v(){c.d=c.cursor;X=f;a:{var b=c.a-c.cursor;b:{c:{var d=c.a-c.cursor;if(k())break c;c.cursor=c.a-d;if(r())break c;c.cursor=c.a-d;if(p())break c;c.cursor=c.a-d;if(!c.h("ken")||!a())break b}break a}c.cursor=c.a-b;if(0!=c.g(L)){c:{d=c.a-c.cursor;if(0!=c.g(M))break c;c.cursor=c.a-d;if(s())break c;c.cursor=c.a-d;if(h())break c;c.cursor=c.a-d;if(u())break c;c.cursor=c.a-d;if(t())break c;c.cursor=c.a- +d}if(k())break a}c.cursor=c.a-b;if(s()){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;d:{d=c.a-c.cursor;if(g()&&0!=c.g(I))break d;c.cursor=c.a-d;if(r())break d;c.cursor=c.a-d;if(p())break d;c.cursor=c.a-d;k()||(c.cursor=c.a-b)}X=q;break a}c.cursor=c.a-b;b:if(g()&&0!=c.g(H)){c:{d=c.a-c.cursor;if(r())break c;c.cursor=c.a-d;if(!p())break b}break a}c.cursor=c.a-b;b:{c:{d=c.a-c.cursor;if(0!=c.g(M))break c;c.cursor=c.a-d;if(t())break c;c.cursor=c.a-d;if(u())break c;c.cursor=c.a-d;if(!h())break b}c.c= +c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;k()||(c.cursor=c.a-b);break a}c.cursor=c.a-b;if(!g()||0==c.g(I))return;c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;c:{d=c.a-c.cursor;if(0!=c.g(M))break c;c.cursor=c.a-d;if(s())break c;c.cursor=c.a-d;if(h())break c;c.cursor=c.a-d;if(u())break c;c.cursor=c.a-d;if(t())break c;c.cursor=c.a-d}k()||(c.cursor=c.a-b)}c.c=c.cursor;c.e()}function w(){c.d=c.cursor;if(!c.h("ki"))return q;a:{var a=c.a-c.cursor;if(g()&&0!=c.g(O)){c.c=c.cursor; +if(!c.e())return q;a=c.a-c.cursor;c.d=c.cursor;d:{var b=c.a-c.cursor;if(s()){c.c=c.cursor;if(!c.e())return q;a=c.a-c.cursor;w()||(c.cursor=c.a-a);break d}c.cursor=c.a-b;if(l()){c.c=c.cursor;if(!c.e())return q;a=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return q;w()||(c.cursor=c.a-a)}else c.cursor=c.a-a}else c.cursor=c.a-a}break a}c.cursor=c.a-a;if(g()&&0!=c.g(G)&&d()){c.c=c.cursor;if(!c.e())return q;a=c.a-c.cursor;c.d=c.cursor;d:{b=c.a-c.cursor;if(e()){c.c=c.cursor;if(!c.e())return q; 
+break d}c.cursor=c.a-b;e:{c.d=c.cursor;f:{var h=c.a-c.cursor;if(l())break f;c.cursor=c.a-h;if(!n())break e}c.c=c.cursor;if(!c.e())return q;a=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return q;w()||(c.cursor=c.a-a)}else c.cursor=c.a-a;break d}c.cursor=c.a-b;w()||(c.cursor=c.a-a)}break a}c.cursor=c.a-a;if(!g()||0==c.g(N))return q;b:{a=c.a-c.cursor;if(e()){c.c=c.cursor;if(!c.e())return q;break b}c.cursor=c.a-a;if(n()){c.c=c.cursor;if(!c.e())return q;a=c.a-c.cursor;c.d=c.cursor;if(s()){c.c= +c.cursor;if(!c.e())return q;w()||(c.cursor=c.a-a)}else c.cursor=c.a-a;break b}c.cursor=c.a-a;if(!w())return q}}return f}function y(){a:{var b=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;w()||(c.cursor=c.a-b);break a}c.cursor=c.a-b;c.d=c.cursor;if(g()&&0!=c.g(U)&&d()){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;d:{var h=c.a-c.cursor;c.d=c.cursor;if(e()){c.c=c.cursor;if(!c.e())return;break d}c.cursor=c.a-h;e:{c.d=c.cursor;f:{var k=c.a-c.cursor;if(l())break f;c.cursor= +c.a-k;if(!n())break e}c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;w()||(c.cursor=c.a-b)}else c.cursor=c.a-b;break d}c.cursor=c.a-h;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;w()||(c.cursor=c.a-b)}else c.cursor=c.a-b}break a}c.cursor=c.a-b;b:{c.d=c.cursor;c:{h=c.a-c.cursor;if(g()&&0!=c.g(N))break c;c.cursor=c.a-h;if(!g()||0==c.g(J))break b}c:{h=c.a-c.cursor;if(e()){c.c=c.cursor;if(!c.e())return;break c}c.cursor=c.a-h;if(n()){c.c=c.cursor; +if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;w()||(c.cursor=c.a-b)}else c.cursor=c.a-b;break c}c.cursor=c.a-h;if(!w())break b}break a}c.cursor=c.a-b;b:{c.d=c.cursor;c:{h=c.a-c.cursor;if(g()&&0!=c.g(Q))break c;c.cursor=c.a-h;if(!g()||0==c.g(x))break b}c:{h=c.a-c.cursor;if(n()){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;w()||(c.cursor=c.a-b)}else c.cursor=c.a-b;break 
c}c.cursor=c.a-h;if(!e())break b}break a}c.cursor= +c.a-b;c.d=c.cursor;if(g()&&0!=c.g(P)){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;d:{h=c.a-c.cursor;if(l()){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;w()||(c.cursor=c.a-b)}else c.cursor=c.a-b;break d}c.cursor=c.a-h;if(s()){c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;w()||(c.cursor=c.a-b);break d}c.cursor=c.a-h;w()||(c.cursor=c.a-b)}break a}c.cursor=c.a-b;b:{c.d=c.cursor;c:{h=c.a-c.cursor;if(g()&&0!=c.g(G)&&d())break c;c.cursor=c.a- +h;if(!g()||0==c.g(T)||!a())break b}c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;d:{h=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;if(w())break d}c.cursor=c.a-h;e:{c.d=c.cursor;f:{k=c.a-c.cursor;if(l())break f;c.cursor=c.a-k;if(!n())break e}c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c.d=c.cursor;if(s()){c.c=c.cursor;if(!c.e())return;w()||(c.cursor=c.a-b)}else c.cursor=c.a-b;break d}c.cursor=c.a-h;w()||(c.cursor=c.a-b)}break a}c.cursor=c.a-b;c.d=c.cursor;if(e()){c.c=c.cursor;if(!c.e())return; +break a}c.cursor=c.a-b;if(w())break a;c.cursor=c.a-b;b:{c.d=c.cursor;c:{h=c.a-c.cursor;if(g()&&0!=c.g(O))break c;c.cursor=c.a-h;if(g()&&c.l(W,105,305)&&a())break c;c.cursor=c.a-h;if(!g()||0==c.g(E)||!a())break b}c.c=c.cursor;if(!c.e())return;b=c.a-c.cursor;c:{c.d=c.cursor;d:{h=c.a-c.cursor;if(l()){c.c=c.cursor;if(!c.e())return;h=c.a-c.cursor;c.d=c.cursor;s()||(c.cursor=c.a-h);break d}c.cursor=c.a-h;if(!s()){c.cursor=c.a-b;break c}}c.c=c.cursor;if(!c.e())return;c.d=c.cursor;w()||(c.cursor=c.a-b)}break a}c.cursor= +c.a-b;c.d=c.cursor;b:{b=c.a-c.cursor;if(l())break b;c.cursor=c.a-b;if(!n())return}c.c=c.cursor;c.e()&&(b=c.a-c.cursor,c.d=c.cursor,s()?(c.c=c.cursor,c.e()&&!w()&&(c.cursor=c.a-b)):c.cursor=c.a-b)}}function z(){c.d=c.cursor;c.c=c.cursor;a:{var a=c.a-c.cursor;if(c.h("d"))break a;c.cursor=c.a-a;if(!c.h("g"))return}a:for(;;){a=c.a-c.cursor;if(c.l(F,97,305)){c.cursor=c.a-a;break 
a}c.cursor=c.a-a;if(c.cursor<=c.f)return;c.cursor--}a:{a=c.a-c.cursor;b:{c:{var b=c.a-c.cursor;if(c.h("a"))break c;c.cursor= +c.a-b;if(!c.h("\u0131"))break b}if(!c.b("\u0131"))return;break a}c.cursor=c.a-a;b:{c:{b=c.a-c.cursor;if(c.h("e"))break c;c.cursor=c.a-b;if(!c.h("i"))break b}if(!c.b("i"))return;break a}c.cursor=c.a-a;b:{c:{b=c.a-c.cursor;if(c.h("o"))break c;c.cursor=c.a-b;if(!c.h("u"))break b}if(!c.b("u"))return;break a}c.cursor=c.a-a;b:{a=c.a-c.cursor;if(c.h("\u00f6"))break b;c.cursor=c.a-a;if(!c.h("\u00fc"))return}c.b("\u00fc")}}var c=new C,A=[["m",-1,-1],["n",-1,-1],["miz",-1,-1],["niz",-1,-1],["muz",-1,-1],["nuz", +-1,-1],["m\u00fcz",-1,-1],["n\u00fcz",-1,-1],["m\u0131z",-1,-1],["n\u0131z",-1,-1]],D=[["leri",-1,-1],["lar\u0131",-1,-1]],x=[["ni",-1,-1],["nu",-1,-1],["n\u00fc",-1,-1],["n\u0131",-1,-1]],G=[["in",-1,-1],["un",-1,-1],["\u00fcn",-1,-1],["\u0131n",-1,-1]],E=[["a",-1,-1],["e",-1,-1]],J=[["na",-1,-1],["ne",-1,-1]],O=[["da",-1,-1],["ta",-1,-1],["de",-1,-1],["te",-1,-1]],N=[["nda",-1,-1],["nde",-1,-1]],P=[["dan",-1,-1],["tan",-1,-1],["den",-1,-1],["ten",-1,-1]],Q=[["ndan",-1,-1],["nden",-1,-1]],T=[["la", +-1,-1],["le",-1,-1]],U=[["ca",-1,-1],["ce",-1,-1]],R=[["im",-1,-1],["um",-1,-1],["\u00fcm",-1,-1],["\u0131m",-1,-1]],S=[["sin",-1,-1],["sun",-1,-1],["s\u00fcn",-1,-1],["s\u0131n",-1,-1]],V=[["iz",-1,-1],["uz",-1,-1],["\u00fcz",-1,-1],["\u0131z",-1,-1]],M=[["siniz",-1,-1],["sunuz",-1,-1],["s\u00fcn\u00fcz",-1,-1],["s\u0131n\u0131z",-1,-1]],K=[["lar",-1,-1],["ler",-1,-1]],H=[["niz",-1,-1],["nuz",-1,-1],["n\u00fcz",-1,-1],["n\u0131z",-1,-1]],I=[["dir",-1,-1],["tir",-1,-1],["dur",-1,-1],["tur",-1,-1], 
+["d\u00fcr",-1,-1],["t\u00fcr",-1,-1],["d\u0131r",-1,-1],["t\u0131r",-1,-1]],L=[["cas\u0131na",-1,-1],["cesine",-1,-1]],$=[["di",-1,-1],["ti",-1,-1],["dik",-1,-1],["tik",-1,-1],["duk",-1,-1],["tuk",-1,-1],["d\u00fck",-1,-1],["t\u00fck",-1,-1],["d\u0131k",-1,-1],["t\u0131k",-1,-1],["dim",-1,-1],["tim",-1,-1],["dum",-1,-1],["tum",-1,-1],["d\u00fcm",-1,-1],["t\u00fcm",-1,-1],["d\u0131m",-1,-1],["t\u0131m",-1,-1],["din",-1,-1],["tin",-1,-1],["dun",-1,-1],["tun",-1,-1],["d\u00fcn",-1,-1],["t\u00fcn",-1, +-1],["d\u0131n",-1,-1],["t\u0131n",-1,-1],["du",-1,-1],["tu",-1,-1],["d\u00fc",-1,-1],["t\u00fc",-1,-1],["d\u0131",-1,-1],["t\u0131",-1,-1]],aa=[["sa",-1,-1],["se",-1,-1],["sak",-1,-1],["sek",-1,-1],["sam",-1,-1],["sem",-1,-1],["san",-1,-1],["sen",-1,-1]],ba=[["mi\u015f",-1,-1],["mu\u015f",-1,-1],["m\u00fc\u015f",-1,-1],["m\u0131\u015f",-1,-1]],ca=[["b",-1,1],["c",-1,2],["d",-1,3],["\u011f",-1,4]],F=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,8,0,0,0,0,0,0,1],W=[1,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,8,0,0,0,0,0,0,1],da=[1,64,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1],ea=[17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130],fa=[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1],ga=[17],Y=[65],Z=[65],X=q;this.m=function(){var a;a:{a=c.cursor;for(var b=2;0=c.a){a=q;break a}c.cursor++}c.cursor=a;a=f}if(!a)return q;c.f=c.cursor;c.cursor=c.a;a=c.a-c.cursor;v();c.cursor=c.a-a;if(!X)return q;a=c.a-c.cursor;y();c.cursor=c.a-a;c.cursor=c.f;a:{c.f= +c.cursor;c.cursor=c.a;a=c.a-c.cursor;c.h("ad")?(b=c.a-c.cursor,c.h("soy")||(c.cursor=c.a-b),b=c.cursor>c.f?q:f):b=q;if(b){a=q;break a}c.cursor=c.a-a;a=c.a-c.cursor;z();c.cursor=c.a-a;a=c.a-c.cursor;c.d=c.cursor;b=c.g(ca);if(0!=b)switch(c.c=c.cursor,b){case 1:if(!c.b("p"))break;break;case 2:if(!c.b("\u00e7"))break;break;case 3:if(!c.b("t"))break;break;case 4:c.b("k")}c.cursor=c.a-a;c.cursor=c.f;a=f}return!a?q:f};this.stemWord=function(a){c.p(a);this.m();return c.j}};window.YiddishStemmer=function(){function g(){var 
a,d=b.cursor;for(;;){var c=b.cursor;b:{c:for(;;){var e=b.cursor;d:if(b.c=b.cursor,a=b.o(l),0!=a){b.d=b.cursor;switch(a){case 1:a=b.cursor;if(b.n("\u05bc"))break d;b.cursor=a;if(!b.b("\u05f0"))return;break;case 2:a=b.cursor;if(b.n("\u05b4"))break d;b.cursor=a;if(!b.b("\u05f1"))return;break;case 3:a=b.cursor;if(b.n("\u05b4"))break d;b.cursor=a;if(!b.b("\u05f2"))return;break;case 4:if(!b.b("\u05db"))return;break;case 5:if(!b.b("\u05de"))return;break; +case 6:if(!b.b("\u05e0"))return;break;case 7:if(!b.b("\u05e4"))return;break;case 8:if(!b.b("\u05e6"))return}b.cursor=e;break c}b.cursor=e;if(b.cursor>=b.a)break b;b.cursor++}continue}b.cursor=c;break}b.cursor=d;d=b.cursor;for(;;){c=b.cursor;b:{c:for(;;){e=b.cursor;b.c=b.cursor;if(b.i(r,1456,1474)){b.d=b.cursor;if(!b.e())return;b.cursor=e;break c}b.cursor=e;if(b.cursor>=b.a)break b;b.cursor++}continue}b.cursor=c;break}b.cursor=d}function d(){w=b.a;var a=b.cursor;a:if(b.c=b.cursor,b.n("\u05d2\u05e2")){var d= +b.d=b.cursor;b:{c:{var c=b.cursor;if(b.n("\u05dc\u05d8"))break c;b.cursor=c;if(!b.n("\u05d1\u05e0"))break b}b.cursor=a;break a}b.cursor=d;if(!b.b("GE"))return}else b.cursor=a;a=b.cursor;if(0==b.o(n))b.cursor=a;else b:{d=b.cursor;c:{c=b.cursor;d:{var g=b.cursor;if(b.n("\u05e6\u05d5\u05d2\u05e0"))break d;b.cursor=g;if(b.n("\u05e6\u05d5\u05e7\u05d8"))break d;b.cursor=g;if(!b.n("\u05e6\u05d5\u05e7\u05e0"))break c}if(!(b.cursorb.a)){b.cursor=d;v=b.cursor;b.cursor=a;a=b.cursor;0==b.o(e)&&(b.cursor=a);a=b.cursor;if(b.i(k,1489,1520)&&b.i(k,1489,1520)&&b.i(k,1489,1520)){w=b.cursor;return}b.cursor=a;a:for(;;){a=b.cursor;if(b.i(p,1488,1522)){b.cursor=a;break a}b.cursor=a;if(b.cursor>=b.a)return;b.cursor++}for(;;){if(b.i(p, +1488,1522))continue;break}w=b.cursor;w>=v||(w=v)}}function m(){return w<=b.cursor}function a(){var a,d=b.a-b.cursor;a:if(b.d=b.cursor,a=b.g(u),0!=a)switch(b.c=b.cursor,a){case 1:if(!m())break a;if(!b.e())return;break;case 2:if(!m())break a;if(!b.b("\u05d9\u05e2"))return;break;case 3:if(!m())break 
a;if(!b.e())return;b.d=b.cursor;a=b.g(h);if(0==a)break a;b.c=b.cursor;switch(a){case 1:if(!b.b("\u05d2\u05f2"))return;break;case 2:if(!b.b("\u05e0\u05e2\u05de"))return;break;case 3:if(!b.b("\u05de\u05f2\u05d3"))return; +break;case 4:if(!b.b("\u05d1\u05f2\u05d8"))return;break;case 5:if(!b.b("\u05d1\u05f2\u05e1"))return;break;case 6:if(!b.b("\u05f0\u05f2\u05d6"))return;break;case 7:if(!b.b("\u05d8\u05e8\u05f2\u05d1"))return;break;case 8:if(!b.b("\u05dc\u05f2\u05d8"))return;break;case 9:if(!b.b("\u05e7\u05dc\u05f2\u05d1"))return;break;case 10:if(!b.b("\u05e8\u05f2\u05d1"))return;break;case 11:if(!b.b("\u05e8\u05f2\u05e1"))return;break;case 12:if(!b.b("\u05e9\u05f0\u05f2\u05d2"))return;break;case 13:if(!b.b("\u05e9\u05de\u05f2\u05e1"))return; +break;case 14:if(!b.b("\u05e9\u05e0\u05f2\u05d3"))return;break;case 15:if(!b.b("\u05e9\u05e8\u05f2\u05d1"))return;break;case 16:if(!b.b("\u05d1\u05d9\u05e0\u05d3"))return;break;case 17:if(!b.b("\u05f0\u05d9\u05d8\u05e9"))return;break;case 18:if(!b.b("\u05d6\u05d9\u05e0\u05d2"))return;break;case 19:if(!b.b("\u05d8\u05e8\u05d9\u05e0\u05e7"))return;break;case 20:if(!b.b("\u05e6\u05f0\u05d9\u05e0\u05d2"))return;break;case 21:if(!b.b("\u05e9\u05dc\u05d9\u05e0\u05d2"))return;break;case 22:if(!b.b("\u05d1\u05f2\u05d2"))return; +break;case 23:if(!b.b("\u05d4\u05f2\u05d1"))return;break;case 24:if(!b.b("\u05e4\u05d0\u05e8\u05dc\u05d9\u05e8"))return;break;case 25:if(!b.b("\u05e9\u05d8\u05f2"))return;break;case 26:if(!b.b("\u05e9\u05f0\u05e2\u05e8"))return}break;case 4:b:{a=b.a-b.cursor;if(m()){if(!b.e())return;break b}b.cursor=b.a-a;if(!b.b("\u05d8"))return}b.d=b.cursor;if(!b.h("\u05d1\u05e8\u05d0\u05db"))break a;a=b.a-b.cursor;b.h("\u05d2\u05e2")||(b.cursor=b.a-a);b.c=b.cursor;if(!b.b("\u05d1\u05e8\u05e2\u05e0\u05d2"))return; +break;case 5:if(!b.b("\u05d2\u05f2"))return;break;case 6:if(!b.b("\u05e0\u05e2\u05de"))return;break;case 7:if(!b.b("\u05e9\u05e8\u05f2\u05d1"))return;break;case 8:if(!b.b("\u05de\u05f2\u05d3"))return;break;case 
9:if(!b.b("\u05d1\u05f2\u05d8"))return;break;case 10:if(!b.b("\u05d1\u05f2\u05e1"))return;break;case 11:if(!b.b("\u05f0\u05f2\u05d6"))return;break;case 12:if(!b.b("\u05d8\u05e8\u05f2\u05d1"))return;break;case 13:if(!b.b("\u05dc\u05f2\u05d8"))return;break;case 14:if(!b.b("\u05e7\u05dc\u05f2\u05d1"))return; +break;case 15:if(!b.b("\u05e8\u05f2\u05d1"))return;break;case 16:if(!b.b("\u05e8\u05f2\u05e1"))return;break;case 17:if(!b.b("\u05e9\u05f0\u05f2\u05d2"))return;break;case 18:if(!b.b("\u05e9\u05de\u05f2\u05e1"))return;break;case 19:if(!b.b("\u05e9\u05e0\u05f2\u05d3"))return;break;case 20:if(!b.b("\u05d1\u05d9\u05e0\u05d3"))return;break;case 21:if(!b.b("\u05f0\u05d9\u05d8\u05e9"))return;break;case 22:if(!b.b("\u05d6\u05d9\u05e0\u05d2"))return;break;case 23:if(!b.b("\u05d8\u05e8\u05d9\u05e0\u05e7"))return; +break;case 24:if(!b.b("\u05e6\u05f0\u05d9\u05e0\u05d2"))return;break;case 25:if(!b.b("\u05e9\u05dc\u05d9\u05e0\u05d2"))return;break;case 26:if(!b.b("\u05d1\u05f2\u05d2"))return;break;case 27:if(!b.b("\u05d4\u05f2\u05d1"))return;break;case 28:if(!b.b("\u05e4\u05d0\u05e8\u05dc\u05d9\u05e8"))return;break;case 29:if(!b.b("\u05e9\u05d8\u05f2"))return;break;case 30:if(!b.b("\u05e9\u05f0\u05e2\u05e8"))return;break;case 31:if(!b.b("\u05d1\u05e8\u05e2\u05e0\u05d2"))return;break;case 32:if(!m())break a;if(!b.b("\u05d4"))return; +break;case 33:b:{a=b.a-b.cursor;c:{d:{var c=b.a-b.cursor;if(b.h("\u05d2"))break d;b.cursor=b.a-c;if(!b.h("\u05e9"))break c}a=b.a-b.cursor;if(w<=b.cursor+3){if(!b.b("\u05d9\u05e1"))return}else b.cursor=b.a-a;break b}b.cursor=b.a-a;if(!m())break a;if(!b.e())return}}b.cursor=b.a-d;d=b.a-b.cursor;a:if(b.d=b.cursor,a=b.g(t),0!=a)switch(b.c=b.cursor,a){case 1:if(!m())break a;if(!b.e())return;break;case 2:if(!m())break a;if(!b.l(k,1489,1520))break a;if(!b.e())return}b.cursor=b.a-d;d=b.a-b.cursor;a:if(b.d= +b.cursor,a=b.g(s),0!=a)switch(b.c=b.cursor,a){case 1:if(!m())break 
a;if(!b.e())return}b.cursor=b.a-d;d=b.a-b.cursor;for(;;){a=b.a-b.cursor;b:{c:for(;;){c=b.a-b.cursor;d:{b.d=b.cursor;e:{var e=b.a-b.cursor;if(b.h("GE"))break e;b.cursor=b.a-e;if(!b.h("TSU"))break d}b.c=b.cursor;if(!b.e())return;b.cursor=b.a-c;break c}b.cursor=b.a-c;if(b.cursor<=b.f)break b;b.cursor--}continue}b.cursor=b.a-a;break}b.cursor=b.a-d}var b=new C,l=[["\u05d5\u05d5",-1,1],["\u05d5\u05d9",-1,2],["\u05d9\u05d9",-1,3],["\u05da", +-1,4],["\u05dd",-1,5],["\u05df",-1,6],["\u05e3",-1,7],["\u05e5",-1,8]],n=[["\u05d0\u05d3\u05d5\u05e8\u05db",-1,1],["\u05d0\u05d4\u05d9\u05e0",-1,1],["\u05d0\u05d4\u05e2\u05e8",-1,1],["\u05d0\u05d4\u05f2\u05de",-1,1],["\u05d0\u05d5\u05de",-1,1],["\u05d0\u05d5\u05e0\u05d8\u05e2\u05e8",-1,1],["\u05d0\u05d9\u05d1\u05e2\u05e8",-1,1],["\u05d0\u05e0",-1,1],["\u05d0\u05e0\u05d8",7,1],["\u05d0\u05e0\u05d8\u05e7\u05e2\u05d2\u05e0",8,1],["\u05d0\u05e0\u05d9\u05d3\u05e2\u05e8",7,1],["\u05d0\u05e4",-1,1],["\u05d0\u05e4\u05d9\u05e8", +11,1],["\u05d0\u05e7\u05e2\u05d2\u05e0",-1,1],["\u05d0\u05e8\u05d0\u05e4",-1,1],["\u05d0\u05e8\u05d5\u05de",-1,1],["\u05d0\u05e8\u05d5\u05e0\u05d8\u05e2\u05e8",-1,1],["\u05d0\u05e8\u05d9\u05d1\u05e2\u05e8",-1,1],["\u05d0\u05e8\u05f1\u05e1",-1,1],["\u05d0\u05e8\u05f1\u05e4",-1,1],["\u05d0\u05e8\u05f2\u05e0",-1,1],["\u05d0\u05f0\u05e2\u05e7",-1,1],["\u05d0\u05f1\u05e1",-1,1],["\u05d0\u05f1\u05e4",-1,1],["\u05d0\u05f2\u05e0",-1,1],["\u05d1\u05d0",-1,1],["\u05d1\u05f2",-1,1],["\u05d3\u05d5\u05e8\u05db", +-1,1],["\u05d3\u05e2\u05e8",-1,1],["\u05de\u05d9\u05d8",-1,1],["\u05e0\u05d0\u05db",-1,1],["\u05e4\u05d0\u05e8",-1,1],["\u05e4\u05d0\u05e8\u05d1\u05f2",31,1],["\u05e4\u05d0\u05e8\u05f1\u05e1",31,1],["\u05e4\u05d5\u05e0\u05d0\u05e0\u05d3\u05e2\u05e8",-1,1],["\u05e6\u05d5",-1,1],["\u05e6\u05d5\u05d6\u05d0\u05de\u05e2\u05e0",35,1],["\u05e6\u05d5\u05e0\u05f1\u05e4",35,1],["\u05e6\u05d5\u05e8\u05d9\u05e7",35,1],["\u05e6\u05e2",-1,1]],e=[["\u05d3\u05d6\u05e9",-1,-1],["\u05e9\u05d8\u05e8",-1,-1],["\u05e9\u05d8\u05e9", 
+-1,-1],["\u05e9\u05e4\u05e8",-1,-1]],h=[["\u05e7\u05dc\u05d9\u05d1",-1,9],["\u05e8\u05d9\u05d1",-1,10],["\u05d8\u05e8\u05d9\u05d1",1,7],["\u05e9\u05e8\u05d9\u05d1",1,15],["\u05d4\u05f1\u05d1",-1,23],["\u05e9\u05f0\u05d9\u05d2",-1,12],["\u05d2\u05d0\u05e0\u05d2",-1,1],["\u05d6\u05d5\u05e0\u05d2",-1,18],["\u05e9\u05dc\u05d5\u05e0\u05d2",-1,21],["\u05e6\u05f0\u05d5\u05e0\u05d2",-1,20],["\u05d1\u05f1\u05d2",-1,22],["\u05d1\u05d5\u05e0\u05d3",-1,16],["\u05f0\u05d9\u05d6",-1,6],["\u05d1\u05d9\u05d8",-1, +4],["\u05dc\u05d9\u05d8",-1,8],["\u05de\u05d9\u05d8",-1,3],["\u05e9\u05e0\u05d9\u05d8",-1,14],["\u05e0\u05d5\u05de",-1,2],["\u05e9\u05d8\u05d0\u05e0",-1,25],["\u05d1\u05d9\u05e1",-1,5],["\u05e9\u05de\u05d9\u05e1",-1,13],["\u05e8\u05d9\u05e1",-1,11],["\u05d8\u05e8\u05d5\u05e0\u05e7",-1,19],["\u05e4\u05d0\u05e8\u05dc\u05f1\u05e8",-1,24],["\u05e9\u05f0\u05f1\u05e8",-1,26],["\u05f0\u05d5\u05d8\u05e9",-1,17]],u=[["\u05d5\u05e0\u05d2",-1,1],["\u05e1\u05d8\u05d5",-1,1],["\u05d8",-1,1],["\u05d1\u05e8\u05d0\u05db\u05d8", +2,31],["\u05e1\u05d8",2,1],["\u05d9\u05e1\u05d8",4,33],["\u05e2\u05d8",2,1],["\u05e9\u05d0\u05e4\u05d8",2,1],["\u05d4\u05f2\u05d8",2,1],["\u05e7\u05f2\u05d8",2,1],["\u05d9\u05e7\u05f2\u05d8",9,1],["\u05dc\u05e2\u05db",-1,1],["\u05e2\u05dc\u05e2\u05db",11,1],["\u05d9\u05d6\u05de",-1,1],["\u05d9\u05de",-1,1],["\u05e2\u05de",-1,1],["\u05e2\u05e0\u05e2\u05de",15,3],["\u05d8\u05e2\u05e0\u05e2\u05de",16,4],["\u05e0",-1,1],["\u05e7\u05dc\u05d9\u05d1\u05e0",18,14],["\u05e8\u05d9\u05d1\u05e0",18,15],["\u05d8\u05e8\u05d9\u05d1\u05e0", 
+20,12],["\u05e9\u05e8\u05d9\u05d1\u05e0",20,7],["\u05d4\u05f1\u05d1\u05e0",18,27],["\u05e9\u05f0\u05d9\u05d2\u05e0",18,17],["\u05d6\u05d5\u05e0\u05d2\u05e0",18,22],["\u05e9\u05dc\u05d5\u05e0\u05d2\u05e0",18,25],["\u05e6\u05f0\u05d5\u05e0\u05d2\u05e0",18,24],["\u05d1\u05f1\u05d2\u05e0",18,26],["\u05d1\u05d5\u05e0\u05d3\u05e0",18,20],["\u05f0\u05d9\u05d6\u05e0",18,11],["\u05d8\u05e0",18,4],["GE\u05d1\u05d9\u05d8\u05e0",31,9],["GE\u05dc\u05d9\u05d8\u05e0",31,13],["GE\u05de\u05d9\u05d8\u05e0",31,8],["\u05e9\u05e0\u05d9\u05d8\u05e0", +31,19],["\u05e1\u05d8\u05e0",31,1],["\u05d9\u05e1\u05d8\u05e0",36,1],["\u05e2\u05d8\u05e0",31,1],["GE\u05d1\u05d9\u05e1\u05e0",18,10],["\u05e9\u05de\u05d9\u05e1\u05e0",18,18],["GE\u05e8\u05d9\u05e1\u05e0",18,16],["\u05e2\u05e0",18,1],["\u05d2\u05d0\u05e0\u05d2\u05e2\u05e0",42,5],["\u05e2\u05dc\u05e2\u05e0",42,1],["\u05e0\u05d5\u05de\u05e2\u05e0",42,6],["\u05d9\u05d6\u05de\u05e2\u05e0",42,1],["\u05e9\u05d8\u05d0\u05e0\u05e2\u05e0",42,29],["\u05d8\u05e8\u05d5\u05e0\u05e7\u05e0",18,23],["\u05e4\u05d0\u05e8\u05dc\u05f1\u05e8\u05e0", +18,28],["\u05e9\u05f0\u05f1\u05e8\u05e0",18,30],["\u05f0\u05d5\u05d8\u05e9\u05e0",18,21],["\u05d2\u05f2\u05e0",18,5],["\u05e1",-1,1],["\u05d8\u05e1",53,4],["\u05e2\u05d8\u05e1",54,1],["\u05e0\u05e1",53,1],["\u05d8\u05e0\u05e1",56,4],["\u05e2\u05e0\u05e1",56,3],["\u05e2\u05e1",53,1],["\u05d9\u05e2\u05e1",59,2],["\u05e2\u05dc\u05e2\u05e1",59,1],["\u05e2\u05e8\u05e1",53,1],["\u05e2\u05e0\u05e2\u05e8\u05e1",62,1],["\u05e2",-1,1],["\u05d8\u05e2",64,4],["\u05e1\u05d8\u05e2",65,1],["\u05e2\u05d8\u05e2", 
+65,1],["\u05d9\u05e2",64,-1],["\u05e2\u05dc\u05e2",64,1],["\u05e2\u05e0\u05e2",64,3],["\u05d8\u05e2\u05e0\u05e2",70,4],["\u05e2\u05e8",-1,1],["\u05d8\u05e2\u05e8",72,4],["\u05e1\u05d8\u05e2\u05e8",73,1],["\u05e2\u05d8\u05e2\u05e8",73,1],["\u05e2\u05e0\u05e2\u05e8",72,3],["\u05d8\u05e2\u05e0\u05e2\u05e8",76,4],["\u05d5\u05ea",-1,32]],t=[["\u05d5\u05e0\u05d2",-1,1],["\u05e9\u05d0\u05e4\u05d8",-1,1],["\u05d4\u05f2\u05d8",-1,1],["\u05e7\u05f2\u05d8",-1,1],["\u05d9\u05e7\u05f2\u05d8",3,1],["\u05dc",-1, +2]],s=[["\u05d9\u05d2",-1,1],["\u05d9\u05e7",-1,1],["\u05d3\u05d9\u05e7",1,1],["\u05e0\u05d3\u05d9\u05e7",2,1],["\u05e2\u05e0\u05d3\u05d9\u05e7",3,1],["\u05d1\u05dc\u05d9\u05e7",1,-1],["\u05d2\u05dc\u05d9\u05e7",1,-1],["\u05e0\u05d9\u05e7",1,1],["\u05d9\u05e9",-1,1]],r=[255,155,6],p=[33,2,4,0,6],k=[239,254,253,131],v=0,w=0;this.m=function(){g();var e=b.cursor;d();b.cursor=e;b.f=b.cursor;b.cursor=b.a;a();b.cursor=b.f;return f};this.stemWord=function(a){b.p(a);this.m();return b.j}}; diff --git a/js/swedish-stemmer.js b/js/swedish-stemmer.js new file mode 100644 index 0000000..1e636f7 --- /dev/null +++ b/js/swedish-stemmer.js @@ -0,0 +1,267 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var SwedishStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["a", -1, 1], + ["arna", 0, 1], + ["erna", 0, 1], + ["heterna", 2, 1], + ["orna", 0, 1], + ["ad", -1, 1], + ["e", -1, 1], + ["ade", 6, 1], + ["ande", 6, 1], + ["arne", 6, 1], + ["are", 6, 1], + ["aste", 6, 1], + ["en", -1, 1], + ["anden", 12, 1], + ["aren", 12, 1], + ["heten", 12, 1], + ["ern", -1, 1], + ["ar", -1, 1], + ["er", -1, 1], + ["heter", 18, 1], + ["or", -1, 1], + ["s", -1, 2], + ["as", 21, 1], + ["arnas", 22, 1], + ["ernas", 22, 1], + ["ornas", 22, 1], + ["es", 21, 1], + ["ades", 26, 1], + ["andes", 26, 1], + ["ens", 21, 1], + ["arens", 29, 1], + ["hetens", 29, 1], + ["erns", 21, 1], + ["at", -1, 1], + ["andet", -1, 1], + ["het", -1, 1], + ["ast", 
-1, 1] + ]; + + /** @const */ var a_1 = [ + ["dd", -1, -1], + ["gd", -1, -1], + ["nn", -1, -1], + ["dt", -1, -1], + ["gt", -1, -1], + ["kt", -1, -1], + ["tt", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["ig", -1, 1], + ["lig", 0, 1], + ["els", -1, 1], + ["fullt", -1, 3], + ["l\u00F6st", -1, 2] + ]; + + /** @const */ var /** Array */ g_v = [17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32]; + + /** @const */ var /** Array */ g_s_ending = [119, 127, 149]; + + var /** number */ I_x = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + var /** number */ v_1 = base.cursor; + { + var /** number */ c1 = base.cursor + 3; + if (c1 > base.limit) + { + return false; + } + base.cursor = c1; + } + I_x = base.cursor; + base.cursor = v_1; + golab0: while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + if (!(base.in_grouping(g_v, 97, 246))) + { + break lab1; + } + base.cursor = v_2; + break golab0; + } + base.cursor = v_2; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + golab2: while(true) + { + lab3: { + if (!(base.out_grouping(g_v, 97, 246))) + { + break lab3; + } + break golab2; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + I_p1 = base.cursor; + lab4: { + if (I_p1 >= I_x) + { + break lab4; + } + I_p1 = I_x; + } + return true; + }; + + /** @return {boolean} */ + function r_main_suffix() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_0); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + base.limit_backward = v_2; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!(base.in_grouping_b(g_s_ending, 98, 121))) + { + return false; + } + if 
(!base.slice_del()) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_consonant_pair() { + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + var /** number */ v_3 = base.limit - base.cursor; + if (base.find_among_b(a_1) == 0) + { + base.limit_backward = v_2; + return false; + } + base.cursor = base.limit - v_3; + base.ket = base.cursor; + if (base.cursor <= base.limit_backward) + { + base.limit_backward = v_2; + return false; + } + base.cursor--; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.limit_backward = v_2; + return true; + }; + + /** @return {boolean} */ + function r_other_suffix() { + var /** number */ among_var; + if (base.cursor < I_p1) + { + return false; + } + var /** number */ v_2 = base.limit_backward; + base.limit_backward = I_p1; + base.ket = base.cursor; + among_var = base.find_among_b(a_2); + if (among_var == 0) + { + base.limit_backward = v_2; + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("l\u00F6s")) + { + return false; + } + break; + case 3: + if (!base.slice_from("full")) + { + return false; + } + break; + } + base.limit_backward = v_2; + return true; + }; + + this.stem = /** @return {boolean} */ function() { + var /** number */ v_1 = base.cursor; + r_mark_regions(); + base.cursor = v_1; + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_2 = base.limit - base.cursor; + r_main_suffix(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_consonant_pair(); + base.cursor = base.limit - v_3; + var /** number */ v_4 = base.limit - base.cursor; + r_other_suffix(); + base.cursor = base.limit - v_4; + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = 
function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['SwedishStemmer'] = SwedishStemmer; diff --git a/js/tamil-stemmer.js b/js/tamil-stemmer.js new file mode 100644 index 0000000..2cc4b12 --- /dev/null +++ b/js/tamil-stemmer.js @@ -0,0 +1,1190 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var TamilStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u0BB5\u0BC1", -1, 3], + ["\u0BB5\u0BC2", -1, 4], + ["\u0BB5\u0BCA", -1, 2], + ["\u0BB5\u0BCB", -1, 1] + ]; + + /** @const */ var a_1 = [ + ["\u0B95", -1, -1], + ["\u0B99", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9E", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BA8", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BAE", -1, -1], + ["\u0BAF", -1, -1], + ["\u0BB5", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC8", -1, -1] + ]; + + /** @const */ var a_3 = [ + ["\u0BBE", -1, -1], + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC1", -1, -1], + ["\u0BC2", -1, -1], + ["\u0BC6", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BC8", -1, -1] + ]; + + /** @const */ var a_4 = [ + ["", -1, 2], + ["\u0BC8", 0, 1], + ["\u0BCD", 0, 1] + ]; + + /** @const */ var a_5 = [ + ["\u0BA8\u0BCD\u0BA4", -1, 1], + ["\u0BAF", -1, 1], + ["\u0BB5", -1, 1], + ["\u0BA9\u0BC1", -1, 8], + ["\u0BC1\u0B95\u0BCD", -1, 7], + ["\u0BC1\u0B95\u0BCD\u0B95\u0BCD", -1, 7], + ["\u0B9F\u0BCD\u0B95\u0BCD", -1, 3], + ["\u0BB1\u0BCD\u0B95\u0BCD", -1, 4], + ["\u0B99\u0BCD", -1, 9], + ["\u0B9F\u0BCD\u0B9F\u0BCD", -1, 5], + ["\u0BA4\u0BCD\u0BA4\u0BCD", -1, 6], + ["\u0BA8\u0BCD\u0BA4\u0BCD", -1, 1], + ["\u0BA8\u0BCD", -1, 1], + ["\u0B9F\u0BCD\u0BAA\u0BCD", -1, 3], + ["\u0BAF\u0BCD", -1, 2], + ["\u0BA9\u0BCD\u0BB1\u0BCD", -1, 4], + ["\u0BB5\u0BCD", -1, 1] + ]; + + /** @const */ var a_6 = [ + ["\u0B95", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9F", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BB1", -1, 
-1] + ]; + + /** @const */ var a_7 = [ + ["\u0B95", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9F", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BB1", -1, -1] + ]; + + /** @const */ var a_8 = [ + ["\u0B9E", -1, -1], + ["\u0BA3", -1, -1], + ["\u0BA8", -1, -1], + ["\u0BA9", -1, -1], + ["\u0BAE", -1, -1], + ["\u0BAF", -1, -1], + ["\u0BB0", -1, -1], + ["\u0BB2", -1, -1], + ["\u0BB3", -1, -1], + ["\u0BB4", -1, -1], + ["\u0BB5", -1, -1] + ]; + + /** @const */ var a_9 = [ + ["\u0BBE", -1, -1], + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC1", -1, -1], + ["\u0BC2", -1, -1], + ["\u0BC6", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BC8", -1, -1], + ["\u0BCD", -1, -1] + ]; + + /** @const */ var a_10 = [ + ["\u0B85", -1, -1], + ["\u0B87", -1, -1], + ["\u0B89", -1, -1] + ]; + + /** @const */ var a_11 = [ + ["\u0B95", -1, -1], + ["\u0B99", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9E", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BA8", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BAE", -1, -1], + ["\u0BAF", -1, -1], + ["\u0BB5", -1, -1] + ]; + + /** @const */ var a_12 = [ + ["\u0B95", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9F", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BB1", -1, -1] + ]; + + /** @const */ var a_13 = [ + ["\u0B95\u0BB3\u0BCD", -1, 4], + ["\u0BC1\u0B99\u0BCD\u0B95\u0BB3\u0BCD", 0, 1], + ["\u0B9F\u0BCD\u0B95\u0BB3\u0BCD", 0, 3], + ["\u0BB1\u0BCD\u0B95\u0BB3\u0BCD", 0, 2] + ]; + + /** @const */ var a_14 = [ + ["\u0BBE", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BCB", -1, -1] + ]; + + /** @const */ var a_15 = [ + ["\u0BAA\u0BBF", -1, -1], + ["\u0BB5\u0BBF", -1, -1] + ]; + + /** @const */ var a_16 = [ + ["\u0BBE", -1, -1], + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC1", -1, -1], + ["\u0BC2", -1, -1], + ["\u0BC6", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BC8", -1, -1] + ]; + + /** @const */ var a_17 = [ + ["\u0BAA\u0B9F\u0BCD\u0B9F", -1, 3], + ["\u0BAA\u0B9F\u0BCD\u0B9F\u0BA3", -1, 3], + ["\u0BA4\u0BBE\u0BA9", -1, 3], + 
["\u0BAA\u0B9F\u0BBF\u0BA4\u0BBE\u0BA9", 2, 3], + ["\u0BC6\u0BA9", -1, 1], + ["\u0BBE\u0B95\u0BBF\u0BAF", -1, 1], + ["\u0B95\u0BC1\u0BB0\u0BBF\u0BAF", -1, 3], + ["\u0BC1\u0B9F\u0BC8\u0BAF", -1, 1], + ["\u0BB2\u0BCD\u0BB2", -1, 2], + ["\u0BC1\u0BB3\u0BCD\u0BB3", -1, 1], + ["\u0BBE\u0B95\u0BBF", -1, 1], + ["\u0BAA\u0B9F\u0BBF", -1, 3], + ["\u0BBF\u0BA9\u0BCD\u0BB1\u0BBF", -1, 1], + ["\u0BAA\u0BB1\u0BCD\u0BB1\u0BBF", -1, 3], + ["\u0BAA\u0B9F\u0BC1", -1, 3], + ["\u0BB5\u0BBF\u0B9F\u0BC1", -1, 3], + ["\u0BAA\u0B9F\u0BCD\u0B9F\u0BC1", -1, 3], + ["\u0BB5\u0BBF\u0B9F\u0BCD\u0B9F\u0BC1", -1, 3], + ["\u0BAA\u0B9F\u0BCD\u0B9F\u0BA4\u0BC1", -1, 3], + ["\u0BC6\u0BA9\u0BCD\u0BB1\u0BC1", -1, 1], + ["\u0BC1\u0B9F\u0BC8", -1, 1], + ["\u0BBF\u0BB2\u0BCD\u0BB2\u0BC8", -1, 1], + ["\u0BC1\u0B9F\u0BA9\u0BCD", -1, 1], + ["\u0BBF\u0B9F\u0BAE\u0BCD", -1, 1], + ["\u0BC6\u0BB2\u0BCD\u0BB2\u0BBE\u0BAE\u0BCD", -1, 3], + ["\u0BC6\u0BA9\u0BC1\u0BAE\u0BCD", -1, 1] + ]; + + /** @const */ var a_18 = [ + ["\u0BBE", -1, -1], + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC1", -1, -1], + ["\u0BC2", -1, -1], + ["\u0BC6", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BC8", -1, -1] + ]; + + /** @const */ var a_19 = [ + ["\u0BBE", -1, -1], + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC1", -1, -1], + ["\u0BC2", -1, -1], + ["\u0BC6", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BC8", -1, -1] + ]; + + /** @const */ var a_20 = [ + ["\u0BB5\u0BBF\u0B9F", -1, 2], + ["\u0BC0", -1, 7], + ["\u0BCA\u0B9F\u0BC1", -1, 2], + ["\u0BCB\u0B9F\u0BC1", -1, 2], + ["\u0BA4\u0BC1", -1, 6], + ["\u0BBF\u0BB0\u0BC1\u0BA8\u0BCD\u0BA4\u0BC1", 4, 2], + ["\u0BBF\u0BA9\u0BCD\u0BB1\u0BC1", -1, 2], + ["\u0BC1\u0B9F\u0BC8", -1, 2], + ["\u0BA9\u0BC8", -1, 1], + ["\u0B95\u0BA3\u0BCD", -1, 1], + ["\u0BBF\u0BA9\u0BCD", -1, 3], + ["\u0BAE\u0BC1\u0BA9\u0BCD", -1, 1], + ["\u0BBF\u0B9F\u0BAE\u0BCD", -1, 4], + ["\u0BBF\u0BB1\u0BCD", -1, 2], + ["\u0BAE\u0BC7\u0BB1\u0BCD", -1, 1], + ["\u0BB2\u0BCD", -1, 5], + ["\u0BBE\u0BAE\u0BB2\u0BCD", 15, 2], + 
["\u0BBE\u0BB2\u0BCD", 15, 2], + ["\u0BBF\u0BB2\u0BCD", 15, 2], + ["\u0BAE\u0BC7\u0BB2\u0BCD", 15, 1], + ["\u0BC1\u0BB3\u0BCD", -1, 2], + ["\u0B95\u0BC0\u0BB4\u0BCD", -1, 1] + ]; + + /** @const */ var a_21 = [ + ["\u0B95", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9F", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BB1", -1, -1] + ]; + + /** @const */ var a_22 = [ + ["\u0B95", -1, -1], + ["\u0B9A", -1, -1], + ["\u0B9F", -1, -1], + ["\u0BA4", -1, -1], + ["\u0BAA", -1, -1], + ["\u0BB1", -1, -1] + ]; + + /** @const */ var a_23 = [ + ["\u0B85", -1, -1], + ["\u0B86", -1, -1], + ["\u0B87", -1, -1], + ["\u0B88", -1, -1], + ["\u0B89", -1, -1], + ["\u0B8A", -1, -1], + ["\u0B8E", -1, -1], + ["\u0B8F", -1, -1], + ["\u0B90", -1, -1], + ["\u0B92", -1, -1], + ["\u0B93", -1, -1], + ["\u0B94", -1, -1] + ]; + + /** @const */ var a_24 = [ + ["\u0BBE", -1, -1], + ["\u0BBF", -1, -1], + ["\u0BC0", -1, -1], + ["\u0BC1", -1, -1], + ["\u0BC2", -1, -1], + ["\u0BC6", -1, -1], + ["\u0BC7", -1, -1], + ["\u0BC8", -1, -1] + ]; + + /** @const */ var a_25 = [ + ["\u0B95", -1, 1], + ["\u0BA4", -1, 1], + ["\u0BA9", -1, 1], + ["\u0BAA", -1, 1], + ["\u0BAF", -1, 1], + ["\u0BBE", -1, 5], + ["\u0B95\u0BC1", -1, 6], + ["\u0BAA\u0B9F\u0BC1", -1, 1], + ["\u0BA4\u0BC1", -1, 3], + ["\u0BBF\u0BB1\u0BCD\u0BB1\u0BC1", -1, 1], + ["\u0BA9\u0BC8", -1, 1], + ["\u0BB5\u0BC8", -1, 1], + ["\u0BA9\u0BA9\u0BCD", -1, 1], + ["\u0BAA\u0BA9\u0BCD", -1, 1], + ["\u0BB5\u0BA9\u0BCD", -1, 2], + ["\u0BBE\u0BA9\u0BCD", -1, 4], + ["\u0BA9\u0BBE\u0BA9\u0BCD", 15, 1], + ["\u0BAE\u0BBF\u0BA9\u0BCD", -1, 1], + ["\u0BA9\u0BC6\u0BA9\u0BCD", -1, 1], + ["\u0BC7\u0BA9\u0BCD", -1, 5], + ["\u0BA9\u0BAE\u0BCD", -1, 1], + ["\u0BAA\u0BAE\u0BCD", -1, 1], + ["\u0BBE\u0BAE\u0BCD", -1, 5], + ["\u0B95\u0BC1\u0BAE\u0BCD", -1, 1], + ["\u0B9F\u0BC1\u0BAE\u0BCD", -1, 5], + ["\u0BA4\u0BC1\u0BAE\u0BCD", -1, 1], + ["\u0BB1\u0BC1\u0BAE\u0BCD", -1, 1], + ["\u0BC6\u0BAE\u0BCD", -1, 5], + ["\u0BC7\u0BAE\u0BCD", -1, 5], + ["\u0BCB\u0BAE\u0BCD", -1, 
5], + ["\u0BBE\u0BAF\u0BCD", -1, 5], + ["\u0BA9\u0BB0\u0BCD", -1, 1], + ["\u0BAA\u0BB0\u0BCD", -1, 1], + ["\u0BC0\u0BAF\u0BB0\u0BCD", -1, 5], + ["\u0BB5\u0BB0\u0BCD", -1, 1], + ["\u0BBE\u0BB0\u0BCD", -1, 5], + ["\u0BA9\u0BBE\u0BB0\u0BCD", 35, 1], + ["\u0BAE\u0BBE\u0BB0\u0BCD", 35, 1], + ["\u0B95\u0BCA\u0BA3\u0BCD\u0B9F\u0BBF\u0BB0\u0BCD", -1, 1], + ["\u0BA9\u0BBF\u0BB0\u0BCD", -1, 5], + ["\u0BC0\u0BB0\u0BCD", -1, 5], + ["\u0BA9\u0BB3\u0BCD", -1, 1], + ["\u0BAA\u0BB3\u0BCD", -1, 1], + ["\u0BB5\u0BB3\u0BCD", -1, 1], + ["\u0BBE\u0BB3\u0BCD", -1, 5], + ["\u0BA9\u0BBE\u0BB3\u0BCD", 44, 1] + ]; + + /** @const */ var a_26 = [ + ["\u0B95\u0BBF\u0BB1", -1, -1], + ["\u0B95\u0BBF\u0BA9\u0BCD\u0BB1", -1, -1], + ["\u0BBE\u0BA8\u0BBF\u0BA9\u0BCD\u0BB1", -1, -1], + ["\u0B95\u0BBF\u0BB1\u0BCD", -1, -1], + ["\u0B95\u0BBF\u0BA9\u0BCD\u0BB1\u0BCD", -1, -1], + ["\u0BBE\u0BA8\u0BBF\u0BA9\u0BCD\u0BB1\u0BCD", -1, -1] + ]; + + var /** boolean */ B_found_vetrumai_urupu = false; + var /** boolean */ B_found_a_match = false; + + + /** @return {boolean} */ + function r_has_min_length() { + return base.current.length > 4; + }; + + /** @return {boolean} */ + function r_fix_va_start() { + var /** number */ among_var; + base.bra = base.cursor; + among_var = base.find_among(a_0); + if (among_var == 0) + { + return false; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u0B93")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u0B92")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u0B89")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u0B8A")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_fix_endings() { + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + if (!r_fix_ending()) + { + break lab1; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + return 
true; + }; + + /** @return {boolean} */ + function r_remove_question_prefixes() { + base.bra = base.cursor; + if (!(base.eq_s("\u0B8E"))) + { + return false; + } + if (base.find_among(a_1) == 0) + { + return false; + } + if (!(base.eq_s("\u0BCD"))) + { + return false; + } + base.ket = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.cursor; + r_fix_va_start(); + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_fix_ending() { + var /** number */ among_var; + if (base.current.length <= 3) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + break lab1; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + var /** number */ v_2 = base.limit - base.cursor; + if (base.find_among_b(a_2) == 0) + { + break lab1; + } + base.cursor = base.limit - v_2; + if (!base.slice_del()) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u0BB3\u0BCD")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u0BB2\u0BCD")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u0B9F\u0BC1")) + { + return false; + } + break; + case 6: + if (!B_found_vetrumai_urupu) + { + break lab1; + } + { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + if (!(base.eq_s_b("\u0BC8"))) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - v_3; + } + if (!base.slice_from("\u0BAE\u0BCD")) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 8: + { + var /** number */ v_4 = base.limit - base.cursor; + lab3: { + if (base.find_among_b(a_3) == 0) + { + break lab3; + } + break lab1; + } + base.cursor = base.limit - v_4; + } + if 
(!base.slice_del()) + { + return false; + } + break; + case 9: + among_var = base.find_among_b(a_4); + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u0BAE\u0BCD")) + { + return false; + } + break; + } + break; + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + if (!(base.eq_s_b("\u0BCD"))) + { + return false; + } + lab4: { + var /** number */ v_5 = base.limit - base.cursor; + lab5: { + if (base.find_among_b(a_6) == 0) + { + break lab5; + } + var /** number */ v_6 = base.limit - base.cursor; + lab6: { + if (!(base.eq_s_b("\u0BCD"))) + { + base.cursor = base.limit - v_6; + break lab6; + } + if (base.find_among_b(a_7) == 0) + { + base.cursor = base.limit - v_6; + break lab6; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_5; + lab7: { + if (base.find_among_b(a_8) == 0) + { + break lab7; + } + base.bra = base.cursor; + if (!(base.eq_s_b("\u0BCD"))) + { + break lab7; + } + if (!base.slice_del()) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_5; + var /** number */ v_7 = base.limit - base.cursor; + if (base.find_among_b(a_9) == 0) + { + return false; + } + base.cursor = base.limit - v_7; + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + } + } + base.cursor = base.limit_backward; + return true; + }; + + /** @return {boolean} */ + function r_remove_pronoun_prefixes() { + base.bra = base.cursor; + if (base.find_among(a_10) == 0) + { + return false; + } + if (base.find_among(a_11) == 0) + { + return false; + } + if (!(base.eq_s("\u0BCD"))) + { + return false; + } + base.ket = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_1 = base.cursor; + r_fix_va_start(); + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_remove_plural_suffix() { + var /** number */ among_var; 
+ base.limit_backward = base.cursor; base.cursor = base.limit; + base.ket = base.cursor; + among_var = base.find_among_b(a_13); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (base.find_among_b(a_12) == 0) + { + break lab1; + } + if (!base.slice_from("\u0BC1\u0B99\u0BCD")) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!base.slice_from("\u0BCD")) + { + return false; + } + } + break; + case 2: + if (!base.slice_from("\u0BB2\u0BCD")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u0BB3\u0BCD")) + { + return false; + } + break; + case 4: + if (!base.slice_del()) + { + return false; + } + break; + } + base.cursor = base.limit_backward; + return true; + }; + + /** @return {boolean} */ + function r_remove_question_suffixes() { + if (!r_has_min_length()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + if (base.find_among_b(a_14) == 0) + { + break lab0; + } + base.bra = base.cursor; + if (!base.slice_from("\u0BCD")) + { + return false; + } + } + base.cursor = base.limit - v_1; + base.cursor = base.limit_backward; + r_fix_endings(); + return true; + }; + + /** @return {boolean} */ + function r_remove_command_suffixes() { + if (!r_has_min_length()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + base.ket = base.cursor; + if (base.find_among_b(a_15) == 0) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.cursor = base.limit_backward; + return true; + }; + + /** @return {boolean} */ + function r_remove_um() { + if (!r_has_min_length()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + base.ket = base.cursor; + if 
(!(base.eq_s_b("\u0BC1\u0BAE\u0BCD"))) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_from("\u0BCD")) + { + return false; + } + base.cursor = base.limit_backward; + var /** number */ v_1 = base.cursor; + r_fix_ending(); + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_remove_common_word_endings() { + var /** number */ among_var; + if (!r_has_min_length()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + base.ket = base.cursor; + among_var = base.find_among_b(a_17); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 2: + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (base.find_among_b(a_16) == 0) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_1; + } + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 3: + if (!base.slice_del()) + { + return false; + } + break; + } + base.cursor = base.limit_backward; + r_fix_endings(); + return true; + }; + + /** @return {boolean} */ + function r_remove_vetrumai_urupukal() { + var /** number */ among_var; + B_found_vetrumai_urupu = false; + if (!r_has_min_length()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + base.ket = base.cursor; + among_var = base.find_among_b(a_20); + if (among_var == 0) + { + break lab1; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 3: + { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + if (!(base.eq_s_b("\u0BAE"))) + { + break lab2; + } + break lab1; + } + base.cursor = base.limit - 
v_3; + } + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 4: + if (base.current.length < 7) + { + break lab1; + } + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 5: + { + var /** number */ v_4 = base.limit - base.cursor; + lab3: { + if (base.find_among_b(a_18) == 0) + { + break lab3; + } + break lab1; + } + base.cursor = base.limit - v_4; + } + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 6: + { + var /** number */ v_5 = base.limit - base.cursor; + lab4: { + if (base.find_among_b(a_19) == 0) + { + break lab4; + } + break lab1; + } + base.cursor = base.limit - v_5; + } + if (!base.slice_del()) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u0BBF")) + { + return false; + } + break; + } + base.cursor = base.limit - v_2; + break lab0; + } + base.cursor = base.limit - v_1; + var /** number */ v_6 = base.limit - base.cursor; + base.ket = base.cursor; + if (!(base.eq_s_b("\u0BC8"))) + { + return false; + } + lab5: { + var /** number */ v_7 = base.limit - base.cursor; + lab6: { + { + var /** number */ v_8 = base.limit - base.cursor; + lab7: { + if (base.find_among_b(a_21) == 0) + { + break lab7; + } + break lab6; + } + base.cursor = base.limit - v_8; + } + break lab5; + } + base.cursor = base.limit - v_7; + var /** number */ v_9 = base.limit - base.cursor; + if (base.find_among_b(a_22) == 0) + { + return false; + } + if (!(base.eq_s_b("\u0BCD"))) + { + return false; + } + base.cursor = base.limit - v_9; + } + base.bra = base.cursor; + if (!base.slice_from("\u0BCD")) + { + return false; + } + base.cursor = base.limit - v_6; + } + B_found_vetrumai_urupu = true; + var /** number */ v_10 = base.limit - base.cursor; + lab8: { + base.ket = base.cursor; + if (!(base.eq_s_b("\u0BBF\u0BA9\u0BCD"))) + { + break lab8; + } + base.bra = base.cursor; + if (!base.slice_from("\u0BCD")) + { + return false; + } + } + base.cursor = base.limit - v_10; + base.cursor = base.limit_backward; + 
r_fix_endings(); + return true; + }; + + /** @return {boolean} */ + function r_remove_tense_suffixes() { + B_found_a_match = true; + while(true) + { + var /** number */ v_1 = base.cursor; + lab0: { + if (!B_found_a_match) + { + break lab0; + } + var /** number */ v_2 = base.cursor; + r_remove_tense_suffix(); + base.cursor = v_2; + continue; + } + base.cursor = v_1; + break; + } + return true; + }; + + /** @return {boolean} */ + function r_remove_tense_suffix() { + var /** number */ among_var; + B_found_a_match = false; + if (!r_has_min_length()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + var /** number */ v_2 = base.limit - base.cursor; + base.ket = base.cursor; + among_var = base.find_among_b(a_25); + if (among_var == 0) + { + break lab0; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_del()) + { + return false; + } + break; + case 2: + { + var /** number */ v_3 = base.limit - base.cursor; + lab1: { + if (base.find_among_b(a_23) == 0) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_3; + } + if (!base.slice_del()) + { + return false; + } + break; + case 3: + { + var /** number */ v_4 = base.limit - base.cursor; + lab2: { + if (base.find_among_b(a_24) == 0) + { + break lab2; + } + break lab0; + } + base.cursor = base.limit - v_4; + } + if (!base.slice_del()) + { + return false; + } + break; + case 4: + { + var /** number */ v_5 = base.limit - base.cursor; + lab3: { + if (!(base.eq_s_b("\u0B9A"))) + { + break lab3; + } + break lab0; + } + base.cursor = base.limit - v_5; + } + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u0BCD")) + { + return false; + } + break; + case 6: + var /** number */ v_6 = base.limit - base.cursor; + if (!(base.eq_s_b("\u0BCD"))) + { + break lab0; + } + base.cursor = base.limit - v_6; + if (!base.slice_del()) + { + return false; 
+ } + break; + } + B_found_a_match = true; + base.cursor = base.limit - v_2; + } + base.cursor = base.limit - v_1; + var /** number */ v_7 = base.limit - base.cursor; + lab4: { + base.ket = base.cursor; + if (base.find_among_b(a_26) == 0) + { + break lab4; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + B_found_a_match = true; + } + base.cursor = base.limit - v_7; + base.cursor = base.limit_backward; + r_fix_endings(); + return true; + }; + + this.stem = /** @return {boolean} */ function() { + B_found_vetrumai_urupu = false; + var /** number */ v_1 = base.cursor; + r_fix_ending(); + base.cursor = v_1; + if (!r_has_min_length()) + { + return false; + } + var /** number */ v_2 = base.cursor; + r_remove_question_prefixes(); + base.cursor = v_2; + var /** number */ v_3 = base.cursor; + r_remove_pronoun_prefixes(); + base.cursor = v_3; + var /** number */ v_4 = base.cursor; + r_remove_question_suffixes(); + base.cursor = v_4; + var /** number */ v_5 = base.cursor; + r_remove_um(); + base.cursor = v_5; + var /** number */ v_6 = base.cursor; + r_remove_common_word_endings(); + base.cursor = v_6; + var /** number */ v_7 = base.cursor; + r_remove_vetrumai_urupukal(); + base.cursor = v_7; + var /** number */ v_8 = base.cursor; + r_remove_plural_suffix(); + base.cursor = v_8; + var /** number */ v_9 = base.cursor; + r_remove_command_suffixes(); + base.cursor = v_9; + var /** number */ v_10 = base.cursor; + r_remove_tense_suffixes(); + base.cursor = v_10; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['TamilStemmer'] = TamilStemmer; diff --git a/js/turkish-stemmer.js b/js/turkish-stemmer.js new file mode 100644 index 0000000..84f6255 --- /dev/null +++ b/js/turkish-stemmer.js @@ -0,0 +1,2361 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var TurkishStemmer = function() { + var 
base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["m", -1, -1], + ["n", -1, -1], + ["miz", -1, -1], + ["niz", -1, -1], + ["muz", -1, -1], + ["nuz", -1, -1], + ["m\u00FCz", -1, -1], + ["n\u00FCz", -1, -1], + ["m\u0131z", -1, -1], + ["n\u0131z", -1, -1] + ]; + + /** @const */ var a_1 = [ + ["leri", -1, -1], + ["lar\u0131", -1, -1] + ]; + + /** @const */ var a_2 = [ + ["ni", -1, -1], + ["nu", -1, -1], + ["n\u00FC", -1, -1], + ["n\u0131", -1, -1] + ]; + + /** @const */ var a_3 = [ + ["in", -1, -1], + ["un", -1, -1], + ["\u00FCn", -1, -1], + ["\u0131n", -1, -1] + ]; + + /** @const */ var a_4 = [ + ["a", -1, -1], + ["e", -1, -1] + ]; + + /** @const */ var a_5 = [ + ["na", -1, -1], + ["ne", -1, -1] + ]; + + /** @const */ var a_6 = [ + ["da", -1, -1], + ["ta", -1, -1], + ["de", -1, -1], + ["te", -1, -1] + ]; + + /** @const */ var a_7 = [ + ["nda", -1, -1], + ["nde", -1, -1] + ]; + + /** @const */ var a_8 = [ + ["dan", -1, -1], + ["tan", -1, -1], + ["den", -1, -1], + ["ten", -1, -1] + ]; + + /** @const */ var a_9 = [ + ["ndan", -1, -1], + ["nden", -1, -1] + ]; + + /** @const */ var a_10 = [ + ["la", -1, -1], + ["le", -1, -1] + ]; + + /** @const */ var a_11 = [ + ["ca", -1, -1], + ["ce", -1, -1] + ]; + + /** @const */ var a_12 = [ + ["im", -1, -1], + ["um", -1, -1], + ["\u00FCm", -1, -1], + ["\u0131m", -1, -1] + ]; + + /** @const */ var a_13 = [ + ["sin", -1, -1], + ["sun", -1, -1], + ["s\u00FCn", -1, -1], + ["s\u0131n", -1, -1] + ]; + + /** @const */ var a_14 = [ + ["iz", -1, -1], + ["uz", -1, -1], + ["\u00FCz", -1, -1], + ["\u0131z", -1, -1] + ]; + + /** @const */ var a_15 = [ + ["siniz", -1, -1], + ["sunuz", -1, -1], + ["s\u00FCn\u00FCz", -1, -1], + ["s\u0131n\u0131z", -1, -1] + ]; + + /** @const */ var a_16 = [ + ["lar", -1, -1], + ["ler", -1, -1] + ]; + + /** @const */ var a_17 = [ + ["niz", -1, -1], + ["nuz", -1, -1], + ["n\u00FCz", -1, -1], + ["n\u0131z", -1, -1] + ]; + + /** @const */ var a_18 = [ + ["dir", -1, -1], + ["tir", -1, -1], + ["dur", -1, -1], + 
["tur", -1, -1], + ["d\u00FCr", -1, -1], + ["t\u00FCr", -1, -1], + ["d\u0131r", -1, -1], + ["t\u0131r", -1, -1] + ]; + + /** @const */ var a_19 = [ + ["cas\u0131na", -1, -1], + ["cesine", -1, -1] + ]; + + /** @const */ var a_20 = [ + ["di", -1, -1], + ["ti", -1, -1], + ["dik", -1, -1], + ["tik", -1, -1], + ["duk", -1, -1], + ["tuk", -1, -1], + ["d\u00FCk", -1, -1], + ["t\u00FCk", -1, -1], + ["d\u0131k", -1, -1], + ["t\u0131k", -1, -1], + ["dim", -1, -1], + ["tim", -1, -1], + ["dum", -1, -1], + ["tum", -1, -1], + ["d\u00FCm", -1, -1], + ["t\u00FCm", -1, -1], + ["d\u0131m", -1, -1], + ["t\u0131m", -1, -1], + ["din", -1, -1], + ["tin", -1, -1], + ["dun", -1, -1], + ["tun", -1, -1], + ["d\u00FCn", -1, -1], + ["t\u00FCn", -1, -1], + ["d\u0131n", -1, -1], + ["t\u0131n", -1, -1], + ["du", -1, -1], + ["tu", -1, -1], + ["d\u00FC", -1, -1], + ["t\u00FC", -1, -1], + ["d\u0131", -1, -1], + ["t\u0131", -1, -1] + ]; + + /** @const */ var a_21 = [ + ["sa", -1, -1], + ["se", -1, -1], + ["sak", -1, -1], + ["sek", -1, -1], + ["sam", -1, -1], + ["sem", -1, -1], + ["san", -1, -1], + ["sen", -1, -1] + ]; + + /** @const */ var a_22 = [ + ["mi\u015F", -1, -1], + ["mu\u015F", -1, -1], + ["m\u00FC\u015F", -1, -1], + ["m\u0131\u015F", -1, -1] + ]; + + /** @const */ var a_23 = [ + ["b", -1, 1], + ["c", -1, 2], + ["d", -1, 3], + ["\u011F", -1, 4] + ]; + + /** @const */ var /** Array */ g_vowel = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 8, 0, 0, 0, 0, 0, 0, 1]; + + /** @const */ var /** Array */ g_U = [1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 1]; + + /** @const */ var /** Array */ g_vowel1 = [1, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]; + + /** @const */ var /** Array */ g_vowel2 = [17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130]; + + /** @const */ var /** Array */ g_vowel3 = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]; + + /** @const */ var /** Array 
*/ g_vowel4 = [17]; + + /** @const */ var /** Array */ g_vowel5 = [65]; + + /** @const */ var /** Array */ g_vowel6 = [65]; + + var /** boolean */ B_continue_stemming_noun_suffixes = false; + + + /** @return {boolean} */ + function r_check_vowel_harmony() { + var /** number */ v_1 = base.limit - base.cursor; + golab0: while(true) + { + var /** number */ v_2 = base.limit - base.cursor; + lab1: { + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + break lab1; + } + base.cursor = base.limit - v_2; + break golab0; + } + base.cursor = base.limit - v_2; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + lab2: { + var /** number */ v_3 = base.limit - base.cursor; + lab3: { + if (!(base.eq_s_b("a"))) + { + break lab3; + } + golab4: while(true) + { + var /** number */ v_4 = base.limit - base.cursor; + lab5: { + if (!(base.in_grouping_b(g_vowel1, 97, 305))) + { + break lab5; + } + base.cursor = base.limit - v_4; + break golab4; + } + base.cursor = base.limit - v_4; + if (base.cursor <= base.limit_backward) + { + break lab3; + } + base.cursor--; + } + break lab2; + } + base.cursor = base.limit - v_3; + lab6: { + if (!(base.eq_s_b("e"))) + { + break lab6; + } + golab7: while(true) + { + var /** number */ v_5 = base.limit - base.cursor; + lab8: { + if (!(base.in_grouping_b(g_vowel2, 101, 252))) + { + break lab8; + } + base.cursor = base.limit - v_5; + break golab7; + } + base.cursor = base.limit - v_5; + if (base.cursor <= base.limit_backward) + { + break lab6; + } + base.cursor--; + } + break lab2; + } + base.cursor = base.limit - v_3; + lab9: { + if (!(base.eq_s_b("\u0131"))) + { + break lab9; + } + golab10: while(true) + { + var /** number */ v_6 = base.limit - base.cursor; + lab11: { + if (!(base.in_grouping_b(g_vowel3, 97, 305))) + { + break lab11; + } + base.cursor = base.limit - v_6; + break golab10; + } + base.cursor = base.limit - v_6; + if (base.cursor <= base.limit_backward) + { + break lab9; + } + base.cursor--; + } + break 
lab2; + } + base.cursor = base.limit - v_3; + lab12: { + if (!(base.eq_s_b("i"))) + { + break lab12; + } + golab13: while(true) + { + var /** number */ v_7 = base.limit - base.cursor; + lab14: { + if (!(base.in_grouping_b(g_vowel4, 101, 105))) + { + break lab14; + } + base.cursor = base.limit - v_7; + break golab13; + } + base.cursor = base.limit - v_7; + if (base.cursor <= base.limit_backward) + { + break lab12; + } + base.cursor--; + } + break lab2; + } + base.cursor = base.limit - v_3; + lab15: { + if (!(base.eq_s_b("o"))) + { + break lab15; + } + golab16: while(true) + { + var /** number */ v_8 = base.limit - base.cursor; + lab17: { + if (!(base.in_grouping_b(g_vowel5, 111, 117))) + { + break lab17; + } + base.cursor = base.limit - v_8; + break golab16; + } + base.cursor = base.limit - v_8; + if (base.cursor <= base.limit_backward) + { + break lab15; + } + base.cursor--; + } + break lab2; + } + base.cursor = base.limit - v_3; + lab18: { + if (!(base.eq_s_b("\u00F6"))) + { + break lab18; + } + golab19: while(true) + { + var /** number */ v_9 = base.limit - base.cursor; + lab20: { + if (!(base.in_grouping_b(g_vowel6, 246, 252))) + { + break lab20; + } + base.cursor = base.limit - v_9; + break golab19; + } + base.cursor = base.limit - v_9; + if (base.cursor <= base.limit_backward) + { + break lab18; + } + base.cursor--; + } + break lab2; + } + base.cursor = base.limit - v_3; + lab21: { + if (!(base.eq_s_b("u"))) + { + break lab21; + } + golab22: while(true) + { + var /** number */ v_10 = base.limit - base.cursor; + lab23: { + if (!(base.in_grouping_b(g_vowel5, 111, 117))) + { + break lab23; + } + base.cursor = base.limit - v_10; + break golab22; + } + base.cursor = base.limit - v_10; + if (base.cursor <= base.limit_backward) + { + break lab21; + } + base.cursor--; + } + break lab2; + } + base.cursor = base.limit - v_3; + if (!(base.eq_s_b("\u00FC"))) + { + return false; + } + golab24: while(true) + { + var /** number */ v_11 = base.limit - base.cursor; + lab25: { 
+ if (!(base.in_grouping_b(g_vowel6, 246, 252))) + { + break lab25; + } + base.cursor = base.limit - v_11; + break golab24; + } + base.cursor = base.limit - v_11; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + } + base.cursor = base.limit - v_1; + return true; + }; + + /** @return {boolean} */ + function r_mark_suffix_with_optional_n_consonant() { + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("n"))) + { + break lab1; + } + var /** number */ v_2 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + break lab1; + } + base.cursor = base.limit - v_2; + break lab0; + } + base.cursor = base.limit - v_1; + { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + var /** number */ v_4 = base.limit - base.cursor; + if (!(base.eq_s_b("n"))) + { + break lab2; + } + base.cursor = base.limit - v_4; + return false; + } + base.cursor = base.limit - v_3; + } + var /** number */ v_5 = base.limit - base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + return false; + } + base.cursor = base.limit - v_5; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_suffix_with_optional_s_consonant() { + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("s"))) + { + break lab1; + } + var /** number */ v_2 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + break lab1; + } + base.cursor = base.limit - v_2; + break lab0; + } + base.cursor = base.limit - v_1; + { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + var /** number */ v_4 = base.limit - base.cursor; + if (!(base.eq_s_b("s"))) + { + break lab2; + } + base.cursor = base.limit - v_4; + return false; + } + base.cursor = base.limit - v_3; + } + var /** number */ v_5 = base.limit - base.cursor; + if (base.cursor <= 
base.limit_backward) + { + return false; + } + base.cursor--; + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + return false; + } + base.cursor = base.limit - v_5; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_suffix_with_optional_y_consonant() { + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("y"))) + { + break lab1; + } + var /** number */ v_2 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + break lab1; + } + base.cursor = base.limit - v_2; + break lab0; + } + base.cursor = base.limit - v_1; + { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + var /** number */ v_4 = base.limit - base.cursor; + if (!(base.eq_s_b("y"))) + { + break lab2; + } + base.cursor = base.limit - v_4; + return false; + } + base.cursor = base.limit - v_3; + } + var /** number */ v_5 = base.limit - base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + return false; + } + base.cursor = base.limit - v_5; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_suffix_with_optional_U_vowel() { + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.in_grouping_b(g_U, 105, 305))) + { + break lab1; + } + var /** number */ v_2 = base.limit - base.cursor; + if (!(base.out_grouping_b(g_vowel, 97, 305))) + { + break lab1; + } + base.cursor = base.limit - v_2; + break lab0; + } + base.cursor = base.limit - v_1; + { + var /** number */ v_3 = base.limit - base.cursor; + lab2: { + var /** number */ v_4 = base.limit - base.cursor; + if (!(base.in_grouping_b(g_U, 105, 305))) + { + break lab2; + } + base.cursor = base.limit - v_4; + return false; + } + base.cursor = base.limit - v_3; + } + var /** number */ v_5 = base.limit - base.cursor; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + if (!(base.out_grouping_b(g_vowel, 
97, 305))) + { + return false; + } + base.cursor = base.limit - v_5; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_possessives() { + if (base.find_among_b(a_0) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_U_vowel()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_sU() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (!(base.in_grouping_b(g_U, 105, 305))) + { + return false; + } + if (!r_mark_suffix_with_optional_s_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_lArI() { + if (base.find_among_b(a_1) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_yU() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (!(base.in_grouping_b(g_U, 105, 305))) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_nU() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_2) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_nUn() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_3) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_n_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_yA() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_4) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_nA() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_5) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_DA() { + if (!r_check_vowel_harmony()) + { + return false; + } 
+ if (base.find_among_b(a_6) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ndA() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_7) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_DAn() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_8) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ndAn() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_9) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ylA() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_10) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ki() { + if (!(base.eq_s_b("ki"))) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ncA() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_11) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_n_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_yUm() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_12) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_sUn() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_13) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_yUz() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_14) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return 
false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_sUnUz() { + if (base.find_among_b(a_15) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_lAr() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_16) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_nUz() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_17) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_DUr() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_18) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_cAsInA() { + if (base.find_among_b(a_19) == 0) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_yDU() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_20) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ysA() { + if (base.find_among_b(a_21) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_ymUs_() { + if (!r_check_vowel_harmony()) + { + return false; + } + if (base.find_among_b(a_22) == 0) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_mark_yken() { + if (!(base.eq_s_b("ken"))) + { + return false; + } + if (!r_mark_suffix_with_optional_y_consonant()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_stem_nominal_verb_suffixes() { + base.ket = base.cursor; + B_continue_stemming_noun_suffixes = true; + lab0: { + var /** number */ v_1 
= base.limit - base.cursor; + lab1: { + lab2: { + var /** number */ v_2 = base.limit - base.cursor; + lab3: { + if (!r_mark_ymUs_()) + { + break lab3; + } + break lab2; + } + base.cursor = base.limit - v_2; + lab4: { + if (!r_mark_yDU()) + { + break lab4; + } + break lab2; + } + base.cursor = base.limit - v_2; + lab5: { + if (!r_mark_ysA()) + { + break lab5; + } + break lab2; + } + base.cursor = base.limit - v_2; + if (!r_mark_yken()) + { + break lab1; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab6: { + if (!r_mark_cAsInA()) + { + break lab6; + } + lab7: { + var /** number */ v_3 = base.limit - base.cursor; + lab8: { + if (!r_mark_sUnUz()) + { + break lab8; + } + break lab7; + } + base.cursor = base.limit - v_3; + lab9: { + if (!r_mark_lAr()) + { + break lab9; + } + break lab7; + } + base.cursor = base.limit - v_3; + lab10: { + if (!r_mark_yUm()) + { + break lab10; + } + break lab7; + } + base.cursor = base.limit - v_3; + lab11: { + if (!r_mark_sUn()) + { + break lab11; + } + break lab7; + } + base.cursor = base.limit - v_3; + lab12: { + if (!r_mark_yUz()) + { + break lab12; + } + break lab7; + } + base.cursor = base.limit - v_3; + } + if (!r_mark_ymUs_()) + { + break lab6; + } + break lab0; + } + base.cursor = base.limit - v_1; + lab13: { + if (!r_mark_lAr()) + { + break lab13; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + lab14: { + base.ket = base.cursor; + lab15: { + var /** number */ v_5 = base.limit - base.cursor; + lab16: { + if (!r_mark_DUr()) + { + break lab16; + } + break lab15; + } + base.cursor = base.limit - v_5; + lab17: { + if (!r_mark_yDU()) + { + break lab17; + } + break lab15; + } + base.cursor = base.limit - v_5; + lab18: { + if (!r_mark_ysA()) + { + break lab18; + } + break lab15; + } + base.cursor = base.limit - v_5; + if (!r_mark_ymUs_()) + { + base.cursor = base.limit - v_4; + break lab14; + } + } + } + B_continue_stemming_noun_suffixes 
= false; + break lab0; + } + base.cursor = base.limit - v_1; + lab19: { + if (!r_mark_nUz()) + { + break lab19; + } + lab20: { + var /** number */ v_6 = base.limit - base.cursor; + lab21: { + if (!r_mark_yDU()) + { + break lab21; + } + break lab20; + } + base.cursor = base.limit - v_6; + if (!r_mark_ysA()) + { + break lab19; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab22: { + lab23: { + var /** number */ v_7 = base.limit - base.cursor; + lab24: { + if (!r_mark_sUnUz()) + { + break lab24; + } + break lab23; + } + base.cursor = base.limit - v_7; + lab25: { + if (!r_mark_yUz()) + { + break lab25; + } + break lab23; + } + base.cursor = base.limit - v_7; + lab26: { + if (!r_mark_sUn()) + { + break lab26; + } + break lab23; + } + base.cursor = base.limit - v_7; + if (!r_mark_yUm()) + { + break lab22; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_8 = base.limit - base.cursor; + lab27: { + base.ket = base.cursor; + if (!r_mark_ymUs_()) + { + base.cursor = base.limit - v_8; + break lab27; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!r_mark_DUr()) + { + return false; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_9 = base.limit - base.cursor; + lab28: { + base.ket = base.cursor; + lab29: { + var /** number */ v_10 = base.limit - base.cursor; + lab30: { + if (!r_mark_sUnUz()) + { + break lab30; + } + break lab29; + } + base.cursor = base.limit - v_10; + lab31: { + if (!r_mark_lAr()) + { + break lab31; + } + break lab29; + } + base.cursor = base.limit - v_10; + lab32: { + if (!r_mark_yUm()) + { + break lab32; + } + break lab29; + } + base.cursor = base.limit - v_10; + lab33: { + if (!r_mark_sUn()) + { + break lab33; + } + break lab29; + } + base.cursor = base.limit - v_10; + lab34: { + if (!r_mark_yUz()) + { + break lab34; + } + break lab29; + } + base.cursor = base.limit - v_10; + } + if (!r_mark_ymUs_()) + { + base.cursor 
= base.limit - v_9; + break lab28; + } + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_stem_suffix_chain_before_ki() { + base.ket = base.cursor; + if (!r_mark_ki()) + { + return false; + } + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!r_mark_DA()) + { + break lab1; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + base.ket = base.cursor; + lab3: { + var /** number */ v_3 = base.limit - base.cursor; + lab4: { + if (!r_mark_lAr()) + { + break lab4; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_4 = base.limit - base.cursor; + lab5: { + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_4; + break lab5; + } + } + break lab3; + } + base.cursor = base.limit - v_3; + if (!r_mark_possessives()) + { + base.cursor = base.limit - v_2; + break lab2; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_5 = base.limit - base.cursor; + lab6: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_5; + break lab6; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_5; + break lab6; + } + } + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab7: { + if (!r_mark_nUn()) + { + break lab7; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_6 = base.limit - base.cursor; + lab8: { + base.ket = base.cursor; + lab9: { + var /** number */ v_7 = base.limit - base.cursor; + lab10: { + if (!r_mark_lArI()) + { + break lab10; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break lab9; + } + base.cursor = base.limit - v_7; + lab11: { 
+ base.ket = base.cursor; + lab12: { + var /** number */ v_8 = base.limit - base.cursor; + lab13: { + if (!r_mark_possessives()) + { + break lab13; + } + break lab12; + } + base.cursor = base.limit - v_8; + if (!r_mark_sU()) + { + break lab11; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_9 = base.limit - base.cursor; + lab14: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_9; + break lab14; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_9; + break lab14; + } + } + break lab9; + } + base.cursor = base.limit - v_7; + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_6; + break lab8; + } + } + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!r_mark_ndA()) + { + return false; + } + lab15: { + var /** number */ v_10 = base.limit - base.cursor; + lab16: { + if (!r_mark_lArI()) + { + break lab16; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break lab15; + } + base.cursor = base.limit - v_10; + lab17: { + if (!r_mark_sU()) + { + break lab17; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_11 = base.limit - base.cursor; + lab18: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_11; + break lab18; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_11; + break lab18; + } + } + break lab15; + } + base.cursor = base.limit - v_10; + if (!r_stem_suffix_chain_before_ki()) + { + return false; + } + } + } + return true; + }; + + /** @return {boolean} */ + function r_stem_noun_suffixes() { + lab0: { + var /** number */ v_1 = base.limit - base.cursor; + lab1: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + break lab1; + } 
+ base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_2; + break lab2; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab3: { + base.ket = base.cursor; + if (!r_mark_ncA()) + { + break lab3; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_3 = base.limit - base.cursor; + lab4: { + lab5: { + var /** number */ v_4 = base.limit - base.cursor; + lab6: { + base.ket = base.cursor; + if (!r_mark_lArI()) + { + break lab6; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break lab5; + } + base.cursor = base.limit - v_4; + lab7: { + base.ket = base.cursor; + lab8: { + var /** number */ v_5 = base.limit - base.cursor; + lab9: { + if (!r_mark_possessives()) + { + break lab9; + } + break lab8; + } + base.cursor = base.limit - v_5; + if (!r_mark_sU()) + { + break lab7; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_6 = base.limit - base.cursor; + lab10: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_6; + break lab10; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_6; + break lab10; + } + } + break lab5; + } + base.cursor = base.limit - v_4; + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_3; + break lab4; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_3; + break lab4; + } + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab11: { + base.ket = base.cursor; + lab12: { + var /** number */ v_7 = base.limit - base.cursor; + lab13: { + if (!r_mark_ndA()) + { + break lab13; + } + break 
lab12; + } + base.cursor = base.limit - v_7; + if (!r_mark_nA()) + { + break lab11; + } + } + lab14: { + var /** number */ v_8 = base.limit - base.cursor; + lab15: { + if (!r_mark_lArI()) + { + break lab15; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break lab14; + } + base.cursor = base.limit - v_8; + lab16: { + if (!r_mark_sU()) + { + break lab16; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_9 = base.limit - base.cursor; + lab17: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_9; + break lab17; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_9; + break lab17; + } + } + break lab14; + } + base.cursor = base.limit - v_8; + if (!r_stem_suffix_chain_before_ki()) + { + break lab11; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab18: { + base.ket = base.cursor; + lab19: { + var /** number */ v_10 = base.limit - base.cursor; + lab20: { + if (!r_mark_ndAn()) + { + break lab20; + } + break lab19; + } + base.cursor = base.limit - v_10; + if (!r_mark_nU()) + { + break lab18; + } + } + lab21: { + var /** number */ v_11 = base.limit - base.cursor; + lab22: { + if (!r_mark_sU()) + { + break lab22; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_12 = base.limit - base.cursor; + lab23: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_12; + break lab23; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_12; + break lab23; + } + } + break lab21; + } + base.cursor = base.limit - v_11; + if (!r_mark_lArI()) + { + break lab18; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab24: { + base.ket = base.cursor; + if (!r_mark_DAn()) + { + 
break lab24; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_13 = base.limit - base.cursor; + lab25: { + base.ket = base.cursor; + lab26: { + var /** number */ v_14 = base.limit - base.cursor; + lab27: { + if (!r_mark_possessives()) + { + break lab27; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_15 = base.limit - base.cursor; + lab28: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_15; + break lab28; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_15; + break lab28; + } + } + break lab26; + } + base.cursor = base.limit - v_14; + lab29: { + if (!r_mark_lAr()) + { + break lab29; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_16 = base.limit - base.cursor; + lab30: { + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_16; + break lab30; + } + } + break lab26; + } + base.cursor = base.limit - v_14; + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_13; + break lab25; + } + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab31: { + base.ket = base.cursor; + lab32: { + var /** number */ v_17 = base.limit - base.cursor; + lab33: { + if (!r_mark_nUn()) + { + break lab33; + } + break lab32; + } + base.cursor = base.limit - v_17; + if (!r_mark_ylA()) + { + break lab31; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_18 = base.limit - base.cursor; + lab34: { + lab35: { + var /** number */ v_19 = base.limit - base.cursor; + lab36: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + break lab36; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + break lab36; + } + break lab35; + } + 
base.cursor = base.limit - v_19; + lab37: { + base.ket = base.cursor; + lab38: { + var /** number */ v_20 = base.limit - base.cursor; + lab39: { + if (!r_mark_possessives()) + { + break lab39; + } + break lab38; + } + base.cursor = base.limit - v_20; + if (!r_mark_sU()) + { + break lab37; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_21 = base.limit - base.cursor; + lab40: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_21; + break lab40; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_21; + break lab40; + } + } + break lab35; + } + base.cursor = base.limit - v_19; + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_18; + break lab34; + } + } + } + break lab0; + } + base.cursor = base.limit - v_1; + lab41: { + base.ket = base.cursor; + if (!r_mark_lArI()) + { + break lab41; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + break lab0; + } + base.cursor = base.limit - v_1; + lab42: { + if (!r_stem_suffix_chain_before_ki()) + { + break lab42; + } + break lab0; + } + base.cursor = base.limit - v_1; + lab43: { + base.ket = base.cursor; + lab44: { + var /** number */ v_22 = base.limit - base.cursor; + lab45: { + if (!r_mark_DA()) + { + break lab45; + } + break lab44; + } + base.cursor = base.limit - v_22; + lab46: { + if (!r_mark_yU()) + { + break lab46; + } + break lab44; + } + base.cursor = base.limit - v_22; + if (!r_mark_yA()) + { + break lab43; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_23 = base.limit - base.cursor; + lab47: { + base.ket = base.cursor; + lab48: { + var /** number */ v_24 = base.limit - base.cursor; + lab49: { + if (!r_mark_possessives()) + { + break lab49; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + 
var /** number */ v_25 = base.limit - base.cursor; + lab50: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_25; + break lab50; + } + } + break lab48; + } + base.cursor = base.limit - v_24; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_23; + break lab47; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_23; + break lab47; + } + } + break lab0; + } + base.cursor = base.limit - v_1; + base.ket = base.cursor; + lab51: { + var /** number */ v_26 = base.limit - base.cursor; + lab52: { + if (!r_mark_possessives()) + { + break lab52; + } + break lab51; + } + base.cursor = base.limit - v_26; + if (!r_mark_sU()) + { + return false; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + var /** number */ v_27 = base.limit - base.cursor; + lab53: { + base.ket = base.cursor; + if (!r_mark_lAr()) + { + base.cursor = base.limit - v_27; + break lab53; + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + if (!r_stem_suffix_chain_before_ki()) + { + base.cursor = base.limit - v_27; + break lab53; + } + } + } + return true; + }; + + /** @return {boolean} */ + function r_post_process_last_consonants() { + var /** number */ among_var; + base.ket = base.cursor; + among_var = base.find_among_b(a_23); + if (among_var == 0) + { + return false; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("p")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u00E7")) + { + return false; + } + break; + case 3: + if (!base.slice_from("t")) + { + return false; + } + break; + case 4: + if (!base.slice_from("k")) + { + return false; + } + break; + } + return true; + }; + + /** @return {boolean} */ + function r_append_U_to_stems_ending_with_d_or_g() { + base.ket = base.cursor; + base.bra = base.cursor; + lab0: { + var 
/** number */ v_1 = base.limit - base.cursor; + lab1: { + if (!(base.eq_s_b("d"))) + { + break lab1; + } + break lab0; + } + base.cursor = base.limit - v_1; + if (!(base.eq_s_b("g"))) + { + return false; + } + } + golab2: while(true) + { + var /** number */ v_2 = base.limit - base.cursor; + lab3: { + if (!(base.in_grouping_b(g_vowel, 97, 305))) + { + break lab3; + } + base.cursor = base.limit - v_2; + break golab2; + } + base.cursor = base.limit - v_2; + if (base.cursor <= base.limit_backward) + { + return false; + } + base.cursor--; + } + lab4: { + var /** number */ v_3 = base.limit - base.cursor; + lab5: { + lab6: { + var /** number */ v_4 = base.limit - base.cursor; + lab7: { + if (!(base.eq_s_b("a"))) + { + break lab7; + } + break lab6; + } + base.cursor = base.limit - v_4; + if (!(base.eq_s_b("\u0131"))) + { + break lab5; + } + } + if (!base.slice_from("\u0131")) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_3; + lab8: { + lab9: { + var /** number */ v_5 = base.limit - base.cursor; + lab10: { + if (!(base.eq_s_b("e"))) + { + break lab10; + } + break lab9; + } + base.cursor = base.limit - v_5; + if (!(base.eq_s_b("i"))) + { + break lab8; + } + } + if (!base.slice_from("i")) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_3; + lab11: { + lab12: { + var /** number */ v_6 = base.limit - base.cursor; + lab13: { + if (!(base.eq_s_b("o"))) + { + break lab13; + } + break lab12; + } + base.cursor = base.limit - v_6; + if (!(base.eq_s_b("u"))) + { + break lab11; + } + } + if (!base.slice_from("u")) + { + return false; + } + break lab4; + } + base.cursor = base.limit - v_3; + lab14: { + var /** number */ v_7 = base.limit - base.cursor; + lab15: { + if (!(base.eq_s_b("\u00F6"))) + { + break lab15; + } + break lab14; + } + base.cursor = base.limit - v_7; + if (!(base.eq_s_b("\u00FC"))) + { + return false; + } + } + if (!base.slice_from("\u00FC")) + { + return false; + } + } + return true; + }; + + /** @return {boolean} */ 
+ function r_is_reserved_word() { + if (!(base.eq_s_b("ad"))) + { + return false; + } + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (!(base.eq_s_b("soy"))) + { + base.cursor = base.limit - v_1; + break lab0; + } + } + if (base.cursor > base.limit_backward) + { + return false; + } + return true; + }; + + /** @return {boolean} */ + function r_more_than_one_syllable_word() { + var /** number */ v_1 = base.cursor; + for (var /** number */ v_2 = 2; v_2 > 0; v_2--) + { + golab0: while(true) + { + lab1: { + if (!(base.in_grouping(g_vowel, 97, 305))) + { + break lab1; + } + break golab0; + } + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + } + base.cursor = v_1; + return true; + }; + + /** @return {boolean} */ + function r_postlude() { + base.limit_backward = base.cursor; base.cursor = base.limit; + { + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + if (!r_is_reserved_word()) + { + break lab0; + } + return false; + } + base.cursor = base.limit - v_1; + } + var /** number */ v_2 = base.limit - base.cursor; + r_append_U_to_stems_ending_with_d_or_g(); + base.cursor = base.limit - v_2; + var /** number */ v_3 = base.limit - base.cursor; + r_post_process_last_consonants(); + base.cursor = base.limit - v_3; + base.cursor = base.limit_backward; + return true; + }; + + this.stem = /** @return {boolean} */ function() { + if (!r_more_than_one_syllable_word()) + { + return false; + } + base.limit_backward = base.cursor; base.cursor = base.limit; + var /** number */ v_1 = base.limit - base.cursor; + r_stem_nominal_verb_suffixes(); + base.cursor = base.limit - v_1; + if (!B_continue_stemming_noun_suffixes) + { + return false; + } + var /** number */ v_2 = base.limit - base.cursor; + r_stem_noun_suffixes(); + base.cursor = base.limit - v_2; + base.cursor = base.limit_backward; + if (!r_postlude()) + { + return false; + } + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + 
base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['TurkishStemmer'] = TurkishStemmer; diff --git a/js/yiddish-stemmer.js b/js/yiddish-stemmer.js new file mode 100644 index 0000000..f21669d --- /dev/null +++ b/js/yiddish-stemmer.js @@ -0,0 +1,1173 @@ +// Generated by Snowball 2.2.0 - https://snowballstem.org/ + +/**@constructor*/ +var YiddishStemmer = function() { + var base = new BaseStemmer(); + /** @const */ var a_0 = [ + ["\u05D5\u05D5", -1, 1], + ["\u05D5\u05D9", -1, 2], + ["\u05D9\u05D9", -1, 3], + ["\u05DA", -1, 4], + ["\u05DD", -1, 5], + ["\u05DF", -1, 6], + ["\u05E3", -1, 7], + ["\u05E5", -1, 8] + ]; + + /** @const */ var a_1 = [ + ["\u05D0\u05D3\u05D5\u05E8\u05DB", -1, 1], + ["\u05D0\u05D4\u05D9\u05E0", -1, 1], + ["\u05D0\u05D4\u05E2\u05E8", -1, 1], + ["\u05D0\u05D4\u05F2\u05DE", -1, 1], + ["\u05D0\u05D5\u05DE", -1, 1], + ["\u05D0\u05D5\u05E0\u05D8\u05E2\u05E8", -1, 1], + ["\u05D0\u05D9\u05D1\u05E2\u05E8", -1, 1], + ["\u05D0\u05E0", -1, 1], + ["\u05D0\u05E0\u05D8", 7, 1], + ["\u05D0\u05E0\u05D8\u05E7\u05E2\u05D2\u05E0", 8, 1], + ["\u05D0\u05E0\u05D9\u05D3\u05E2\u05E8", 7, 1], + ["\u05D0\u05E4", -1, 1], + ["\u05D0\u05E4\u05D9\u05E8", 11, 1], + ["\u05D0\u05E7\u05E2\u05D2\u05E0", -1, 1], + ["\u05D0\u05E8\u05D0\u05E4", -1, 1], + ["\u05D0\u05E8\u05D5\u05DE", -1, 1], + ["\u05D0\u05E8\u05D5\u05E0\u05D8\u05E2\u05E8", -1, 1], + ["\u05D0\u05E8\u05D9\u05D1\u05E2\u05E8", -1, 1], + ["\u05D0\u05E8\u05F1\u05E1", -1, 1], + ["\u05D0\u05E8\u05F1\u05E4", -1, 1], + ["\u05D0\u05E8\u05F2\u05E0", -1, 1], + ["\u05D0\u05F0\u05E2\u05E7", -1, 1], + ["\u05D0\u05F1\u05E1", -1, 1], + ["\u05D0\u05F1\u05E4", -1, 1], + ["\u05D0\u05F2\u05E0", -1, 1], + ["\u05D1\u05D0", -1, 1], + ["\u05D1\u05F2", -1, 1], + ["\u05D3\u05D5\u05E8\u05DB", -1, 1], + ["\u05D3\u05E2\u05E8", -1, 1], + ["\u05DE\u05D9\u05D8", -1, 1], + ["\u05E0\u05D0\u05DB", -1, 1], + ["\u05E4\u05D0\u05E8", -1, 1], + ["\u05E4\u05D0\u05E8\u05D1\u05F2", 31, 1], + ["\u05E4\u05D0\u05E8\u05F1\u05E1", 31, 
1], + ["\u05E4\u05D5\u05E0\u05D0\u05E0\u05D3\u05E2\u05E8", -1, 1], + ["\u05E6\u05D5", -1, 1], + ["\u05E6\u05D5\u05D6\u05D0\u05DE\u05E2\u05E0", 35, 1], + ["\u05E6\u05D5\u05E0\u05F1\u05E4", 35, 1], + ["\u05E6\u05D5\u05E8\u05D9\u05E7", 35, 1], + ["\u05E6\u05E2", -1, 1] + ]; + + /** @const */ var a_2 = [ + ["\u05D3\u05D6\u05E9", -1, -1], + ["\u05E9\u05D8\u05E8", -1, -1], + ["\u05E9\u05D8\u05E9", -1, -1], + ["\u05E9\u05E4\u05E8", -1, -1] + ]; + + /** @const */ var a_3 = [ + ["\u05E7\u05DC\u05D9\u05D1", -1, 9], + ["\u05E8\u05D9\u05D1", -1, 10], + ["\u05D8\u05E8\u05D9\u05D1", 1, 7], + ["\u05E9\u05E8\u05D9\u05D1", 1, 15], + ["\u05D4\u05F1\u05D1", -1, 23], + ["\u05E9\u05F0\u05D9\u05D2", -1, 12], + ["\u05D2\u05D0\u05E0\u05D2", -1, 1], + ["\u05D6\u05D5\u05E0\u05D2", -1, 18], + ["\u05E9\u05DC\u05D5\u05E0\u05D2", -1, 21], + ["\u05E6\u05F0\u05D5\u05E0\u05D2", -1, 20], + ["\u05D1\u05F1\u05D2", -1, 22], + ["\u05D1\u05D5\u05E0\u05D3", -1, 16], + ["\u05F0\u05D9\u05D6", -1, 6], + ["\u05D1\u05D9\u05D8", -1, 4], + ["\u05DC\u05D9\u05D8", -1, 8], + ["\u05DE\u05D9\u05D8", -1, 3], + ["\u05E9\u05E0\u05D9\u05D8", -1, 14], + ["\u05E0\u05D5\u05DE", -1, 2], + ["\u05E9\u05D8\u05D0\u05E0", -1, 25], + ["\u05D1\u05D9\u05E1", -1, 5], + ["\u05E9\u05DE\u05D9\u05E1", -1, 13], + ["\u05E8\u05D9\u05E1", -1, 11], + ["\u05D8\u05E8\u05D5\u05E0\u05E7", -1, 19], + ["\u05E4\u05D0\u05E8\u05DC\u05F1\u05E8", -1, 24], + ["\u05E9\u05F0\u05F1\u05E8", -1, 26], + ["\u05F0\u05D5\u05D8\u05E9", -1, 17] + ]; + + /** @const */ var a_4 = [ + ["\u05D5\u05E0\u05D2", -1, 1], + ["\u05E1\u05D8\u05D5", -1, 1], + ["\u05D8", -1, 1], + ["\u05D1\u05E8\u05D0\u05DB\u05D8", 2, 31], + ["\u05E1\u05D8", 2, 1], + ["\u05D9\u05E1\u05D8", 4, 33], + ["\u05E2\u05D8", 2, 1], + ["\u05E9\u05D0\u05E4\u05D8", 2, 1], + ["\u05D4\u05F2\u05D8", 2, 1], + ["\u05E7\u05F2\u05D8", 2, 1], + ["\u05D9\u05E7\u05F2\u05D8", 9, 1], + ["\u05DC\u05E2\u05DB", -1, 1], + ["\u05E2\u05DC\u05E2\u05DB", 11, 1], + ["\u05D9\u05D6\u05DE", -1, 1], + ["\u05D9\u05DE", -1, 1], + 
["\u05E2\u05DE", -1, 1], + ["\u05E2\u05E0\u05E2\u05DE", 15, 3], + ["\u05D8\u05E2\u05E0\u05E2\u05DE", 16, 4], + ["\u05E0", -1, 1], + ["\u05E7\u05DC\u05D9\u05D1\u05E0", 18, 14], + ["\u05E8\u05D9\u05D1\u05E0", 18, 15], + ["\u05D8\u05E8\u05D9\u05D1\u05E0", 20, 12], + ["\u05E9\u05E8\u05D9\u05D1\u05E0", 20, 7], + ["\u05D4\u05F1\u05D1\u05E0", 18, 27], + ["\u05E9\u05F0\u05D9\u05D2\u05E0", 18, 17], + ["\u05D6\u05D5\u05E0\u05D2\u05E0", 18, 22], + ["\u05E9\u05DC\u05D5\u05E0\u05D2\u05E0", 18, 25], + ["\u05E6\u05F0\u05D5\u05E0\u05D2\u05E0", 18, 24], + ["\u05D1\u05F1\u05D2\u05E0", 18, 26], + ["\u05D1\u05D5\u05E0\u05D3\u05E0", 18, 20], + ["\u05F0\u05D9\u05D6\u05E0", 18, 11], + ["\u05D8\u05E0", 18, 4], + ["GE\u05D1\u05D9\u05D8\u05E0", 31, 9], + ["GE\u05DC\u05D9\u05D8\u05E0", 31, 13], + ["GE\u05DE\u05D9\u05D8\u05E0", 31, 8], + ["\u05E9\u05E0\u05D9\u05D8\u05E0", 31, 19], + ["\u05E1\u05D8\u05E0", 31, 1], + ["\u05D9\u05E1\u05D8\u05E0", 36, 1], + ["\u05E2\u05D8\u05E0", 31, 1], + ["GE\u05D1\u05D9\u05E1\u05E0", 18, 10], + ["\u05E9\u05DE\u05D9\u05E1\u05E0", 18, 18], + ["GE\u05E8\u05D9\u05E1\u05E0", 18, 16], + ["\u05E2\u05E0", 18, 1], + ["\u05D2\u05D0\u05E0\u05D2\u05E2\u05E0", 42, 5], + ["\u05E2\u05DC\u05E2\u05E0", 42, 1], + ["\u05E0\u05D5\u05DE\u05E2\u05E0", 42, 6], + ["\u05D9\u05D6\u05DE\u05E2\u05E0", 42, 1], + ["\u05E9\u05D8\u05D0\u05E0\u05E2\u05E0", 42, 29], + ["\u05D8\u05E8\u05D5\u05E0\u05E7\u05E0", 18, 23], + ["\u05E4\u05D0\u05E8\u05DC\u05F1\u05E8\u05E0", 18, 28], + ["\u05E9\u05F0\u05F1\u05E8\u05E0", 18, 30], + ["\u05F0\u05D5\u05D8\u05E9\u05E0", 18, 21], + ["\u05D2\u05F2\u05E0", 18, 5], + ["\u05E1", -1, 1], + ["\u05D8\u05E1", 53, 4], + ["\u05E2\u05D8\u05E1", 54, 1], + ["\u05E0\u05E1", 53, 1], + ["\u05D8\u05E0\u05E1", 56, 4], + ["\u05E2\u05E0\u05E1", 56, 3], + ["\u05E2\u05E1", 53, 1], + ["\u05D9\u05E2\u05E1", 59, 2], + ["\u05E2\u05DC\u05E2\u05E1", 59, 1], + ["\u05E2\u05E8\u05E1", 53, 1], + ["\u05E2\u05E0\u05E2\u05E8\u05E1", 62, 1], + ["\u05E2", -1, 1], + ["\u05D8\u05E2", 64, 4], + 
["\u05E1\u05D8\u05E2", 65, 1], + ["\u05E2\u05D8\u05E2", 65, 1], + ["\u05D9\u05E2", 64, -1], + ["\u05E2\u05DC\u05E2", 64, 1], + ["\u05E2\u05E0\u05E2", 64, 3], + ["\u05D8\u05E2\u05E0\u05E2", 70, 4], + ["\u05E2\u05E8", -1, 1], + ["\u05D8\u05E2\u05E8", 72, 4], + ["\u05E1\u05D8\u05E2\u05E8", 73, 1], + ["\u05E2\u05D8\u05E2\u05E8", 73, 1], + ["\u05E2\u05E0\u05E2\u05E8", 72, 3], + ["\u05D8\u05E2\u05E0\u05E2\u05E8", 76, 4], + ["\u05D5\u05EA", -1, 32] + ]; + + /** @const */ var a_5 = [ + ["\u05D5\u05E0\u05D2", -1, 1], + ["\u05E9\u05D0\u05E4\u05D8", -1, 1], + ["\u05D4\u05F2\u05D8", -1, 1], + ["\u05E7\u05F2\u05D8", -1, 1], + ["\u05D9\u05E7\u05F2\u05D8", 3, 1], + ["\u05DC", -1, 2] + ]; + + /** @const */ var a_6 = [ + ["\u05D9\u05D2", -1, 1], + ["\u05D9\u05E7", -1, 1], + ["\u05D3\u05D9\u05E7", 1, 1], + ["\u05E0\u05D3\u05D9\u05E7", 2, 1], + ["\u05E2\u05E0\u05D3\u05D9\u05E7", 3, 1], + ["\u05D1\u05DC\u05D9\u05E7", 1, -1], + ["\u05D2\u05DC\u05D9\u05E7", 1, -1], + ["\u05E0\u05D9\u05E7", 1, 1], + ["\u05D9\u05E9", -1, 1] + ]; + + /** @const */ var /** Array */ g_niked = [255, 155, 6]; + + /** @const */ var /** Array */ g_vowel = [33, 2, 4, 0, 6]; + + /** @const */ var /** Array */ g_consonant = [239, 254, 253, 131]; + + var /** number */ I_x = 0; + var /** number */ I_p1 = 0; + + + /** @return {boolean} */ + function r_prelude() { + var /** number */ among_var; + var /** number */ v_1 = base.cursor; + lab0: { + while(true) + { + var /** number */ v_2 = base.cursor; + lab1: { + golab2: while(true) + { + var /** number */ v_3 = base.cursor; + lab3: { + base.bra = base.cursor; + among_var = base.find_among(a_0); + if (among_var == 0) + { + break lab3; + } + base.ket = base.cursor; + switch (among_var) { + case 1: + { + var /** number */ v_4 = base.cursor; + lab4: { + if (!(base.eq_s("\u05BC"))) + { + break lab4; + } + break lab3; + } + base.cursor = v_4; + } + if (!base.slice_from("\u05F0")) + { + return false; + } + break; + case 2: + { + var /** number */ v_5 = base.cursor; + lab5: { + 
if (!(base.eq_s("\u05B4"))) + { + break lab5; + } + break lab3; + } + base.cursor = v_5; + } + if (!base.slice_from("\u05F1")) + { + return false; + } + break; + case 3: + { + var /** number */ v_6 = base.cursor; + lab6: { + if (!(base.eq_s("\u05B4"))) + { + break lab6; + } + break lab3; + } + base.cursor = v_6; + } + if (!base.slice_from("\u05F2")) + { + return false; + } + break; + case 4: + if (!base.slice_from("\u05DB")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u05DE")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u05E0")) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u05E4")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u05E6")) + { + return false; + } + break; + } + base.cursor = v_3; + break golab2; + } + base.cursor = v_3; + if (base.cursor >= base.limit) + { + break lab1; + } + base.cursor++; + } + continue; + } + base.cursor = v_2; + break; + } + } + base.cursor = v_1; + var /** number */ v_7 = base.cursor; + lab7: { + while(true) + { + var /** number */ v_8 = base.cursor; + lab8: { + golab9: while(true) + { + var /** number */ v_9 = base.cursor; + lab10: { + base.bra = base.cursor; + if (!(base.in_grouping(g_niked, 1456, 1474))) + { + break lab10; + } + base.ket = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.cursor = v_9; + break golab9; + } + base.cursor = v_9; + if (base.cursor >= base.limit) + { + break lab8; + } + base.cursor++; + } + continue; + } + base.cursor = v_8; + break; + } + } + base.cursor = v_7; + return true; + }; + + /** @return {boolean} */ + function r_mark_regions() { + I_p1 = base.limit; + var /** number */ v_1 = base.cursor; + lab0: { + base.bra = base.cursor; + if (!(base.eq_s("\u05D2\u05E2"))) + { + base.cursor = v_1; + break lab0; + } + base.ket = base.cursor; + { + var /** number */ v_2 = base.cursor; + lab1: { + lab2: { + var /** number */ v_3 = base.cursor; + lab3: { + if (!(base.eq_s("\u05DC\u05D8"))) + { + 
break lab3; + } + break lab2; + } + base.cursor = v_3; + if (!(base.eq_s("\u05D1\u05E0"))) + { + break lab1; + } + } + base.cursor = v_1; + break lab0; + } + base.cursor = v_2; + } + if (!base.slice_from("GE")) + { + return false; + } + } + var /** number */ v_4 = base.cursor; + lab4: { + if (base.find_among(a_1) == 0) + { + base.cursor = v_4; + break lab4; + } + lab5: { + var /** number */ v_5 = base.cursor; + lab6: { + var /** number */ v_6 = base.cursor; + lab7: { + var /** number */ v_7 = base.cursor; + lab8: { + if (!(base.eq_s("\u05E6\u05D5\u05D2\u05E0"))) + { + break lab8; + } + break lab7; + } + base.cursor = v_7; + lab9: { + if (!(base.eq_s("\u05E6\u05D5\u05E7\u05D8"))) + { + break lab9; + } + break lab7; + } + base.cursor = v_7; + if (!(base.eq_s("\u05E6\u05D5\u05E7\u05E0"))) + { + break lab6; + } + } + if (base.cursor < base.limit) + { + break lab6; + } + base.cursor = v_6; + break lab5; + } + base.cursor = v_5; + lab10: { + var /** number */ v_8 = base.cursor; + if (!(base.eq_s("\u05D2\u05E2\u05D1\u05E0"))) + { + break lab10; + } + base.cursor = v_8; + break lab5; + } + base.cursor = v_5; + lab11: { + base.bra = base.cursor; + if (!(base.eq_s("\u05D2\u05E2"))) + { + break lab11; + } + base.ket = base.cursor; + if (!base.slice_from("GE")) + { + return false; + } + break lab5; + } + base.cursor = v_5; + base.bra = base.cursor; + if (!(base.eq_s("\u05E6\u05D5"))) + { + base.cursor = v_4; + break lab4; + } + base.ket = base.cursor; + if (!base.slice_from("TSU")) + { + return false; + } + } + } + var /** number */ v_9 = base.cursor; + { + var /** number */ c1 = base.cursor + 3; + if (c1 > base.limit) + { + return false; + } + base.cursor = c1; + } + I_x = base.cursor; + base.cursor = v_9; + var /** number */ v_10 = base.cursor; + lab12: { + if (base.find_among(a_2) == 0) + { + base.cursor = v_10; + break lab12; + } + } + { + var /** number */ v_11 = base.cursor; + lab13: { + if (!(base.in_grouping(g_consonant, 1489, 1520))) + { + break lab13; + } + if 
(!(base.in_grouping(g_consonant, 1489, 1520))) + { + break lab13; + } + if (!(base.in_grouping(g_consonant, 1489, 1520))) + { + break lab13; + } + I_p1 = base.cursor; + return false; + } + base.cursor = v_11; + } + golab14: while(true) + { + var /** number */ v_12 = base.cursor; + lab15: { + if (!(base.in_grouping(g_vowel, 1488, 1522))) + { + break lab15; + } + base.cursor = v_12; + break golab14; + } + base.cursor = v_12; + if (base.cursor >= base.limit) + { + return false; + } + base.cursor++; + } + while(true) + { + lab16: { + if (!(base.in_grouping(g_vowel, 1488, 1522))) + { + break lab16; + } + continue; + } + break; + } + I_p1 = base.cursor; + lab17: { + if (I_p1 >= I_x) + { + break lab17; + } + I_p1 = I_x; + } + return true; + }; + + /** @return {boolean} */ + function r_R1() { + return I_p1 <= base.cursor; + }; + + /** @return {boolean} */ + function r_R1plus3() { + return I_p1 <= (base.cursor + 3); + }; + + /** @return {boolean} */ + function r_standard_suffix() { + var /** number */ among_var; + var /** number */ v_1 = base.limit - base.cursor; + lab0: { + base.ket = base.cursor; + among_var = base.find_among_b(a_4); + if (among_var == 0) + { + break lab0; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + break lab0; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R1()) + { + break lab0; + } + if (!base.slice_from("\u05D9\u05E2")) + { + return false; + } + break; + case 3: + if (!r_R1()) + { + break lab0; + } + if (!base.slice_del()) + { + return false; + } + base.ket = base.cursor; + among_var = base.find_among_b(a_3); + if (among_var == 0) + { + break lab0; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!base.slice_from("\u05D2\u05F2")) + { + return false; + } + break; + case 2: + if (!base.slice_from("\u05E0\u05E2\u05DE")) + { + return false; + } + break; + case 3: + if (!base.slice_from("\u05DE\u05F2\u05D3")) + { + return false; + } + break; + case 4: + if 
(!base.slice_from("\u05D1\u05F2\u05D8")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u05D1\u05F2\u05E1")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u05F0\u05F2\u05D6")) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u05D8\u05E8\u05F2\u05D1")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u05DC\u05F2\u05D8")) + { + return false; + } + break; + case 9: + if (!base.slice_from("\u05E7\u05DC\u05F2\u05D1")) + { + return false; + } + break; + case 10: + if (!base.slice_from("\u05E8\u05F2\u05D1")) + { + return false; + } + break; + case 11: + if (!base.slice_from("\u05E8\u05F2\u05E1")) + { + return false; + } + break; + case 12: + if (!base.slice_from("\u05E9\u05F0\u05F2\u05D2")) + { + return false; + } + break; + case 13: + if (!base.slice_from("\u05E9\u05DE\u05F2\u05E1")) + { + return false; + } + break; + case 14: + if (!base.slice_from("\u05E9\u05E0\u05F2\u05D3")) + { + return false; + } + break; + case 15: + if (!base.slice_from("\u05E9\u05E8\u05F2\u05D1")) + { + return false; + } + break; + case 16: + if (!base.slice_from("\u05D1\u05D9\u05E0\u05D3")) + { + return false; + } + break; + case 17: + if (!base.slice_from("\u05F0\u05D9\u05D8\u05E9")) + { + return false; + } + break; + case 18: + if (!base.slice_from("\u05D6\u05D9\u05E0\u05D2")) + { + return false; + } + break; + case 19: + if (!base.slice_from("\u05D8\u05E8\u05D9\u05E0\u05E7")) + { + return false; + } + break; + case 20: + if (!base.slice_from("\u05E6\u05F0\u05D9\u05E0\u05D2")) + { + return false; + } + break; + case 21: + if (!base.slice_from("\u05E9\u05DC\u05D9\u05E0\u05D2")) + { + return false; + } + break; + case 22: + if (!base.slice_from("\u05D1\u05F2\u05D2")) + { + return false; + } + break; + case 23: + if (!base.slice_from("\u05D4\u05F2\u05D1")) + { + return false; + } + break; + case 24: + if (!base.slice_from("\u05E4\u05D0\u05E8\u05DC\u05D9\u05E8")) + { + return false; + } + break; + case 25: + if 
(!base.slice_from("\u05E9\u05D8\u05F2")) + { + return false; + } + break; + case 26: + if (!base.slice_from("\u05E9\u05F0\u05E2\u05E8")) + { + return false; + } + break; + } + break; + case 4: + lab1: { + var /** number */ v_2 = base.limit - base.cursor; + lab2: { + if (!r_R1()) + { + break lab2; + } + if (!base.slice_del()) + { + return false; + } + break lab1; + } + base.cursor = base.limit - v_2; + if (!base.slice_from("\u05D8")) + { + return false; + } + } + base.ket = base.cursor; + if (!(base.eq_s_b("\u05D1\u05E8\u05D0\u05DB"))) + { + break lab0; + } + var /** number */ v_3 = base.limit - base.cursor; + lab3: { + if (!(base.eq_s_b("\u05D2\u05E2"))) + { + base.cursor = base.limit - v_3; + break lab3; + } + } + base.bra = base.cursor; + if (!base.slice_from("\u05D1\u05E8\u05E2\u05E0\u05D2")) + { + return false; + } + break; + case 5: + if (!base.slice_from("\u05D2\u05F2")) + { + return false; + } + break; + case 6: + if (!base.slice_from("\u05E0\u05E2\u05DE")) + { + return false; + } + break; + case 7: + if (!base.slice_from("\u05E9\u05E8\u05F2\u05D1")) + { + return false; + } + break; + case 8: + if (!base.slice_from("\u05DE\u05F2\u05D3")) + { + return false; + } + break; + case 9: + if (!base.slice_from("\u05D1\u05F2\u05D8")) + { + return false; + } + break; + case 10: + if (!base.slice_from("\u05D1\u05F2\u05E1")) + { + return false; + } + break; + case 11: + if (!base.slice_from("\u05F0\u05F2\u05D6")) + { + return false; + } + break; + case 12: + if (!base.slice_from("\u05D8\u05E8\u05F2\u05D1")) + { + return false; + } + break; + case 13: + if (!base.slice_from("\u05DC\u05F2\u05D8")) + { + return false; + } + break; + case 14: + if (!base.slice_from("\u05E7\u05DC\u05F2\u05D1")) + { + return false; + } + break; + case 15: + if (!base.slice_from("\u05E8\u05F2\u05D1")) + { + return false; + } + break; + case 16: + if (!base.slice_from("\u05E8\u05F2\u05E1")) + { + return false; + } + break; + case 17: + if (!base.slice_from("\u05E9\u05F0\u05F2\u05D2")) + { + 
return false; + } + break; + case 18: + if (!base.slice_from("\u05E9\u05DE\u05F2\u05E1")) + { + return false; + } + break; + case 19: + if (!base.slice_from("\u05E9\u05E0\u05F2\u05D3")) + { + return false; + } + break; + case 20: + if (!base.slice_from("\u05D1\u05D9\u05E0\u05D3")) + { + return false; + } + break; + case 21: + if (!base.slice_from("\u05F0\u05D9\u05D8\u05E9")) + { + return false; + } + break; + case 22: + if (!base.slice_from("\u05D6\u05D9\u05E0\u05D2")) + { + return false; + } + break; + case 23: + if (!base.slice_from("\u05D8\u05E8\u05D9\u05E0\u05E7")) + { + return false; + } + break; + case 24: + if (!base.slice_from("\u05E6\u05F0\u05D9\u05E0\u05D2")) + { + return false; + } + break; + case 25: + if (!base.slice_from("\u05E9\u05DC\u05D9\u05E0\u05D2")) + { + return false; + } + break; + case 26: + if (!base.slice_from("\u05D1\u05F2\u05D2")) + { + return false; + } + break; + case 27: + if (!base.slice_from("\u05D4\u05F2\u05D1")) + { + return false; + } + break; + case 28: + if (!base.slice_from("\u05E4\u05D0\u05E8\u05DC\u05D9\u05E8")) + { + return false; + } + break; + case 29: + if (!base.slice_from("\u05E9\u05D8\u05F2")) + { + return false; + } + break; + case 30: + if (!base.slice_from("\u05E9\u05F0\u05E2\u05E8")) + { + return false; + } + break; + case 31: + if (!base.slice_from("\u05D1\u05E8\u05E2\u05E0\u05D2")) + { + return false; + } + break; + case 32: + if (!r_R1()) + { + break lab0; + } + if (!base.slice_from("\u05D4")) + { + return false; + } + break; + case 33: + lab4: { + var /** number */ v_4 = base.limit - base.cursor; + lab5: { + lab6: { + var /** number */ v_5 = base.limit - base.cursor; + lab7: { + if (!(base.eq_s_b("\u05D2"))) + { + break lab7; + } + break lab6; + } + base.cursor = base.limit - v_5; + if (!(base.eq_s_b("\u05E9"))) + { + break lab5; + } + } + var /** number */ v_6 = base.limit - base.cursor; + lab8: { + if (!r_R1plus3()) + { + base.cursor = base.limit - v_6; + break lab8; + } + if 
(!base.slice_from("\u05D9\u05E1")) + { + return false; + } + } + break lab4; + } + base.cursor = base.limit - v_4; + if (!r_R1()) + { + break lab0; + } + if (!base.slice_del()) + { + return false; + } + } + break; + } + } + base.cursor = base.limit - v_1; + var /** number */ v_7 = base.limit - base.cursor; + lab9: { + base.ket = base.cursor; + among_var = base.find_among_b(a_5); + if (among_var == 0) + { + break lab9; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + break lab9; + } + if (!base.slice_del()) + { + return false; + } + break; + case 2: + if (!r_R1()) + { + break lab9; + } + if (!(base.in_grouping_b(g_consonant, 1489, 1520))) + { + break lab9; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + base.cursor = base.limit - v_7; + var /** number */ v_8 = base.limit - base.cursor; + lab10: { + base.ket = base.cursor; + among_var = base.find_among_b(a_6); + if (among_var == 0) + { + break lab10; + } + base.bra = base.cursor; + switch (among_var) { + case 1: + if (!r_R1()) + { + break lab10; + } + if (!base.slice_del()) + { + return false; + } + break; + } + } + base.cursor = base.limit - v_8; + var /** number */ v_9 = base.limit - base.cursor; + lab11: { + while(true) + { + var /** number */ v_10 = base.limit - base.cursor; + lab12: { + golab13: while(true) + { + var /** number */ v_11 = base.limit - base.cursor; + lab14: { + base.ket = base.cursor; + lab15: { + var /** number */ v_12 = base.limit - base.cursor; + lab16: { + if (!(base.eq_s_b("GE"))) + { + break lab16; + } + break lab15; + } + base.cursor = base.limit - v_12; + if (!(base.eq_s_b("TSU"))) + { + break lab14; + } + } + base.bra = base.cursor; + if (!base.slice_del()) + { + return false; + } + base.cursor = base.limit - v_11; + break golab13; + } + base.cursor = base.limit - v_11; + if (base.cursor <= base.limit_backward) + { + break lab12; + } + base.cursor--; + } + continue; + } + base.cursor = base.limit - v_10; + break; + } + } + 
base.cursor = base.limit - v_9; + return true; + }; + + this.stem = /** @return {boolean} */ function() { + r_prelude(); + var /** number */ v_2 = base.cursor; + r_mark_regions(); + base.cursor = v_2; + base.limit_backward = base.cursor; base.cursor = base.limit; + r_standard_suffix(); + base.cursor = base.limit_backward; + return true; + }; + + /**@return{string}*/ + this['stemWord'] = function(/**string*/word) { + base.setCurrent(word); + this.stem(); + return base.getCurrent(); + }; +}; + +window['YiddishStemmer'] = YiddishStemmer; diff --git a/license.html b/license.html new file mode 100644 index 0000000..687ee30 --- /dev/null +++ b/license.html @@ -0,0 +1,111 @@ + + + + + + + + + + License - Snowball + + + + + + + + + + +
+
+ +
+

License

+ + +

+Except where explicitly noted, all the software given out on this Snowball site +is covered by the 3-clause BSD License: +

+ +
+Copyright (c) 2001, Dr Martin Porter,
+Copyright (c) 2002, Richard Boulton.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ +

+Essentially, all this means is that you can do what you like with the code, +except claim another Copyright for it, or claim that it is issued under a different +license. The software is also issued without warranties, which means that if anyone +suffers through its use, they cannot come back and sue you. +You also have to alert anyone to whom you give the Snowball +software to the fact that it is covered by the BSD license. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/lists.html b/lists.html new file mode 100644 index 0000000..68ed031 --- /dev/null +++ b/lists.html @@ -0,0 +1,101 @@ + + + + + + + + + + Mailing Lists - Snowball + + + + + + + + + + +
+
+ +
+

Mailing Lists

+ + +

+There's one active mailing list related to Snowball: +

+ +
    +
  • Snowball-discuss is a list for general discussion of anything related to Snowball. +Release announcements will also be posted to this list. +
    +Subscribe | +Archives +
+ +

+Note that this mailing list will reject postings from non-subscribers (due to +the immense amount of spam received otherwise). The list is fairly +low-traffic, but if you don't wish to receive messages (but wish to be able to +post), you can disable sending of messages in the mailing list options after +subscribing. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/otherapps/romanian/index.html b/otherapps/romanian/index.html new file mode 100644 index 0000000..08d06ed --- /dev/null +++ b/otherapps/romanian/index.html @@ -0,0 +1,244 @@ + + + + + + + + + + Two Romanian stemmers - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Two Romanian stemmers

+ + +

Links to resources

+ + + +

+In swift succession, we received in 2006 two stemmers for Romanian
+written in Snowball.
+Here is the original correspondence:
+

+ +
+From: Erwin Glockner <eglockne@ix.urz.uni-heidelberg.de>
+To: snowball-discuss
+Date: Wed, 07 Jun 2006 00:06:30 +0200
+Subject: [Snowball-discuss] romanian stemmer
+
+Hello everyone,
+
+my name is Erwin Glockner, I'm a student of computational linguistics in
+Heidelberg, Germany. Together with my fellow students Doina Gliga and
+Marina Stegarescu we started to write a romanian stemmer in Snowball.
+We planned to finish the stemmer until end of this month. We would be
+happy if the stemmer would be accepted as part of the Snowball-distribution.
+There is still some work to do, e.g. evaluating the stemmer, making a
+stopwords-list, unicode support, etc. After finishing this we will send
+you our stemmer with the corresponding files, but I couldn't find any
+email address to whom the stemmer should be sent to.
+Could please someone tell me the address(es)?
+
+With kind regards,
+E. Glockner, D. Gliga, M. Stegarescu.
+
+ +
+From: Erwin Glockner <eglockne@ix.urz.uni-heidelberg.de>
+To: richard@lemurconsulting.com,
+    martin.porter@grapeshot.co.uk
+Date: Tue Jul 18 19:43:39 2006
+Subject: romanian stemmer
+
+Dear Mr. Porter, dear Mr. Boulton,
+
+we finally finished the Romanian stemmer. Unfortunately evaluation took
+more time than expected.
+However, it was an interesting experience creating the stemmer, and we
+are happy to send you the result of our work.
+The attachment-file is a Tarball-zipped file with (hopefully) all files
+needed. The files and the stemmer as well are encoded in UTF-8. Please
+inform us if something is missing.
+
+We would be happy if the Romanian stemmer would be accepted and
+integrated into the official Snowball distribution. We agree of course
+to license the stemmer under the same terms as the existing snowball
+software.
+
+We're looking forward to hear from you soon.
+
+
+With kind regards,
+
+Marina, Doina and Erwin.
+
+Attachment: [romanian1.tgz]
+
+ +
+From: Irina Tirdea <irina.tirdea@gmail.com>
+To: richard@lemurconsulting.com,
+    martin.porter@grapeshot.co.uk
+Date: Mon Jul 31 10:19:51 2006
+Subject: Romanian stemmer
+
+Hello,
+
+My name is Irina Tirdea and I have developed a Romanian stemmer in Snowball
+as part of my bachelor thesis, in Bucharest, Romania. I am sending you the
+code attached (with vocabulary and stop word list files) and I hope you will
+accept and integrate it as a part of the Snowball project. I am ready to
+release the stemmer under the BSD license, just as the Snowball software.
+The files have been written in UTF-8 encoding (on a Linux system).
+
+Looking forward to hear from you.
+
+Kind regards,
+Irina Tirdea
+
+Attachment: [romanian2.tgz]
+
+ +
+From: martin.porter@grapeshot.co.uk (Martin Porter)
+To: snowball-discuss
+Cc: atordai@science.uva.nl,
+    eglockne@ix.urz.uni-heidelberg.de,
+    irina.tirdea@gmail.com
+Date: Mon Jul 31 10:43:05 BST 2006
+Subject: Tardy response to submissions to Snowball
+
+I am sending this general email as a kind of apology, for having done nothing
+so far on the following generously sent Snowball submissions:
+
+7 June, from E. Glockner: a Romanian stemmer
+8 June, from A. Tordai: a Hungarian stemmer
+
+and this morning another Romanian stemmer arrived,
+
+31 July, from I. Tirdea, a Romanian stemmer
+
+After the first submission I promised to look at it "next week", so Mr Glockner
+has probably been wondering what has happened. [. . .] I will make a point of
+looking at these submissions this week,
+
+More soon,
+
+Martin
+
+ +
+From: martin.porter@grapeshot.co.uk (Martin Porter)
+To: snowball-discuss
+Cc: irina.tirdea@gmail.com,
+    eglockne@ix.urz.uni-heidelberg.de,
+    mstegare@hotmail.com,
+    doina_gliga@yahoo.co.uk,
+    eglockner@hotmail.com
+Date: Wed Sep 06 12:39:16 BST 2006
+Subject: Romanian stemmer
+
+To the originators of the Romanian stemmers,
+
+I have now found time to do some preliminary work on the Romanian stemmer. I
+should explain that part of the complication has been the receipt, no more
+than ten days apart, of two Romanian stemmers in Snowball, the first
+(romanian1) from [Glockner, Gliga, and Stegarescu] in Heidelberg, the second
+(romanian2) from Tirdea in Bucharest.
+
+[. . . .]
+
+I have put together a vocabulary by combining the vocabularies provided with
+romanian1 and romanian2. This appears in column 1. Column 2 is the stemmed
+form produced by romanian1, and column 3 the stemmed form produced by
+romanian2. If the entry in column 3 is blank, both stemmers are producing the
+same result.
+
+You might care to compare the two approaches.
+
+My own feeling is that romanian1 does a more thorough job of ending removal,
+but unlike romanian2 has a habit of discarding too much from short words.
+aberant->ab, abatere->ab, aburi->ab are examples of this. In romanian1 the R2
+test is rarely used (it seems to me that 'R1 or R2' is equivalent to 'R1',
+since p2 is never to the left of p1.)
+
+I might have a go at making some modifications here. Needless to say, I am
+not familiar with Romanian, but the similarity to the other Romance
+languages, especially Italian, enables one to grasp the essential features of
+the morphology.
+
+What we would like to do is to have a single stemmer for release from the
+snowball site, if that is possible, and giving all necessary credits, along
+the lines of the recent addition,
+
+http://snowballstem.org/algorithms/hungarian/stemmer.html
+
+Hope to hear from you,
+
+Martin Porter
+
+ +

+Finally we decided to produce our own Romanian stemmer as described on the +Romanian stemmer page. The submitted stemmers both contain stop word lists, +available inside the tarballs. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/otherapps/romanian/index.tt b/otherapps/romanian/index.tt new file mode 100644 index 0000000..95e66ea --- /dev/null +++ b/otherapps/romanian/index.tt @@ -0,0 +1,180 @@ +[% header('Two Romanian stemmers') %] + +

Links to resources

+ + + +

+In swift succession, we received in 2006 two stemmers for Romanian +written in Snowball. +Here is the original correspondence, +

+ +
+From: Erwin Glockner <eglockne@ix.urz.uni-heidelberg.de>
+To: snowball-discuss
+Date: Wed, 07 Jun 2006 00:06:30 +0200
+Subject: [Snowball-discuss] romanian stemmer
+
+Hello everyone,
+
+my name is Erwin Glockner, I'm a student of computational linguistics in
+Heidelberg, Germany. Together with my fellow students Doina Gliga and
+Marina Stegarescu we started to write a romanian stemmer in Snowball.
+We planned to finish the stemmer until end of this month. We would be
+happy if the stemmer would be accepted as part of the Snowball-distribution.
+There is still some work to do, e.g. evaluating the stemmer, making a
+stopwords-list, unicode support, etc. After finishing this we will send
+you our stemmer with the corresponding files, but I couldn't find any
+email address to whom the stemmer should be sent to.
+Could please someone tell me the address(es)?
+
+With kind regards,
+E. Glockner, D. Gliga, M. Stegarescu.
+
+ +
+From: Erwin Glockner <eglockne@ix.urz.uni-heidelberg.de>
+To: richard@lemurconsulting.com,
+    martin.porter@grapeshot.co.uk
+Date: Tue Jul 18 19:43:39 2006
+Subject: romanian stemmer
+
+Dear Mr. Porter, dear Mr. Boulton,
+
+we finally finished the Romanian stemmer. Unfortunately evaluation took
+more time than expected.
+However, it was an interesting experience creating the stemmer, and we
+are happy to send you the result of our work.
+The attachment-file is a Tarball-zipped file with (hopefully) all files
+needed. The files and the stemmer as well are encoded in UTF-8. Please
+inform us if something is missing.
+
+We would be happy if the Romanian stemmer would be accepted and
+integrated into the official Snowball distribution. We agree of course
+to license the stemmer under the same terms as the existing snowball
+software.
+
+We're looking forward to hear from you soon.
+
+
+With kind regards,
+
+Marina, Doina and Erwin.
+
+Attachment: [romanian1.tgz]
+
+ +
+From: Irina Tirdea <irina.tirdea@gmail.com>
+To: richard@lemurconsulting.com,
+    martin.porter@grapeshot.co.uk
+Date: Mon Jul 31 10:19:51 2006
+Subject: Romanian stemmer
+
+Hello,
+
+My name is Irina Tirdea and I have developed a Romanian stemmer in Snowball
+as part of my bachelor thesis, in Bucharest, Romania. I am sending you the
+code attached (with vocabulary and stop word list files) and I hope you will
+accept and integrate it as a part of the Snowball project. I am ready to
+release the stemmer under the BSD license, just as the Snowball software.
+The files have been written in UTF-8 encoding (on a Linux system).
+
+Looking forward to hear from you.
+
+Kind regards,
+Irina Tirdea
+
+Attachment: [romanian2.tgz]
+
+ +
+From: martin.porter@grapeshot.co.uk (Martin Porter)
+To: snowball-discuss
+Cc: atordai@science.uva.nl,
+    eglockne@ix.urz.uni-heidelberg.de,
+    irina.tirdea@gmail.com
+Date: Mon Jul 31 10:43:05 BST 2006
+Subject: Tardy response to submissions to Snowball
+
+I am sending this general email as a kind of apology, for having done nothing
+so far on the following generously sent Snowball submissions:
+
+7 June, from E. Glockner: a Romanian stemmer
+8 June, from A. Tordai: a Hungarian stemmer
+
+and this morning another Romanian stemmer arrived,
+
+31 July, from I. Tirdea, a Romanian stemmer
+
+After the first submission I promised to look at it "next week", so Mr Glockner
+has probably been wondering what has happened. [. . .] I will make a point of
+looking at these submissions this week,
+
+More soon,
+
+Martin
+
+ +
+From: martin.porter@grapeshot.co.uk (Martin Porter)
+To: snowball-discuss
+Cc: irina.tirdea@gmail.com,
+    eglockne@ix.urz.uni-heidelberg.de,
+    mstegare@hotmail.com,
+    doina_gliga@yahoo.co.uk,
+    eglockner@hotmail.com
+Date: Wed Sep 06 12:39:16 BST 2006
+Subject: Romanian stemmer
+
+To the originators of the Romanian stemmers,
+
+I have now found time to do some preliminary work on the Romanian stemmer. I
+should explain that part of the complication has been the receipt, no more
+than ten days apart, of two Romanian stemmers in Snowball, the first
+(romanian1) from [Glockner, Gliga, and Stegarescu] in Heidelberg, the second
+(romanian2) from Tirdea in Bucharest.
+
+[. . . .]
+
+I have put together a vocabulary by combining the vocabularies provided with
+romanian1 and romanian2. This appears in column 1. Column 2 is the stemmed
+form produced by romanian1, and column 3 the stemmed form produced by
+romanian2. If the entry in column 3 is blank, both stemmers are producing the
+same result.
+
+You might care to compare the two approaches.
+
+My own feeling is that romanian1 does a more thorough job of ending removal,
+but unlike romanian2 has a habit of discarding too much from short words.
+aberant->ab, abatere->ab, aburi->ab are examples of this. In romanian1 the R2
+test is rarely used (it seems to me that 'R1 or R2' is equivalent to 'R1',
+since p2 is never to the left of p1.)
+
+I might have a go at making some modifications here. Needless to say, I am
+not familiar with Romanian, but the similarity to the other Romance
+languages, especially Italian, enables one to grasp the essential features of
+the morphology.
+
+What we would like to do is to have a single stemmer for release from the
+snowball site, if that is possible, and giving all necessary credits, along
+the lines of the recent addition,
+
+http://snowballstem.org/algorithms/hungarian/stemmer.html
+
+Hope to hear from you,
+
+Martin Porter
+
+ +

+Finally we decided to produce our own Romanian stemmer as described on the +Romanian stemmer page. The submitted stemmers both contain stop word lists, +available inside the tarballs. +

+ +[% footer %] diff --git a/otherapps/romanian/romanian1.tgz b/otherapps/romanian/romanian1.tgz new file mode 100755 index 0000000..136dbc5 Binary files /dev/null and b/otherapps/romanian/romanian1.tgz differ diff --git a/otherapps/romanian/romanian2.tgz b/otherapps/romanian/romanian2.tgz new file mode 100755 index 0000000..c96661e Binary files /dev/null and b/otherapps/romanian/romanian2.tgz differ diff --git a/otherapps/schinke/index.html b/otherapps/schinke/index.html new file mode 100644 index 0000000..77703ae --- /dev/null +++ b/otherapps/schinke/index.html @@ -0,0 +1,308 @@ + + + + + + + + + + The Schinke Latin stemming algorithm - Snowball + + + + + + + + + + +
+
+
+ +
+
+

The Schinke Latin stemming algorithm

+ + +

Links to resources

+ + + +

+(A note by Martin Porter.) +

+ +

+The Schinke Latin stemming algorithm is described in, +

+ +
+ Schinke R, Greengrass M, Robertson AM and Willett P (1996) A stemming algorithm for Latin text + databases. Journal of Documentation, 52: 172-187. +
+ +

+It has the feature that it stems each word to two forms, noun and verb. For example, +

+ +
+                NOUN        VERB
+                ----        ----
+    aquila      aquil       aquila
+    portat      portat      porta
+    portis      port        por
+
+ +

+Here (slightly reformatted) are the rules of the stemmer, +

+ +
+1. (start)
+
+2.  Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
+    respectively.
+
+3.  If the word ends in '-que' then
+        if the word is on the list shown in Figure 4, then
+            write the original word to both the noun-based and verb-based
+            stem dictionaries and go to 8.
+        else remove '-que'
+
+    [Figure 4 was
+
+        atque quoque neque itaque absque apsque abusque adaeque adusque denique
+        deque susque oblique peraeque plenisque quandoque quisque quaeque
+        cuiusque cuique quemque quamque quaque quique quorumque quarumque
+        quibusque quosque quasque quotusquisque quousque ubique undique usque
+        uterque utique utroque utribique torque coque concoque contorque
+        detorque decoque excoque extorque obtorque optorque retorque recoque
+        attorque incoque intorque praetorque]
+
+4.  Match the end of the word against the suffix list shown in Figure 6(a),
+    removing the longest matching suffix, (if any).
+
+    [Figure 6(a) was
+
+        -ibus -ius  -ae   -am   -as   -em   -es   -ia
+        -is   -nt   -os   -ud   -um   -us   -a    -e
+        -i    -o    -u]
+
+5.  If the resulting stem contains at least two characters then write this stem
+    to the noun-based stem dictionary.
+
+6.  Match the end of the word against the suffix list shown in Figure 6(b),
+    identifying the longest matching suffix, (if any).
+
+    [Figure 6(b) was
+
+    -iuntur-beris -erunt -untur -iunt  -mini  -ntur  -stis
+    -bor   -ero   -mur   -mus   -ris   -sti   -tis   -tur
+    -unt   -bo    -ns    -nt    -ri    -m     -r     -s
+    -t]
+
+    If any of the following suffixes are found then convert them as shown:
+
+        '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
+        '-beris', '-bor', and '-bo' to '-bi';
+        '-ero' to '-eri'
+
+    else remove the suffix in the normal way.
+
+7.  If the resulting stem contains at least two characters then write this stem
+    to the verb-based stem dictionary.
+
+8.  (end)
+
+ +

+Unfortunately I was not able to make the rules match the examples given, which +led to the following email correspondence, +

+ +
+From: Martin Porter
+To: Peter Willett
+Date: Mon Sep 10 15:11:51 2001
+Subject: Re: Stemming algorithms
+
+> ... I'm no longer working in the IR area,
+>spending all of my time on computational chemistry/drug discovery
+>research but I guess that Mark Sanderson would be interested in
+>Snowball - do you mind if I pass your email onto him?
+
+Peter,
+
+Well, actually, I do have a question, if you can cast your mind back. I've
+implemented the Latin Stemmer in Snowball (see below: you'll have to guess the
+semantics, but I'm sure you'll agree the syntax looks nice), and find that Fig
+5 of the 1996 Schinke paper doesn't correspond to the algorithm of fig 7, but to
+the algorithm with the extra rules concerning -ba-, -bi-, -sse- mentioned on
+page 182. Which is the "correct" algorithm - with or without those rules? If
+with, what is the exact criterion for their removal? A bigger problem is why
+the -nt is not removed from 'Apparebunt', given -nt as an ending in 6(a). Is
+-nt a misprint?
+
+Sorry to bother you with this, but the paper says you are the one "to whom all
+correspondence should be addressed" :-)
+
+Martin
+
+
+ Here is your algorithm in Snowball. The generated code will do about 1 million
+ Latin word in 5 seconds:
+
+ -------
+
+
+
strings ( noun_form  verb_form )
+
+routines (
+
+   map_letters
+   que_word
+)
+
+externals ( stem )
+
+define map_letters as (
+
+    do repeat ( goto ( ['j'] ) <- 'i' )
+    do repeat ( goto ( ['v'] ) <- 'u' )
+)
+
+backwardmode (
+
+    define que_word as (
+
+        ['que'] (
+            among (
+                'at' 'quo' 'ne' 'ita' 'abs' 'aps' 'abus' 'adae' 'adus'
+                'deni' 'de' 'sus' 'obli' 'perae' 'plenis' 'quando' 'quis'
+                'quae' 'cuius' 'cui' 'quem' 'quam' 'qua' 'qui'
+                'quorum' 'quarum' 'quibus' 'quos' 'quas' 'quotusquis'
+                'quous' 'ubi' 'undi' 'us' 'uter' 'uti' 'utro' 'utribi'
+                'tor' 'co' 'conco' 'contor' 'detor' 'deco' 'exco' 'extor'
+                'obtor' 'optor' 'retor' 'reco' 'attor' 'inco' 'intor'
+                'praetor'
+            ) atlimit ]
+            => noun_form
+            => verb_form
+        ) or fail(delete)
+    )
+)
+
+define stem as (
+
+    map_letters
+
+    backwards (
+        que_word or (
+            => noun_form
+            => verb_form
+
+            $noun_form backwards try (
+                [substring] hop 2
+                among (
+                    'ibus' 'ius' 'ae' 'am' 'as' 'em' 'es' 'ia' 'is' 'nt'
+                    'os' 'ud' 'um' 'us' 'a' 'e' 'i' 'o' 'u'
+                        (delete)
+                )
+            )
+
+            $verb_form backwards try (
+                [substring] hop 2
+                among (
+                    'iuntur' 'erunt' 'untur' 'iunt' 'unt'
+                         (<-'i')
+                    'beris' 'bor' 'bo'
+                         (<-'bi')
+                    'ero'
+                         (<-'eri')
+                    'mini' 'ntur' 'stis' 'mur' 'mus' 'ris' 'sti' 'tis'
+                    'tur' 'ns' 'nt' 'ri' 'm' 'r' 's' 't'
+                         (delete)
+                )
+            )
+        )
+    )
+
+    /* the stemmed words are left in noun-form and verb-form, and can
+       be picked up as C strings at z->S[0] and z->S[1] through the API. */
+)
+
+ +
+ +
+From: Peter Willett
+To: Martin Porter
+Date: Mon Sep 10 20:25:24 2001
+Subject: Re: Stemming algorithms
+
+Martin
+
+Sorry - I just cannot answer.  Robertson has retired to Dorset while
+Schinke is now in - I think - Canada
+
+Peter
+
+ +

+Following this, I was unable to contact Schinke, and so the problems have +remained unresolved. +

+ +

+The linked zip file contains the stemmer, +generated C version, and sample data. +(The stemmer differs slightly from the version in the email above in that +it assembles the noun- and verb-forms of the stem in a single string with +space separation.) +voc.txt is a sample vocabulary, and joined.txt the vocabulary +joined with the two stemmed forms as three column output. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/otherapps/schinke/index.tt b/otherapps/schinke/index.tt new file mode 100644 index 0000000..6589bf4 --- /dev/null +++ b/otherapps/schinke/index.tt @@ -0,0 +1,168 @@ +[% header('The Schinke Latin stemming algorithm') %] + +

Links to resources

+ + + +

+(A note by Martin Porter.) +

+ +

+The Schinke Latin stemming algorithm is described in, +

+ +
+ Schinke R, Greengrass M, Robertson AM and Willett P (1996) A stemming algorithm for Latin text + databases. Journal of Documentation, 52: 172-187. +
+ +

+It has the feature that it stems each word to two forms, noun and verb. For example, +

+ +
+                NOUN        VERB
+                ----        ----
+    aquila      aquil       aquila
+    portat      portat      porta
+    portis      port        por
+
+ +

+Here (slightly reformatted) are the rules of the stemmer, +

+ +
+1. (start)
+
+2.  Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
+    respectively.
+
+3.  If the word ends in '-que' then
+        if the word is on the list shown in Figure 4, then
+            write the original word to both the noun-based and verb-based
+            stem dictionaries and go to 8.
+        else remove '-que'
+
+    [Figure 4 was
+
+        atque quoque neque itaque absque apsque abusque adaeque adusque denique
+        deque susque oblique peraeque plenisque quandoque quisque quaeque
+        cuiusque cuique quemque quamque quaque quique quorumque quarumque
+        quibusque quosque quasque quotusquisque quousque ubique undique usque
+        uterque utique utroque utribique torque coque concoque contorque
+        detorque decoque excoque extorque obtorque optorque retorque recoque
+        attorque incoque intorque praetorque]
+
+4.  Match the end of the word against the suffix list shown in Figure 6(a),
+    removing the longest matching suffix, (if any).
+
+    [Figure 6(a) was
+
+        -ibus -ius  -ae   -am   -as   -em   -es   -ia
+        -is   -nt   -os   -ud   -um   -us   -a    -e
+        -i    -o    -u]
+
+5.  If the resulting stem contains at least two characters then write this stem
+    to the noun-based stem dictionary.
+
+6.  Match the end of the word against the suffix list shown in Figure 6(b),
+    identifying the longest matching suffix, (if any).
+
+    [Figure 6(b) was
+
+    -iuntur-beris -erunt -untur -iunt  -mini  -ntur  -stis
+    -bor   -ero   -mur   -mus   -ris   -sti   -tis   -tur
+    -unt   -bo    -ns    -nt    -ri    -m     -r     -s
+    -t]
+
+    If any of the following suffixes are found then convert them as shown:
+
+        '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
+        '-beris', '-bor', and '-bo' to '-bi';
+        '-ero' to '-eri'
+
+    else remove the suffix in the normal way.
+
+7.  If the resulting stem contains at least two characters then write this stem
+    to the verb-based stem dictionary.
+
+8.  (end)
+
+ +

+Unfortunately I was not able to make the rules match the examples given, which +led to the following email correspondence, +

+ +
+From: Martin Porter
+To: Peter Willett
+Date: Mon Sep 10 15:11:51 2001
+Subject: Re: Stemming algorithms
+
+> ... I'm no longer working in the IR area,
+>spending all of my time on computational chemistry/drug discovery
+>research but I guess that Mark Sanderson would be interested in
+>Snowball - do you mind if I pass your email onto him?
+
+Peter,
+
+Well, actually, I do have a question, if you can cast your mind back. I've
+implemented the Latin Stemmer in Snowball (see below: you'll have to guess the
+semantics, but I'm sure you'll agree the syntax looks nice), and find that Fig
+5 of the 1996 Schinke paper doesn't correspond to the algorithm of fig 7, but to
+the algorithm with the extra rules concerning -ba-, -bi-, -sse- mentioned on
+page 182. Which is the "correct" algorithm - with or without those rules? If
+with, what is the exact criterion for their removal? A bigger problem is why
+the -nt is not removed from 'Apparebunt', given -nt as an ending in 6(a). Is
+-nt a misprint?
+
+Sorry to bother you with this, but the paper says you are the one "to whom all
+correspondence should be addressed" :-)
+
+Martin
+
+
+ Here is your algorithm in Snowball. The generated code will do about 1 million
+ Latin word in 5 seconds:
+
+ -------
+
+
+[% highlight_file('schinke') %] +
+ +
+From: Peter Willett
+To: Martin Porter
+Date: Mon Sep 10 20:25:24 2001
+Subject: Re: Stemming algorithms
+
+Martin
+
+Sorry - I just cannot answer.  Robertson has retired to Dorset while
+Schinke is now in - I think - Canada
+
+Peter
+
+ +

+Following this, I was unable to contact Schinke, and so the problems have +remained unresolved. +

+ +

+The linked zip file contains the stemmer, +generated C version, and sample data. +(The stemmer differs slightly from the version in the email above in that +it assembles the noun- and verb-forms of the stem in a single string with +space separation.) +voc.txt is a sample vocabulary, and joined.txt the vocabulary +joined with the two stemmed forms as three column output. +

+ +[% footer %] diff --git a/otherapps/schinke/schinke.tgz b/otherapps/schinke/schinke.tgz new file mode 100644 index 0000000..ecae7d7 Binary files /dev/null and b/otherapps/schinke/schinke.tgz differ diff --git a/projects.html b/projects.html new file mode 100644 index 0000000..d97aa22 --- /dev/null +++ b/projects.html @@ -0,0 +1,253 @@ + + + + + + + + + + Projects - Snowball + + + + + + + + + + +
+
+ +
+

Projects

+ + +

+This page lists projects which are related to Snowball in some way. +

+ +
+ +

Wrappers

+ +

+These projects allow Snowball-generated stemmers to be used from other +languages. +

+ +

+Aside from PyStemmer, we've not tested them to see if they successfully wrap +the Snowball-generated code, are well implemented, etc. These projects aren't +endorsed or recommended as such, but we hope they may be of interest. +

+ +

libstemmer in Go

+

+(added Feb 2013) Miki Tebeka has ported libstemmer_c to the Go programming +language. +

+ +

PyStemmer

+ +

+Richard Boulton put together some new Python bindings for snowball, inspired by +Andreas Jung's initial implementation of PyStemmer from 2001, but with a +different API. PyStemmer's current home is as part of the snowballstem github +project. +

+ +

Lingua::Stem::Snowball

+

+Lingua::Stem::Snowball is an XS module which provides a Perl interface to the +C versions of the Snowball stemmers. The Snowball stopwords lists are also +wrapped by Lingua::StopWords. +

+ +

PHP Stemmer

+

+PHP bindings written by Andrea Maccis, and largely inspired by Richard Boulton's PyStemmer. +

+ +

Node.js Stemmer

+

+A Node.js interface to the Snowball stemming algorithms, written by Andrea Maccis and largely inspired by Richard Boulton's PyStemmer. +

+ +
+ +

Reimplementations of the Stemming Algorithms

+ +

+These projects reimplement the Snowball algorithms, either in hand-written +code, or in code manually translated from the generated output for another +language. +

+ +

+We've not tested them to see if they correctly implement the stemming +algorithms, are well implemented, etc. These projects aren't endorsed +or recommended as such, but we hope they may be of interest. +

+ +

+If you want to use one of these stemmers, we suggest you take the sample +vocabulary for the corresponding natural language, and check that the +stemmer produces the corresponding stemmed output. +

+ +

Oleander Stemming Library

+

+A reimplementation of the Snowball stemming algorithms in C++. +

+ +

English (porter2) Stemmer in C++11

+

+(added Oct 2012) Sean Massung of the University of Illinois has implemented the porter2 stemmer in +C++11. +

+ +

NLTK

+

+(added June 2010) Python versions of nearly all the stemmers have been made +available by Peter Stahl at NLTK’s code repository. +

+ +

Javascript stemmers

+

+Javascript versions of nearly all the stemmers, created by Oleg Mazko by hand +from the C/Java output of the Snowball compiler. +

+ +

js-lingua-stem-ru

+

+This is a different implementation of the Russian stemmer in Javascript, +created by Mark A. Prisyazhnyuk. +

+ +

XSLT 3 Porter2 stemmer

+

+(added July 2019). Martin Holmes has implemented Porter2 in XSLT 3. In the same repo, there is also a re-implementation +of Porter2 in JavaScript. +

+ +

Contributed Stemmers on the old Snowball Website

+

+Martin and Richard collected and hosted contributed stemmers in a number +of programming languages, which you can still find on the old website. +

+ +
+ +

Consumers

+ +

+These projects use Snowball stemmers: +

+ +

Xapian

+

+A probabilistic search engine which supports stemming using Snowball. +

+ +

TextIndexNG

+

+A fulltext indexing solution for Zope with support for stemming using Snowball. +

+ +

urim

+

+(added Sep 2010) Developed by Oleg Mazko, Urim is a standalone, +offline tag-cloud builder engine, fully written in JavaScript and so +capable of integration into all Internet browsers. Available as +a Firefox add-on. With a JavaScript port of the Snowball stemmers (danish, +dutch, english, finnish, french, german, hungarian, italian, +norwegian, portuguese, russian, spanish, swedish, romanian, turkish) +also available as a separate library ready for developers. +

+ +

OpenFTS

+

+OpenFTS is an advanced PostgreSQL-based search engine, which supports using +Snowball for stemming. +

+ +

Manticore Search

+

<p>
+Manticore Search is an open source search server which supports stemming using Snowball.
+</p>

+ +

Snowball support in other tools

+ +

+These projects add Snowball support to other tools, or are tools including +support for Snowball. +

+ +

Pygments

+

+Pygments is a syntax highlighter written in Python, and includes support for +syntax highlighting Snowball code (since version 2.2). +

+ +
+

+Contact us if you have a project that you would like to be listed here. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..c2a49f4 --- /dev/null +++ b/robots.txt @@ -0,0 +1,2 @@ +User-agent: * +Allow: / diff --git a/runtime/use.html b/runtime/use.html new file mode 100644 index 0000000..9216484 --- /dev/null +++ b/runtime/use.html @@ -0,0 +1,464 @@ + + + + + + + + + + Using Snowball - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Using Snowball

+ + +

Links to resources

+ + + +

Compiling Snowball

+ +

+When you download Snowball, +it already contains a make file to allow you to build it, like so: +

+ +
+    make
+
+ +

+You can confirm it's working with a simple test like so: +

+ +
+    echo "running" | ./stemwords -l en
+
+ +

+which should output: run +

+ +

<p>
+There's no built-in way to install snowball currently - you can either copy
+the snowball binary to somewhere that's on your PATH
+(e.g. on a typical Linux machine: sudo cp snowball /usr/local/bin)
+or just run it from the source tree with ./snowball.
+</p>

+ +

Running Snowball

+ +

+The snowball compiler has the following command line syntax, +

+ +
+Usage: snowball SOURCE_FILE... [OPTIONS]
+
+Supported options:
+  -o, -output OUTPUT_BASE
+  -s, -syntax
+  -comments
+  -j, -java
+  -cs, -csharp
+  -c++
+  -pascal
+  -py, -python
+  -js[=TYPE]                       generate Javascript (TYPE values:
+                                   esm global, default: global)
+  -rust
+  -go
+  -ada
+  -w, -widechars
+  -u, -utf8
+  -n, -name CLASS_NAME
+  -ep, -eprefix EXTERNAL_PREFIX
+  -vp, -vprefix VARIABLE_PREFIX
+  -i, -include DIRECTORY
+  -r, -runtime DIRECTORY
+  -p, -parentclassname CLASS_NAME  fully qualified parent class name
+  -P, -Package PACKAGE_NAME        package name for stemmers
+  -S, -Stringclass STRING_CLASS    StringBuffer-compatible class
+  -a, -amongclass AMONG_CLASS      fully qualified name of the Among class
+  -gop, -gopackage PACKAGE_NAME    Go package name for stemmers
+  -gor, -goruntime PACKAGE_NAME    Go snowball runtime package
+  --help                           display this help and exit
+  --version                        output version information and exit
+
+ +

+For example, +

+ +
+    snowball danish.sbl -o q/danish
+    snowball danish.sbl -syntax
+    snowball danish.sbl -output q/danish -ep danish_
+
+ +

+The first argument,  SOURCE_FILE, is the name of the Snowball file to be compiled. Unless you specify a different programming language to +generate code for, the default is to generate ISO C which results in two output +files, a C source in  OUTPUT_BASE.c  and a corresponding header file in  OUTPUT_BASE.h. This is similar for other +programming languages, e.g. if option  -java  is +present, Java output is produced in  OUTPUT_BASE.java. +

+ +

+Some options are only valid when generating code for particular programming +languages. For example, the  -widechars, + -utf8,  -eprefix  and + -vprefix  options are specific to C and C++. +

+ +

ISO C generation

+ +

+In the absence of the  -eprefix  and  -vprefix  options, the list of +declared externals in the Snowball program, for example, +

+ +
    externals ( stem_1 stem_2 moderate )
+
+ + +

+gives rise to a header file containing, +

+ +
    extern struct SN_env * create_env(void);
+    extern void close_env(struct SN_env * z);
+
+    extern int moderate(struct SN_env * z);
+    extern int stem_2(struct SN_env * z);
+    extern int stem_1(struct SN_env * z);
+
+ +

+If  -eprefix  is used, its string,  S1, is prefixed to each external +name, for example +

+ +
+    -eprefix Khotanese_
+
+ +

+would give rise to the header file, +

+ +
    extern struct SN_env * Khotanese_create_env(void);
+    extern void Khotanese_close_env(struct SN_env * z);
+
+    extern int Khotanese_moderate(struct SN_env * z);
+    extern int Khotanese_stem_2(struct SN_env * z);
+    extern int Khotanese_stem_1(struct SN_env * z);
+
+ +

+If  -vprefix  is used, all Snowball strings, integers and booleans give +rise to a  #define  line in the header file. For example +

+ +
+    -eprefix Khotanese_ -vprefix Khotanese_variable
+
+ +

+would give rise to the header file, +

+ +
    extern struct SN_env * Khotanese_create_env(void);
+    extern void Khotanese_close_env(struct SN_env * z);
+
+    #define Khotanese_variable_ch (S[0])
+    #define Khotanese_variable_Y_found (B[0])
+    #define Khotanese_variable_p2 (I[1])
+    #define Khotanese_variable_p1 (I[0])
+    extern int Khotanese_stem(struct SN_env * z);
+
+ +

+The -utf8 and -widechars  options affect how +the generated C/C++ code expects strings to be represented - UTF-8 or +wide-character Unicode (stored using 2 bytes per codepoint), or if neither is +specified, one byte per codepoint using either ISO-8859-1 or another encoding. +

+ +

+For other programming languages, one of these three options is effectively +implicitly hard-coded (except wide-characters may be wider) - e.g. C#, Java, +Javascript and Python use wide characters; Ada, Go and Rust use UTF-8; Pascal +uses ISO-8859-1. Since Snowball 2.0 it's possible with a little care to write +Snowball code that works regardless of how characters are represented. See +section 12 of the Snowball manual for +more details. +

+ +

+The  -runtime  option is used to prepend a path to any  #include +lines in the generated code, and is useful when the runtime header files (i.e. +those files in the runtime directory in the standard distribution) are not +in the same location as the generated source files. It is used when +building the libstemmer library, and may be useful for other projects. +

+ + + +

Other options

+ +

+If  -syntax  is used the other options are ignored, and the syntax tree +of the Snowball program is directed to  stdout. This can be a handy way +of checking that you have got the bracketing right in the program you have +written. +

+ +

+Any number of  -include  options may be present, for example, +

+ +
+    snowball testfile -output test -ep danish_  \
+             -include /home/martin/Snowball/codesets  \
+             -include extras
+
+ +

+Each  -include  is followed by a directory name. With a chain of +directories  D1,  D2  ...  Dn, a Snowball  get  directive, +

+ +
    get 'F'
+
+ + +

+causes  F  to be searched for in the successive locations, +

+ +
+    F
+    D1/F
+    D2/F
+    ...
+    Dn/F
+
+ +

+— that is, the current directory, followed in turn by directories  D1  to +Dn. +

+ +

The Snowball API

+ +

+To access Snowball from C, include the header  api.h, and any headers +generated from the Snowball scripts you wish to use.  api.h  declares +

+ +
    struct SN_env { /* ... */ };
+    extern void SN_set_current(struct SN_env * z, int size, char * s);
+
+ +

+Continuing the previous example, you set up an environment to call the +resources of the Khotanese module with +

+ +
    struct SN_env * z;
+    z = Khotanese_create_env();
+
+ +

+Snowball has the concept of a ‘current string’. +This can be set up by, +

+ +
    SN_set_current(z, i, b);
+
+ +

+This defines the current string as the  i  bytes of data starting at +address  b. The externals can then be called, +

+ +
    Khotanese_moderate(z);
+    /* ... */
+    Khotanese_stem_1(z);
+
+ +

+They give a 1 or 0 result, corresponding to the t or f result of +the Snowball routine. +

+ +

+And later, +

+ +
    Khotanese_close_env(z);
+
+ +

+To release the space raised by z back to the system. You can do this for a +number of Snowball modules at the same time: you will need a separate +struct SN_env * z;  for each module. +

+ +

+The current string is given by the  z->l  bytes of data starting at  z->p. +The string is not zero-terminated, but you can zero terminate it yourself with +

+ +
    z->p[z->l] = 0;
+
+ +

+(There is always room for this last zero byte.) For example, +

+ +
    SN_set_current(z, strlen(s), s);
+    Khotanese_stem_1(z);
+    z->p[z->l] = 0;
+    printf("Khotanese-1 stems '%s' to '%s'\n", s, z->p);
+
+ +

+The values of the other variables can be accessed via the  #define +settings that result from the  -vprefix  option, although this should not +usually be necessary: +

+ +
    printf("p1 is %d\n", z->Khotanese_variable_p1);
+
+ +

+The stemming scripts on this Web site use Snowball very simply. +-vprefix  is left unset, and  -eprefix  is set to the name of the +script (usually the language the script is for). +

+ + + +

Debugging snowball scripts

+ +

+In the rare event that your Snowball script does not run perfectly the first time: +

+ +

+Remember that the option  -syntax  prints out the syntax tree. A question +mark can be included in Snowball as a command, and it will generate a call +debug(...). The defined  debug  in  runtime/utilities.c  (usually +commented out) can then be used. It causes the +current string to be sent to  stdout, with square brackets marking the +slice and vertical bar the position of c. Curly brackets mark the +end-limits of the string, which may be less than the whole string because +of the action of  setlimit. +

+ +

+At present there is no way of reporting the value of an integer or boolean. +

+ +

+If desperate, you can put debugging lines into the generated C program. +You can pass -comments to the snowball compiler to get it to +generate comments showing the correspondence with the Snowball source which +makes it easier to find where to add such debugging code. +

+ +

Compiler bugs

+ +

+If you hit a snowball compiler bug, try to +capture it in a small script before notifying us. +

+ +

Known problems in Snowball

+ +

+The main one is that it is possible to ‘pull the rug from under your own feet’ in +constructions like this: +

+ +
    [ do something ]
+    do something_else
+    ( C1 delete C2 ) or ( C3 )
+
+ + +

+Suppose  C1  gives t, the delete removes the slice established on the first +line, and  C2  gives f, so C3 is done with c set back to the value it had +before  C1  was obeyed — but this old value does not take account of the byte shift +caused by the delete. This problem was foreseen from the beginning when designing +Snowball, and recognised as a minor issue because it is an unnatural thing to want to +do. (C3  should not be an alternative to something which has deletion as an +occasional side-effect.) It may be addressed in the future. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/runtime/use.tt b/runtime/use.tt new file mode 100644 index 0000000..1a00df1 --- /dev/null +++ b/runtime/use.tt @@ -0,0 +1,381 @@ +[% header('Using Snowball') %] + +

Links to resources

+ + + +

Compiling Snowball

+ +

+When you download Snowball, +it already contains a make file to allow you to build it, like so: +

+ +
+    make
+
+ +

+You can confirm it's working with a simple test like so: +

+ +
+    echo "running" | ./stemwords -l en
+
+ +

+which should output: run +

+ +

+There's no built in way to install snowball currently - you can either copy +the snowball binary to somewhere that's on your PATH +(e.g. on a typical Linux machine: sudo cp snowball /usr/local/bin) +or just run it from the source tree with ./snowball). +

+ +

Running Snowball

+ +

+The snowball compiler has the following command line syntax, +

+ +
+[% snowball_help | html %]
+
+ +

+For example, +

+ +
+    snowball danish.sbl -o q/danish
+    snowball danish.sbl -syntax
+    snowball danish.sbl -output q/danish -ep danish_
+
+ +

+The first argument,  SOURCE_FILE, is the name of the Snowball file to be compiled. Unless you specify a different programming language to +generate code for, the default is to generate ISO C which results in two output +files, a C source in  OUTPUT_BASE.c  and a corresponding header file in  OUTPUT_BASE.h. This is similar for other +programming languages, e.g. if option  -java  is +present, Java output is produced in  OUTPUT_BASE.java. +

+ +

+Some options are only valid when generating code for particular programming +languages. For example, the  -widechars, + -utf8,  -eprefix  and + -vprefix  options are specific to C and C++. +

+ +

ISO C generation

+ +

+In the absence of the  -eprefix  and  -vprefix  options, the list of +declared externals in the Snowball program, for example, +

+ +[% highlight(" + externals ( stem_1 stem_2 moderate ) +") %] + +

+gives rise to a header file containing, +

+ +[% highlight(" + extern struct SN_env * create_env(void); + extern void close_env(struct SN_env * z); + + extern int moderate(struct SN_env * z); + extern int stem_2(struct SN_env * z); + extern int stem_1(struct SN_env * z); +", "c") %] + +

+If  -eprefix  is used, its string,  S1, is prefixed to each external +name, for example +

+ +
+    -eprefix Khotanese_
+
+ +

+would give rise to the header file, +

+ +[% highlight(" + extern struct SN_env * Khotanese_create_env(void); + extern void Khotanese_close_env(struct SN_env * z); + + extern int Khotanese_moderate(struct SN_env * z); + extern int Khotanese_stem_2(struct SN_env * z); + extern int Khotanese_stem_1(struct SN_env * z); +", "c") %] + +

+If  -vprefix  is used, all Snowball strings, integers and booleans give +rise to a  #define  line in the header file. For example +

+ +
+    -eprefix Khotanese_ -vprefix Khotanese_variable
+
+ +

+would give rise to the header file, +

+ +[% highlight(" + extern struct SN_env * Khotanese_create_env(void); + extern void Khotanese_close_env(struct SN_env * z); + + #define Khotanese_variable_ch (S[0]) + #define Khotanese_variable_Y_found (B[0]) + #define Khotanese_variable_p2 (I[1]) + #define Khotanese_variable_p1 (I[0]) + extern int Khotanese_stem(struct SN_env * z); +", "c") %] + +

+The -utf8 and -widechars  options affect how +the generated C/C++ code expects strings to be represented - UTF-8 or +wide-character Unicode (stored using 2 bytes per codepoint), or if neither is +specified, one byte per codepoint using either ISO-8859-1 or another encoding. +

+ +

+For other programming languages, one of these three options is effectively +implicitly hard-coded (except wide-characters may be wider) - e.g. C#, Java, +Javascript and Python use wide characters; Ada, Go and Rust use UTF-8; Pascal +uses ISO-8859-1. Since Snowball 2.0 it's possible with a little care to write +Snowball code that works regardless of how characters are represented. See +section 12 of the Snowball manual for +more details. +

+ +

+The  -runtime  option is used to prepend a path to any  #include +lines in the generated code, and is useful when the runtime header files (i.e. +those files in the runtime directory in the standard distribution) are not +in the same location as the generated source files. It is used when +building the libstemmer library, and may be useful for other projects. +

+ + + +

Other options

+ +

+If  -syntax  is used the other options are ignored, and the syntax tree +of the Snowball program is directed to  stdout. This can be a handy way +of checking that you have got the bracketing right in the program you have +written. +

+ +

+Any number of  -include  options may be present, for example, +

+ +
+    snowball testfile -output test -ep danish_  \
+             -include /home/martin/Snowball/codesets  \
+             -include extras
+
+ +

+Each  -include  is followed by a directory name. With a chain of +directories  D1,  D2  ...  Dn, a Snowball  get  directive, +

+ +[% highlight(" + get 'F' +") %] + +

+causes  F  to be searched for in the successive locations, +

+ +
+    F
+    D1/F
+    D2/F
+    ...
+    Dn/F
+
+ +

+— that is, the current directory, followed in turn by directories  D1  to +Dn. +

+ +

The Snowball API

+ +

+To access Snowball from C, include the header  api.h, and any headers +generated from the Snowball scripts you wish to use.  api.h  declares +

+ +[% highlight(" + struct SN_env { /* ... */ }; + extern void SN_set_current(struct SN_env * z, int size, char * s); +", "c") %] + +

+Continuing the previous example, you set up an environment to call the +resources of the Khotanese module with +

+ +[% highlight(" + struct SN_env * z; + z = Khotanese_create_env(); +", "c") %] + +

+Snowball has the concept of a ‘current string’. +This can be set up by, +

+ +[% highlight(" + SN_set_current(z, i, b); +", "c") %] + +

+This defines the current string as the  i  bytes of data starting at +address  b. The externals can then be called, +

+ +[% highlight(" + Khotanese_moderate(z); + /* ... */ + Khotanese_stem_1(z); +", "c") %] + +

+They give a 1 or 0 result, corresponding to the t or f result of +the Snowball routine. +

+ +

+And later, +

+ +[% highlight(" + Khotanese_close_env(z); +", "c") %] + +

+To release the space raised by z back to the system. You can do this for a +number of Snowball modules at the same time: you will need a separate +struct SN_env * z;  for each module. +

+ +

+The current string is given by the  z->l  bytes of data starting at  z->p. +The string is not zero-terminated, but you can zero terminate it yourself with +

+ +[% highlight(" + z->p[z->l] = 0; +", "c") %] + +

+(There is always room for this last zero byte.) For example, +

+ +[% highlight(' + SN_set_current(z, strlen(s), s); + Khotanese_stem_1(z); + z->p[z->l] = 0; + printf("Khotanese-1' _ " stems '%s' to '%s'" _ '\n", s, z->p); +', "c") %] + +

+The values of the other variables can be accessed via the  #define +settings that result from the  -vprefix  option, although this should not +usually be necessary: +

+ +[% highlight(' + printf("p1 is %d\n", z->Khotanese_variable_p1); +', "c") %] + +

+The stemming scripts on this Web site use Snowball very simply. +-vprefix  is left unset, and  -eprefix  is set to the name of the +script (usually the language the script is for). +

+ + + +

Debugging snowball scripts

+ +

+In the rare event that your Snowball script does not run perfectly the first time: +

+ +

+Remember that the option  -syntax  prints out the syntax tree. A question +mark can be included in Snowball as a command, and it will generate a call +debug(...). The defined  debug  in  runtime/utilities.c  (usually +commented out) can then be used. It causes the +current string to be sent to  stdout, with square brackets marking the +slice and vertical bar the position of c. Curly brackets mark the +end-limits of the string, which may be less than the whole string because +of the action of  setlimit. +

+ +

+At present there is no way of reporting the value of an integer or boolean. +

+ +

+If desperate, you can put debugging lines into the generated C program. +You can pass -comments to the snowball compiler to get it to +generate comments showing the correspondence with the Snowball source which +makes it easier to find where to add such debugging code. +

+ +

Compiler bugs

+ +

+If you hit a snowball compiler bug, try to +capture it in a small script before notifying us. +

+ +

Known problems in Snowball

+ +

+The main one is that it is possible to ‘pull the rug from under your own feet’ in +constructions like this: +

+ +[% highlight(' + [ do something ] + do something_else + ( C1 delete C2 ) or ( C3 ) +') %] + +

+Suppose  C1  gives t, the delete removes the slice established on the first +line, and  C2  gives f, so C3 is done with c set back to the value it had +before  C1  was obeyed — but this old value does not take account of the byte shift +caused by the delete. This problem was foreseen from the beginning when designing +Snowball, and recognised as a minor issue because it is an unnatural thing to want to +do. (C3  should not be an alternative to something which has deletion as an +occasional side-effect.) It may be addressed in the future. +

+ +[% footer %] diff --git a/snub-dodecahedron.gif b/snub-dodecahedron.gif new file mode 100644 index 0000000..219b84a Binary files /dev/null and b/snub-dodecahedron.gif differ diff --git a/styles.css b/styles.css new file mode 100644 index 0000000..b5bf61a --- /dev/null +++ b/styles.css @@ -0,0 +1,101 @@ +body { + min-height: 2000px; +} + +.navbar-static-top { + margin-bottom: 19px; +} + +/* From: https://css-tricks.com/snippets/css/make-pre-text-wrap/ */ +.license { + white-space: pre-wrap; /* css-3 */ + white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ + white-space: -pre-wrap; /* Opera 4-6 */ + white-space: -o-pre-wrap; /* Opera 7 */ + word-wrap: break-word; /* Internet Explorer 5.5+ */ +} + +.footer { + padding-top: 19px; + color: #777; + border-top: 1px solid #e5e5e5; +} + + td.linenos { background-color: #f0f0f0; padding-right: 10px; } + span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; } + pre { line-height: 125%; } + body .hll { background-color: #ffffcc } + .highlight { background: #f8f8f8; } + body .c { color: #408080; font-style: italic } /* Comment */ + body .err { border: 1px solid #FF0000 } /* Error */ + body .k { color: #008000; font-weight: bold } /* Keyword */ + body .o { color: #666666 } /* Operator */ + body .cm { color: #408080; font-style: italic } /* Comment.Multiline */ + body .cp { color: #BC7A00 } /* Comment.Preproc */ + body .c1 { color: #408080; font-style: italic } /* Comment.Single */ + body .cs { color: #408080; font-style: italic } /* Comment.Special */ + body .gd { color: #A00000 } /* Generic.Deleted */ + body .ge { font-style: italic } /* Generic.Emph */ + body .gr { color: #FF0000 } /* Generic.Error */ + body .gh { color: #000080; font-weight: bold } /* Generic.Heading */ + body .gi { color: #00A000 } /* Generic.Inserted */ + body .go { color: #888888 } /* Generic.Output */ + body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ + body .gs { font-weight: bold } /* Generic.Strong */ + body .gu { 
color: #800080; font-weight: bold } /* Generic.Subheading */ + body .gt { color: #0044DD } /* Generic.Traceback */ + body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ + body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ + body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ + body .kp { color: #008000 } /* Keyword.Pseudo */ + body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ + body .kt { color: #B00040 } /* Keyword.Type */ + body .m { color: #666666 } /* Literal.Number */ + body .s { color: #BA2121 } /* Literal.String */ + body .na { color: #7D9029 } /* Name.Attribute */ + body .nb { color: #008000 } /* Name.Builtin */ + body .nc { color: #0000FF; font-weight: bold } /* Name.Class */ + body .no { color: #880000 } /* Name.Constant */ + body .nd { color: #AA22FF } /* Name.Decorator */ + body .ni { color: #999999; font-weight: bold } /* Name.Entity */ + body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */ + body .nf { color: #0000FF } /* Name.Function */ + body .nl { color: #A0A000 } /* Name.Label */ + body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ + body .nt { color: #008000; font-weight: bold } /* Name.Tag */ + body .nv { color: #19177C } /* Name.Variable */ + body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ + body .w { color: #bbbbbb } /* Text.Whitespace */ + body .mf { color: #666666 } /* Literal.Number.Float */ + body .mh { color: #666666 } /* Literal.Number.Hex */ + body .mi { color: #666666 } /* Literal.Number.Integer */ + body .mo { color: #666666 } /* Literal.Number.Oct */ + body .sb { color: #BA2121 } /* Literal.String.Backtick */ + body .sc { color: #BA2121 } /* Literal.String.Char */ + body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ + body .s2 { color: #BA2121 } /* Literal.String.Double */ + body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */ + body .sh { color: #BA2121 } /* 
Literal.String.Heredoc */ + body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */ + body .sx { color: #008000 } /* Literal.String.Other */ + body .sr { color: #BB6688 } /* Literal.String.Regex */ + body .s1 { color: #BA2121 } /* Literal.String.Single */ + body .ss { color: #19177C } /* Literal.String.Symbol */ + body .bp { color: #008000 } /* Name.Builtin.Pseudo */ + body .vc { color: #19177C } /* Name.Variable.Class */ + body .vg { color: #19177C } /* Name.Variable.Global */ + body .vi { color: #19177C } /* Name.Variable.Instance */ + body .il { color: #666666 } /* Literal.Number.Integer.Long */ + +/* Stop from looking like an error. */ +code { + color: #00008B; + background-color: #EEEEFF; +} + +/* Indent definition lists. */ +dt { margin-left: 32px; } +dd { margin-left: 64px; } + +/* From: https://codepo8.github.io/css-fork-on-github-ribbon/ */ +#forkongithub a{background:#000;color:#fff;text-decoration:none;font-family:arial,sans-serif;text-align:center;font-weight:bold;padding:5px 40px;font-size:1rem;line-height:2rem;position:relative;transition:0.5s;}#forkongithub a:hover{background:#c11;color:#fff;}#forkongithub a::before,#forkongithub a::after{content:"";width:100%;display:block;position:absolute;top:1px;left:0;height:1px;background:#fff;}#forkongithub a::after{bottom:1px;top:auto;}@media screen and (min-width:800px){#forkongithub{position:absolute;display:block;top:0;right:0;width:200px;overflow:hidden;height:200px;z-index:9999;}#forkongithub a{width:200px;position:absolute;top:30px;right:-60px;transform:rotate(45deg);-webkit-transform:rotate(45deg);-ms-transform:rotate(45deg);-moz-transform:rotate(45deg);-o-transform:rotate(45deg);box-shadow:4px 4px 10px rgba(0,0,0,0.8);}} diff --git a/texts/apostrophe.html b/texts/apostrophe.html new file mode 100644 index 0000000..a65f29a --- /dev/null +++ b/texts/apostrophe.html @@ -0,0 +1,139 @@ + + + + + + + + + + The apostrophe character - Snowball + + + + + + + + + + +
+
+
+ +
+
+

The apostrophe character

+ + +

+Representing apostrophe is problematical for various reasons, +

+ +
    +
  1. There are two Unicode characters for apostrophe, U+0027 (also ASCII hex +27), and U+2019. Compare, +

    + +
    +        Hamlet's father's ghost (U+0027)
    +        Hamlet’s father’s ghost (U+2019)
    +
    + +
  2. Although conceptually different from an apostrophe, a single closing +quote is also represented by character U+2019. +

    + +
  3. Character U+0027 is used for apostrophe, single closing quote and +single opening quote (U+2018). +

    + +
  4. A fourth character, U+201B, like U+2018 but with the tail ‘rising’ +instead of ‘descending’, is also sometimes used as apostrophe (in the +house style of certain publishers, for surnames like M’Coy and so on.) +

    +
+ +

+In the English stemming algorithm, it is assumed that apostrophe is +represented by U+0027. This makes it ASCII compatible. Clearly other codes +for apostrophe can be mapped to this code prior to stemming. +

+ +

+In English orthography, apostrophe has one of three functions. +

+ +
    +
  1. It indicates a contraction in what is now accepted as a single word: +o’clock, O’Reilly, M’Coy. Except in proper names such forms +are rare: the apostrophe in Hallowe’en is disappearing, and in +’bus has disappeared. +

    + +
  2. It indicates a standard contraction with auxiliary or modal verbs: +you’re, isn’t, we’d. There are about forty of these forms in +contemporary English, and their use is increasing as they displace the full +forms that were at one time used in formal documents. Although they can be +reduced to word pairs, it is more convenient to treat them as single items +(usually stopwords) in IR work. And then preserving the apostrophe is +important, so that he’ll, she’ll, we’ll are not equated with +hell, shell, well etc. +

    + +
  3. It is used to form the ‘English genitive’, John's book, the horses’ +hooves etc. This is a development of (1), where historically the apostrophe +stood for an elided e. (Similarly the printed form ’d for ed was +very common before the nineteenth century.) Although in decline (witness pigs +trotters, Girls School Trust), its use continues in contemporary +English, where it is fiercely promoted as correct grammar, despite (or it might +be closer to the truth to say because of) its complete semantic redundancy. +

    +
+ +

+For these reasons, the English stemmer treats apostrophe as a letter, removing +it from the beginning of a word, where it might have stood for an opening +quote, from the end of the word, where it might have stood for a closing quote, +or been an apostrophe following s. The form ’s is also treated as an ending. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/texts/apostrophe.tt b/texts/apostrophe.tt new file mode 100644 index 0000000..3948dce --- /dev/null +++ b/texts/apostrophe.tt @@ -0,0 +1,75 @@ +[% header('The apostrophe character') %] + +

+Representing apostrophe is problematical for various reasons, +

+ +
    +
  1. There are two Unicode characters for apostrophe, U+0027 (also ASCII hex +27), and U+2019. Compare, +

    + +
    +        Hamlet's father's ghost (U+0027)
    +        Hamlet’s father’s ghost (U+2019)
    +
    + +
  2. Although conceptually different from an apostrophe, a single closing +quote is also represented by character U+2019. +

    + +
  3. Character U+0027 is used for apostrophe, single closing quote and +single opening quote (U+2018). +

    + +
  4. A fourth character, U+201B, like U+2018 but with the tail ‘rising’ +instead of ‘descending’, is also sometimes used as apostrophe (in the +house style of certain publishers, for surnames like M’Coy and so on.) +

    +
+ +

+In the English stemming algorithm, it is assumed that apostrophe is +represented by U+0027. This makes it ASCII compatible. Clearly other codes +for apostrophe can be mapped to this code prior to stemming. +

+ +

+In English orthography, apostrophe has one of three functions. +

+ +
    +
  1. It indicates a contraction in what is now accepted as a single word: +o’clock, O’Reilly, M’Coy. Except in proper names such forms +are rare: the apostrophe in Hallowe’en is disappearing, and in +’bus has disappeared. +

    + +
  2. It indicates a standard contraction with auxiliary or modal verbs: +you’re, isn’t, we’d. There are about forty of these forms in +contemporary English, and their use is increasing as they displace the full +forms that were at one time used in formal documents. Although they can be +reduced to word pairs, it is more convenient to treat them as single items +(usually stopwords) in IR work. And then preserving the apostrophe is +important, so that he’ll, she’ll, we’ll are not equated with +hell, shell, well etc. +

    + +
  3. It is used to form the ‘English genitive’, John's book, the horses’ +hooves etc. This is a development of (1), where historically the apostrophe +stood for an elided e. (Similarly the printed form ’d for ed was +very common before the nineteenth century.) Although in decline (witness pigs +trotters, Girls School Trust), its use continues in contemporary +English, where it is fiercely promoted as correct grammar, despite (or it might +be closer to the truth to say because of) its complete semantic redundancy. +

    +
+ +

+For these reasons, the English stemmer treats apostrophe as a letter, removing +it from the beginning of a word, where it might have stood for an opening +quote, from the end of the word, where it might have stood for a closing quote, +or been an apostrophe following s. The form ’s is also treated as an ending. +

+ +[% footer %] diff --git a/texts/earlyenglish.html b/texts/earlyenglish.html new file mode 100644 index 0000000..a420c5a --- /dev/null +++ b/texts/earlyenglish.html @@ -0,0 +1,199 @@ + + + + + + + + + + Stemming early English - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Stemming early English

+ + +

Links to resources

+ + + +

+The question occasionally arises of how far the English (or earlier Porter) +stemming algorithm can be adapted to handle older forms of the English +language. +

+ +

+Historically, English is usually divided into three periods of development, +

+ +
    +
  1. Old English (or Anglo-Saxon), the language of Beowulf, +
  2. Middle English, the language of Chaucer, +
  3. Modern English, the language of Shakespeare, Dickens, and people today. +
+ +

+Old English is so different from Modern English that it may be regarded as a +distinct language. +

+ +

+Middle English is problematical for a number of reasons. There is no standard +spelling in the original texts, and the grammatical differences between Middle +and Modern English prevent the spelling from being simply ‘modernised’. It is +however possible to normalise the spelling according to some modern scheme, but +again there is no standard modern scheme. Middle +English itself had great regional variations, so that for example the +English of Chaucer and his contemporary the Gawain poet (both late 14th century) +are strikingly different. Finally, grammar was fluid even for one writer, so +Chaucer might use they love or they loven, he +sitteth or he sit. +

+ +

+We may take Modern English to mean English which can be cast into a modern +spelling form without too much damage being done to the original. From this +point of view Shakespeare and the Authorised Version of the Bible are in Modern +English. The ending structure of words in early Modern English differs from +contemporary English in the est and eth endings of verbs in the present +indicative, +

+ +
+ I bring
+ thou bringest
+ he bringeth
+ we bring
+ you bring
+ they bring
+
+ +

+Both of these endings underwent rapid decline. The eth form occurs in +Shakespeare, but is much rarer than the modern s form. The language of the +Authorised Version, +in which both forms abound, +seemed archaic even on its first publication. Consequently +the eth form survives now only in the language of the traditional Bible and +Book of Common Prayer. The est form disappeared more slowly, as the use of +thou became displaced by you in conversation. +

+ +

+To put the endings into the +Porter stemmer, +the rules +

+ +
+Step 1b +
+
(m>0) EED EE +
(*v*) ED +
(*v*) ING +
+
+ +

+should be extended to +

+ +
+Step 1b +
+
(m>0) EED EE +
(*v*) ED +
(*v*) ING +
(*v*) EST +
(*v*) ETH +
+
+ +

+And to put the endings into the +English stemmer, +the list +

+ +
+ed   edly   ing   ingly +
+of Step 1b should be extended to +
+ed   edly   ing   ingly   est   eth +
+ +

+As far as the Snowball scripts are concerned, the endings  'est' 'eth'  must +be added against ending  'ing'. +

+ +

+The inclusion of these endings does produce certain ‘side effects’. est is +the ending of adjectival superlatives (greatest, unkindest), where it +will also be removed. Words like brandreth, deforest will be mis-stemmed. +Nevertheless, for the vocabulary of the Bible, the inclusion of these extra +endings is not harmful (see +this demonstration — +for example, search for the text love in 1000 verses). + +

+
+
+ +
+ +
+ + + + diff --git a/texts/earlyenglish.tt b/texts/earlyenglish.tt new file mode 100644 index 0000000..827a911 --- /dev/null +++ b/texts/earlyenglish.tt @@ -0,0 +1,135 @@ +[% header('Stemming early English') %] + +

Links to resources

+ + + +

+The question occasionally arises of how far the English (or earlier Porter) +stemming algorithm can be adapted to handle older forms of the English +language. +

+ +

+Historically, English is usually divided into three periods of development, +

+ +
    +
  1. Old English (or Anglo-Saxon), the language of Beowulf, +
  2. Middle English, the language of Chaucer, +
  3. Modern English, the language of Shakespeare, Dickens, and people today. +
+ +

+Old English is so different from Modern English that it may be regarded as a +distinct language. +

+ +

+Middle English is problematical for a number of reasons. There is no standard +spelling in the original texts, and the grammatical differences between Middle +and Modern English prevent the spelling from being simply ‘modernised’. It is +however possible to normalise the spelling according to some modern scheme, but +again there is no standard modern scheme. Middle +English itself had great regional variations, so that for example the +English of Chaucer and his contemporary the Gawain poet (both late 14th century) +are strikingly different. Finally, grammar was fluid even for one writer, so +Chaucer might use they love or they loven, he +sitteth or he sit. +

+ +

+We may take Modern English to mean English which can be cast into a modern
+spelling form without too much damage being done to the original. From this
+point of view Shakespeare and the Authorised Version of the Bible are in Modern
+English. The ending structure of words in early Modern English differs from
+contemporary English in the est and eth endings of verbs in the present
+indicative,
+

+ +
+ I bring
+ thou bringest
+ he bringeth
+ we bring
+ you bring
+ they bring
+
+ +

+Both of these endings underwent rapid decline. The eth form occurs in +Shakespeare, but is much rarer than the modern s form. The language of the +Authorised Version, +in which both forms abound, +seemed archaic even on its first publication. Consequently +the eth form survives now only in the language of the traditional Bible and +Book of Common Prayer. The est form disappeared more slowly, as the use of +thou became displaced by you in conversation. +

+ +

+To put the endings into the +Porter stemmer, +the rules +

+ +
+Step 1b +
+
(m>0) EED EE +
(*v*) ED +
(*v*) ING +
+
+ +

+should be extended to +

+ +
+Step 1b +
+
(m>0) EED EE +
(*v*) ED +
(*v*) ING +
(*v*) EST +
(*v*) ETH +
+
+ +

+And to put the endings into the +English stemmer, +the list +

+ +
+ed   edly   ing   ingly +
+of Step 1b should be extended to +
+ed   edly   ing   ingly   est   eth +
+ +

+As far as the Snowball scripts are concerned, the endings  'est' 'eth'  must
+be added alongside the ending  'ing'.
+

+ +

+The inclusion of these endings does produce certain ‘side effects’. est is +the ending of adjectival superlatives (greatest, unkindest), where it +will also be removed. Words like brandreth, deforest will be mis-stemmed. +Nevertheless, for the vocabulary of the Bible, the inclusion of these extra +endings is not harmful (see +this demonstration — +for example, search for the text love in 1000 verses). + +[% footer %] diff --git a/texts/glossary.html b/texts/glossary.html new file mode 100644 index 0000000..9f1fc31 --- /dev/null +++ b/texts/glossary.html @@ -0,0 +1,170 @@ + + + + + + + + + + Glossary - Snowball + + + + + + + + +

+ +
+
+
+ +
+
+

Glossary

+ + +

a-suffix

+
+

+An a-suffix, or attached suffix, is a particle word attached to another +word. (In the stemming literature they sometimes get referred to as +‘enclitics’.) In Italian, for example, personal pronouns attach to +certain verb forms: +

+ +
+
mandargli = mandare + gli = to send + to him +
mandarglielo = mandare + gli + lo = to send + it + to him +
+ +

+a-suffixes appear in Italian and Spanish, and also in Portuguese, although
+in Portuguese they are separated by a hyphen from the preceding word, which
+makes them easy to eliminate.
+

+
+ +

i-suffix

+
+

+An i-suffix, or inflectional suffix, forms part of the basic grammar of a +language, and is applicable to all words of a certain grammatical type, +with perhaps a small number of exceptions. In English for example, the past +of a verb is formed by adding ed. Certain modifications may be required +in the stem: +

+ +
+
fit + ed fitted (double t) +
love + ed loved (drop the final e of love) +
+
+ +

d-suffix

+ +
+

+A d-suffix, or derivational suffix, enables a new word, often with a
+different grammatical category, or with a different sense, to be built from
+another word. Whether a d-suffix can be attached is discovered not from
+the rules of grammar, but by referring to a dictionary. So in English,
+ness can be added to certain adjectives to form corresponding nouns
+(littleness, kindness, foolishness ...) but not to all adjectives (not for
+example, to big, cruel, wise ...) d-suffixes can be used to change
+meaning, often in rather exotic ways. So in Italian astro means a sham
+form of something else:
+

+ +
+
medico + astro = medicastro = quack doctor +
poeta + astro = poetastro = poetaster +
+
+ +

Indo-European languages

+ +
+

+Most European and many Asian languages belong to the Indo-European language +group. Historically, it includes the Latin, Greek, Persian and Sanskrit of +the ancient world, and with the rise of the European empires, languages of +this group are now dominant in the Americas, Australia and large parts of +Africa. Indo-European languages are therefore the main languages of modern +Western culture, and they are all similarly amenable to stemming. +

+ +

+The Indo-European group has many recognisable sub-groups, for example +Romance (Italian, French, Spanish ...), Slavonic (Russian, Polish, +Czech ...), Celtic (Irish Gaelic, Scottish Gaelic, Welsh ...). The +Germanic sub-group includes German and Dutch, and the Scandinavian +languages are also usually classed as Germanic, although for convenience we +have made a separate grouping of them on the Snowball site. English is also +classed as Germanic, although it has been classed separately by us. This is +not for reasons of narrow chauvinism, but because the suffix structure of +English clearly lies mid-way between the Germanic and Romance groups, and it +therefore requires separate treatment. +

+
+ +

Uralic languages

+ +
+

+The Uralic languages are spoken mainly in Northern Russia and Europe. They +are divided into Samoyed, spoken mainly in the Siberian region, and +Finno-Ugric, spoken mainly in Europe. Although the number of languages in +the group is substantial, the total number of speakers is relatively small. +The best known Uralic languages are perhaps Hungarian, Finnish and +Estonian. Finnish and Estonian are in fact fairly similar. On the other +hand Hungarian and Finnish are as different as are, say, French and Persian +in the Indo-European group. +

+ +

+Like the Indo-European languages, the Uralic languages are amenable to +stemming. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/texts/glossary.tt b/texts/glossary.tt new file mode 100644 index 0000000..0860abb --- /dev/null +++ b/texts/glossary.tt @@ -0,0 +1,106 @@ +[% header('Glossary') %] + +

a-suffix

+
+

+An a-suffix, or attached suffix, is a particle word attached to another +word. (In the stemming literature they sometimes get referred to as +‘enclitics’.) In Italian, for example, personal pronouns attach to +certain verb forms: +

+ +
+
mandargli = mandare + gli = to send + to him +
mandarglielo = mandare + gli + lo = to send + it + to him +
+ +

+a-suffixes appear in Italian and Spanish, and also in Portuguese, although
+in Portuguese they are separated by a hyphen from the preceding word, which
+makes them easy to eliminate.
+

+
+ +

i-suffix

+
+

+An i-suffix, or inflectional suffix, forms part of the basic grammar of a +language, and is applicable to all words of a certain grammatical type, +with perhaps a small number of exceptions. In English for example, the past +of a verb is formed by adding ed. Certain modifications may be required +in the stem: +

+ +
+
fit + ed fitted (double t) +
love + ed loved (drop the final e of love) +
+
+ +

d-suffix

+ +
+

+A d-suffix, or derivational suffix, enables a new word, often with a
+different grammatical category, or with a different sense, to be built from
+another word. Whether a d-suffix can be attached is discovered not from
+the rules of grammar, but by referring to a dictionary. So in English,
+ness can be added to certain adjectives to form corresponding nouns
+(littleness, kindness, foolishness ...) but not to all adjectives (not for
+example, to big, cruel, wise ...) d-suffixes can be used to change
+meaning, often in rather exotic ways. So in Italian astro means a sham
+form of something else:
+

+ +
+
medico + astro = medicastro = quack doctor +
poeta + astro = poetastro = poetaster +
+
+ +

Indo-European languages

+ +
+

+Most European and many Asian languages belong to the Indo-European language +group. Historically, it includes the Latin, Greek, Persian and Sanskrit of +the ancient world, and with the rise of the European empires, languages of +this group are now dominant in the Americas, Australia and large parts of +Africa. Indo-European languages are therefore the main languages of modern +Western culture, and they are all similarly amenable to stemming. +

+ +

+The Indo-European group has many recognisable sub-groups, for example +Romance (Italian, French, Spanish ...), Slavonic (Russian, Polish, +Czech ...), Celtic (Irish Gaelic, Scottish Gaelic, Welsh ...). The +Germanic sub-group includes German and Dutch, and the Scandinavian +languages are also usually classed as Germanic, although for convenience we +have made a separate grouping of them on the Snowball site. English is also +classed as Germanic, although it has been classed separately by us. This is +not for reasons of narrow chauvinism, but because the suffix structure of +English clearly lies mid-way between the Germanic and Romance groups, and it +therefore requires separate treatment. +

+
+ +

Uralic languages

+ +
+

+The Uralic languages are spoken mainly in Northern Russia and Europe. They +are divided into Samoyed, spoken mainly in the Siberian region, and +Finno-Ugric, spoken mainly in Europe. Although the number of languages in +the group is substantial, the total number of speakers is relatively small. +The best known Uralic languages are perhaps Hungarian, Finnish and +Estonian. Finnish and Estonian are in fact fairly similar. On the other +hand Hungarian and Finnish are as different as are, say, French and Persian +in the Indo-European group. +

+ +

+Like the Indo-European languages, the Uralic languages are amenable to +stemming. +

+ +[% footer %] diff --git a/texts/howtohelp.html b/texts/howtohelp.html new file mode 100644 index 0000000..e8f7448 --- /dev/null +++ b/texts/howtohelp.html @@ -0,0 +1,129 @@ + + + + + + + + + + Snowball: How You Can Help - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Snowball: How You Can Help

+ + +

+For the work on this site there are two possible lines of development, one is +Snowball itself — the language and compiler — and the other is the +stemmers which are written in Snowball. At the moment it is the latter that +is the real area of interest. +

+ +

+It is useful to have suggestions about improvements to the existing +stemmers, especially for the ones which are not English. However, the +process of piecemeal improvement can be taken too far, and it is important +in making these suggestions to recognise the inevitable limitations of +accuracy of algorithmic stemmers. But more importantly: — +

+ +

+Stemming algorithms have a well-understood place in IR (Information +Retrieval), and as language-specific tools in an IR system, they have an +extremely useful part to play. It is therefore something of a scandal that +there are so very few stemming algorithms which are readily available, so +if you want to make a contribution to Snowball, the best thing you can do +is to create a good quality stemmer for a new language. This must +include an algorithmic description of the stemmer, an implementation in +Snowball, and a representative language vocabulary of about 30,000 words +that can be used as part of a standard test. +

+ +

+Alternatively, you might come up with the algorithm and be able to provide +representative texts from which to derive the vocabulary, but hesitate +about the Snowball implementation. If so, get in touch, and we might be +able to complete the work collaboratively. +

+ +

+We are also interested in: +

+ +
    +
  • Significant applications developed with the Snowball stemmers + +
  • Stemmers held on other sites that derive from Snowball work + +
  • Other useful stemming resources +
+ +

+It may seem like stating the obvious, but if you do hit a technical +problem, please, please send in a full notice of the system being used, +the activity you were engaged on, and the errors that you encounter. +

+ +

+Finally, if you want to contribute to this site, you must be prepared to +release under the BSD license (i.e. to make your work free). +

+ +

+Martin Porter
+Richard Boulton +

+ +
+
+
+ +
+ +
+ + + + diff --git a/texts/howtohelp.tt b/texts/howtohelp.tt new file mode 100644 index 0000000..5231d72 --- /dev/null +++ b/texts/howtohelp.tt @@ -0,0 +1,65 @@ +[% header('Snowball: How You Can Help') %] + +

+For the work on this site there are two possible lines of development, one is +Snowball itself — the language and compiler — and the other is the +stemmers which are written in Snowball. At the moment it is the latter that +is the real area of interest. +

+ +

+It is useful to have suggestions about improvements to the existing +stemmers, especially for the ones which are not English. However, the +process of piecemeal improvement can be taken too far, and it is important +in making these suggestions to recognise the inevitable limitations of +accuracy of algorithmic stemmers. But more importantly: — +

+ +

+Stemming algorithms have a well-understood place in IR (Information +Retrieval), and as language-specific tools in an IR system, they have an +extremely useful part to play. It is therefore something of a scandal that +there are so very few stemming algorithms which are readily available, so +if you want to make a contribution to Snowball, the best thing you can do +is to create a good quality stemmer for a new language. This must +include an algorithmic description of the stemmer, an implementation in +Snowball, and a representative language vocabulary of about 30,000 words +that can be used as part of a standard test. +

+ +

+Alternatively, you might come up with the algorithm and be able to provide +representative texts from which to derive the vocabulary, but hesitate +about the Snowball implementation. If so, get in touch, and we might be +able to complete the work collaboratively. +

+ +

+We are also interested in: +

+ +
    +
  • Significant applications developed with the Snowball stemmers + +
  • Stemmers held on other sites that derive from Snowball work + +
  • Other useful stemming resources +
+ +

+It may seem like stating the obvious, but if you do hit a technical +problem, please, please send in a full notice of the system being used, +the activity you were engaged on, and the errors that you encounter. +

+ +

+Finally, if you want to contribute to this site, you must be prepared to +release under the BSD license (i.e. to make your work free). +

+ +

+Martin Porter
+Richard Boulton +

+ +[% footer %] diff --git a/texts/introduction.html b/texts/introduction.html new file mode 100644 index 0000000..55591f7 --- /dev/null +++ b/texts/introduction.html @@ -0,0 +1,987 @@ + + + + + + + + + + Snowball: A language for stemming algorithms - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Snowball: A language for stemming algorithms

+ + +

Links

+ + + +

+M.F. Porter
+October 2001 +

+ +

Summary

+ +

+ Algorithmic stemmers continue to have great utility in IR, despite the + promise of out-performance by dictionary-based stemmers. Nevertheless, + there are few algorithmic descriptions of stemmers, and even when they + exist they are liable to misinterpretation. Here we look at the ideas + underlying stemming, and on this website define a language, Snowball, + in which stemmers can be exactly defined, and from which fast stemmer + programs in ANSI C or Java can be generated. A range of stemmers is presented + in parallel algorithmic and Snowball form, including the original + Porter stemmer for English. +

+ +

1 Introduction

+ +

+There are two main reasons for creating Snowball. One is the lack of +readily available stemming algorithms for languages +other than English. The other is the consciousness of a certain failure on +my part in promoting exact implementations of the stemming +algorithm described in (Porter 1980), which has come to be called the +Porter stemming algorithm. The first point needs some qualification: a +great deal of work has been done on stemmers in a wide range of natural +languages, both in their development and evaluation, (a complete +bibliography cannot be attempted here). But it is rare to see a stemmer +laid out in an unambiguous algorithmic form from which encodings in C, +Java, Perl etc might easily be made. When exact descriptions are +attempted, it is often with approaches to stemming that are +relatively simple, for example the Latin stemmer of Schinke (Schinke 1996), +or the Slovene stemmer of Popovic (Popovic 1990). A more complex, and +therefore more characteristic stemmer is the Kraaij-Pohlmann stemmer for +Dutch (Kraaij 1994), which is presented as open source code in ANSI C. To +extract an algorithmic description of their stemmer from the source code +proves to be quite hard. +

+ +

+The disparity between the Porter stemmer definition and many of its +purported implementations is much wider than is generally realised in the +IR community. Three problems seem to compound: one is a misunderstanding +of the meaning of the original algorithm, another is bugs in the +encodings, and a third is the almost irresistible urge of programmers +to add improvements. +

+ +

+For example, a Perl script advertised on the Web as an +implementation of the Porter algorithm was tested in October 2001, and it was +found that 14 percent of words were stemmed incorrectly when given a large sample +vocabulary. Most words of English have +very simple endings, so this means that it was effectively getting everything +wrong. At certain points on the Web are demonstrations of the Porter stemmer. +You type some English into a box and the stemmed words are displayed. These +are frequently faulty. (A good test is to type in agreement. It should stem +to agreement — the same word. If it stems to agreem there is an +error.) Researchers frequently pick up faulty versions of the stemmer and +report that they have applied ‘Porter stemming’, with the result that their +experiments are not quite repeatable. Researchers who work on stemming will +sometimes give incorrect examples of the behaviour of the Porter stemmer in +their published works. +

+ +

+To address all these problems I have tried to develop a rigorous system +for defining stemming algorithms. A language, Snowball, has been invented, +in which the rules of stemming algorithms can be expressed in a natural +way. Snowball is quite small, and can be learned by an experienced +programmer in an hour or so. On this website a number of foreign language +stemmers is presented (a) in Snowball, and (b) in a less formal +English-language description. (b) can be thought of as the program +comments for (a). A Snowball compiler translates each Snowball +definition into (c) an equivalent program in ANSI C or Java. Finally (d) +standard vocabularies of words and their stemmed equivalents are provided +for each stemmer. The combination of (a), (b), (c) and (d) +can be used to pin down the definition of a stemmer exactly, and it is +hoped that Snowball itself will be a useful resource in creating stemmers +in the future. +

+ +

2 Some ideas underlying stemming

+ +

+Work in stemming has produced a number of different approaches, albeit tied +together by a number of common assumptions. It is worthwhile looking at some +of them to see exactly where Snowball fits into the whole picture. +

+ +

+A point tacitly assumed in almost all of the stemming literature is that +stemmers are based upon the written, and not the spoken, form of the +language. This is also the assumption here. Historically, +grammarians often regarded the written language as the real language and +the spoken as a mere derivative form. Almost in reaction, many modern +linguists have taken a precisely opposite view (Palmer, 1965 pp 2-3). A +more balanced position is that the two languages are distinct though +connected, and require separate treatment. One can in fact imagine parallel +stemming algorithms for the spoken language, or rather for the phoneme +sequence into which the spoken language is transformed. Stress and +intonation could be used as clues for an indexing process in the same way +that punctuation and capitalisation are used as clues in the written +language. But currently stemmers work on the written language for the good +reason that there is so much of it available in machine readable form from +which to build our IR systems. Inevitably therefore the stemmers get +caught up in accidental details of orthography. In English, removing the +ing from rotting should be followed by undoubling the tt, +whereas in rolling we do not undouble the ll. In French, removing +the er from ennuyer should be followed by changing the y to +i, so that the resulting word conflates with ennui, and so on. +

+ +

+The idea of stemming is to improve IR performance generally by bringing +under one heading variant forms of a word which share a common meaning. +Harman (1991) was first to present compelling evidence that it may not do +so, when her experiments discovered no significant improvement with the +use of stemming. +Similarly Lennon (1981) discovered no appreciable difference between different +stemmers running on a constant collection. +Later work has modified this position however. Krovetz +(1995) found significant, although sometimes small, improvements across a +range of test collections. What he did discover is that the degree of +improvement varies considerably between different collections. +These tests were however done on collections in +English, and the reasonable assumption of IR researchers has always been that for +languages that are more highly inflected than English (and nearly all +are), greater improvements will be observed when stemming is applied. My +own view is that stemming helps regularise the +vocabulary of an IR system, and this leads to advantages that are not +easily quantifiable through standard IR experiments. For example, it helps +in presenting lists of terms associated with the query back to the IR user +in a relevance feedback cycle, which is one of the underlying ideas of the +probabilistic model. More will be said on the use of a stemmed vocabulary +in section 5. +

+ +

+Stemming is not a concept applicable to all languages. It is not, for +example, applicable in Chinese. But to languages of the Indo-European (*) +group (and most of the stemmers on this site are for Indo-European +languages), a common +pattern of word structure does emerge. Assuming words are written left to +right, the stem, or root of a word is on the left, and zero or more +suffixes may be added on the right. If the root is modified by this +process it will normally be at its right hand end. And also prefixes may +be added on the left. So unhappiness has a prefix un, a suffix +ness, and the y of happy has become i with the addition of +the suffix. Usually, prefixes alter meaning radically, so they are best +left in place (German and Dutch ge is an exception here). But suffixes +can, in certain circumstances, be removed. So for example happy and +happiness have closely related meanings, and we may wish to stem both +forms to happy, or happi. Infixes can occur, although rarely: +ge in German and Dutch, and zu in German. +

+ +

+One can make some distinction between root and stem. Lovins (1968) +sees the root as the stem minus any prefixes. But here we will +think of the stem as the residue of the stemming process, and the root as the +inner word from which the stemmed word derives, so we think of root to +some extent in an etymological way. It must be admitted that when you +start thinking hard about these concepts root, stem, suffix, +prefix ... they turn out to be very difficult indeed to define. +Nor do definitions, even if we arrive at them, help us much. After all, suffix +stripping is a practical aid in IR, not an exercise in linguistics or +etymology. This is especially true of the central concept of root. We +think of the etymological root of a word as something we can discover with +certainty from a dictionary, forgetting that etymology itself is a subject +with its own doubts and controversies (Jesperson 1922, Chapter XVI). +Indeed, Jesperson goes so far as to say that +

+ +
+

+ ‘It is of course impossible to say how great a proportion of the + etymologies given in dictionaries should strictly be classed under + each of the following heads: (1) certain, (2) probable, (3) + possible, (4) improbable, (5) impossible — but I am afraid the + first two classes would be the least numerous.’ +

+
+ +

+Here we will simply assume a common sense understanding of +the basic idea of stem and suffix, and hope that this proves sufficient +for designing and discussing stemming algorithms. +

+ +

+We can separate suffixes out into three basic classes, which will be +called d-, i- and a-suffixes. +

+ +

+An a-suffix, or attached suffix, is a particle word attached to another +word. (In the stemming literature they sometimes get referred to as +‘enclitics’.) In Italian, for example, personal pronouns attach to +certain verb forms: +

+ +
+
mandargli = mandare + gli = to send + to him +
mandarglielo = mandare + gli + lo = to send + it + to him +
+ +

+a-suffixes appear in Italian and Spanish, and also in Portuguese, although
+in Portuguese they are separated by a hyphen from the preceding word, which
+makes them easy to eliminate.
+

+ +

+An i-suffix, or inflectional suffix, forms part of the basic grammar of a +language, and is applicable to all words of a certain grammatical type, +with perhaps a small number of exceptions. In English for example, the past +of a verb is formed by adding ed. Certain modifications may be required +in the stem: +

+ +
+
fit + ed fitted (double t) +
love + ed loved (drop the final e of love) +
+ +

+but otherwise the rule applies in a regular way to all verbs in +contemporary English, with about 150 (Palmer, 1965) exceptional forms, +

+ +
+
bear beat become begin bend .... +
bore beat became began bent +
+ +

+A d-suffix, or derivational suffix, enables a new word, often with a +different grammatical category, or with a different sense, to be built from +another word. Whether a d-suffix can be attached is discovered not from +the rules of grammar, but by referring to a dictionary. So in English, +ness can be added to certain adjectives to form corresponding nouns +(littleness, kindness, foolishness ...) but not to all adjectives (not for +example, to big, cruel, wise ...) d-suffixes can be used to change +meaning, often in rather exotic ways. So in Italian astro means a sham +form of something else: +

+ +
+
medico + astro = medicastro = quack doctor +
poeta + astro = poetastro = poetaster +
+ +

+Generally i-suffixes follow d-suffixes. i-suffixes can precede d-suffixes, +for example lovingly, devotedness, but such cases are exceptional. To +be a little more precise, d-suffixes can sometimes be added to +participles. devoted, used adjectivally, is a participle derived from the +verb devote, and ly can be added to turn the adjective into an adverb, +or ness to turn it into a noun. The same feature occurs in other +Indo-European languages. +

+ +

+Sometimes it is hard to say whether a suffix is a d-suffix or i-suffix, +the comparative and superlative endings er, est of English for example. +

+ +

+A d-suffix can serve more than one function. In English, for example, +ly standardly turns an adjective into an adverb (greatly), but it +can also turn a noun into an adjective (kingly). In French, ement +also standardly turns an adjective into an adverb (grandement), but it +can also turn a verb into a noun (rapprochement). (Referring to the +French stemmer, this double use is ultimately why ement is tested for +being in the RV rather than the R2 region of the word being +stemmed.) +

+ +

+It is quite common for an i-suffix to serve more than one function. +In English, s can either be (1) a verb ending attached to third person +singular forms (runs, sings), (2) a noun ending indicating the plural +(dogs, cats) or (3) a noun ending indicating the possessive +(boy’s, girls’). By an orthographic convention now several hundred +years old, the possessive is written with an apostrophe, but +nowadays this is +frequently omitted in familiar phrases (a girls school). (Usage (3) is +relatively rare compared with (1) and (2): there are only nine uses of +’s in this document.) +

+ +

+Since the normal order of suffixes is d, i and a, we +can expect them to be removed +from the right in the order a, i and d. Usually we want to remove +all a- and i-suffixes, and some of the d-suffixes. +

+ +

+If the stemming process reduces two words to the same stem, they are said +to be conflated. +

+ +

3 Stemming errors, and the use of dictionaries

+ +

+One way of thinking of the relation between terms and documents in an IR +system is to see the documents as being about concepts, and the terms as +words that describe the concepts. Then, of course, one word can cover many +concepts, so pound can mean a unit of currency, a weight, an enclosure, +or a beating. Pound is a homonym. And one concept can be described by +many words, as with money, capital, cash, currency. These words +are synonyms. There is a many-many mapping therefore between the set of +terms and the set of concepts. Stemming is a process that transforms this +mapping to advantage, on the whole reducing the number of synonyms, but +occasionally creating new homonyms. It is worth remembering that what are +called stemming errors are usually just the introduction of new homonyms into +vocabularies that already contain very large numbers of homonyms. +

+ +

+Words which have no place in this term-concept mapping are those which +describe no concepts. The particle words of grammar, the, of, +and +..., known in IR as stopwords, fall into this category. Stopwords can be +useful for retrieval but only in searching for phrases, ‘to be or not to +be’, ‘do as you would be done by’ etc. This suggests that stemming +stopwords is not useful. More will be said on stopwords in section 7. +

+ +

+In the literature, a distinction is often made between +under-stemming, which is the error of taking off too small a suffix, and +over-stemming, which is the error of taking off too much. In French, for +example, croûtons is the plural of croûton, ‘a crust’, so to remove +ons would be over-stemming, while croulons is a verb form of crouler, +‘to totter’, so to remove s would be under-stemming. We would like to +introduce a further distinction between mis-stemming and over-stemming. +Mis-stemming is taking off what looks like an ending, but is really part +of the stem. Over-stemming is taking off a true ending which results in +the conflation of words of different meanings. +

+ +

+So for example ly can be removed from cheaply, but not from reply, +because in reply ly is not a suffix. If it was removed, reply would +conflate with rep, (the commonly used short form of representative). +Here we have a case of mis-stemming. +

+ +

+To illustrate over-stemming, look at these four words, +

+ +
+
verb adjective + +
First pair: prove provable +
Second pair: probe probable +
+ +

+Morphologically, the two pairs are exactly parallel (in the written, if not +the spoken language). They also have a common etymology. All four words +derive from the Latin probare, ‘to prove or to test’, and the idea of +testing connects the meanings of the words. But the meanings are not parallel. +provable means ‘able to be proved’; probable does not mean ‘able to be +probed’. Most people would judge conflation of the first pair as correct, +and of the second pair, incorrect. In other words, to remove able from +probable is a case of over-stemming. +

+ +

+We can try to avoid mis-stemming and over-stemming by using a dictionary. +The dictionary can tell us that reply does not derive from rep, and +that the meanings of probe and probable are well separated in modern +English. It is important to realise however that a dictionary does not give +a complete solution here, but can be a tool to improve the conflation +process. +

+ +

+In Krovetz’s dictionary experiments (Krovetz 1995), he noted that in +looking up a past participle like suited, one is led either to suit or +to suite as plausible infinitive forms. suite can be rejected, +however, because the dictionary tells us that +although it is a word of English +it is not a verb form. Cases +like this (and Krovetz found about 60) had to be treated as exceptions. But +the form routed could +either derive from the verb rout or the verb route: +

+ +

+ At Waterloo Napoleon’s forces were routed
+ The cars were routed off the motorway +

+ +

+Such cases in English are extremely rare, but they are commoner in more +highly inflected languages. In French for example, affiliez can either be +the verb affiler, to sharpen, with imperfect ending iez, or the verb +affilier, to affiliate, with present indicative ending ez: +

+ +
+
vous affiliez = vous affil-iez = you sharpened +
vous affiliez = vous affili-ez = you affiliate +
+ +

+If the second is intended, removal of iez is mis-stemming. +

+ +

+With over-stemming we must rely upon the dictionary to separate meanings. +There are different ways of doing this, but all involve some degree of +reliance upon the lexicographers. Krovetz’s methods are no doubt best, +because the most objective: he uses several measures, but they are based on +the idea of measuring the similarity in +meaning of two words by the degree of overlap among the words used to define +them, and this is at a good remove from a lexicographer’s subjective +judgement about semantic similarity. +

+ +

+There is an interesting difference between mis-stemming and over-stemming
+to do with language history. The morphology of a language changes less
+rapidly than the meanings of the words in it. When extended to include a
+few archaic endings, such as ick as an alternative to ic, a stemmer for
+contemporary English can be applied to the English of 300 years ago.
+Mis-stemmings will be roughly the same, but the pattern of over-stemming will
+be different because of the changing meaning of words in the language. For
+example, relativity in the 19th century merely meant ‘the condition of
+being relative to’. With that meaning, it is acceptable to conflate it
+with relative.
+But with the 20th century meaning brought to it by
+Einstein, stemming to relativ is over-stemming.
+Here we see the word with the suffix changing its meaning, but it can happen
+the other way round. transpire has come to mean ‘happen’, and its old
+meaning of ‘exhalation’ or ‘breathing out’ is now effectively lost.
+(That is the bitter reality, although dictionaries still try to persuade us
+otherwise). But transpiration still carries the earlier meaning.
+So what was formerly an acceptable stemming may be judged now as
+an over-stemming, not because the word being stemmed has changed its meaning,
+but because some cognate word has changed its meaning.

+ +

+In these examples we are presenting words as if they had single meanings, but
+the true picture is more complicated. Krovetz uses a model of word
+meanings which is extremely helpful here. He makes a distinction between
+homonyms and polysemes. The meanings of homonyms are quite unrelated.
+For example, ground in the sense of ‘earth’, and ‘ground’ as the past
+participle of ‘grind’ are homonyms. Etymologically homonyms have different
+stories, and they usually have separate entries in a dictionary. But each
+homonym form can have a range of polysemic forms, corresponding to different
+shades of meaning. So ground can mean the earth’s surface, or the bottom
+of the sea, or soil, or any base, and so the basis of an argument, and so on.
+Over time new polysemes appear and old ones die. At any moment, the use of a
+word will be common in some polysemic forms and rare in others. If a suffix is
+attached to a word the new word will get a different set of polysemes. For
+example, grounds = ground + s acquires the sense of ‘dregs’ and
+‘estate lands’, loses the sense of ‘earth’, and shares the sense of
+‘basis’.

+ +

+Consider the conflation of mobility with mobile. mobile has +acquired two new polysemes not shared with mobility. One is the ‘mobile +art object’, common in the nursery. This arrived in the 1960s, and is +still in use. The other is the ‘mobile phone’ which is now very dominant, +although it may decline in the future when it has been replaced by some new +gadget with a different name. We might draw a graph of the degree of +separation of the meanings of mobility and mobile against time, +which would depend upon the number of polysemes and the intensity of their +use. What seemed like a valid conflation of the two words in 1940 may seem +to be invalid today. +

+ +

+In general therefore one can say that judgements about whether words are +over-stemmed change with time as the meanings of words in the language +change. +

+ +

+The use of a dictionary should reduce errors of mis-stemming and errors of +over-stemming. And, for English at least, the mis-stemming errors should +reduce well, even if there are problems with over-stemming errors. Of +course, it depends on the quality of the dictionary. A dictionary will need +to be very comprehensive, fully up-to-date, and with good word definitions +to achieve the best results. +

+ +

+Historically, stemmers have often been thought of as either +dictionary-based or algorithmic. The presentation of studies of stemming +in the literature has perhaps helped to create this division. In the +Lovins’ stemmer the algorithmic description is central. In accounts of +dictionary-based stemmers the emphasis tends to be on dictionary content +and structure, and IR effectiveness. Savoy’s French stemmer (Savoy, 1993) +is a good example of this. But the two approaches are not really distinct. +An algorithmic stemmer can include long exception lists that are +effectively mini-dictionaries, and a dictionary-based stemmer usually +needs a process for removing at least i-suffixes to make the look-up +in the dictionary possible. In fact in a language in which proper names +are inflected (Latin, Finnish, Russian ...), a dictionary-based stemmer +will need to remove i-suffixes independently of dictionary look-up, +because the proper names will not of course be in the dictionary. +

+ +

+The stemmers available on the Snowball website are all purely +algorithmic. They can be extended to include built-in exception lists, they +could be used in combination with a full dictionary, but they are still +presented here in their simplest possible form. Being purely algorithmic, +they are, or ought to be, inferior to the performance of well-constructed +dictionary-based stemmers. But they are still very useful, for the +following reasons: +

+ +
    +
  1. Algorithmic stemmers are (or can be made) very lean and very fast. The +stemmers presented here generate code that will process about a million +words in six seconds on a conventional 500MHz PC. Nowadays we can generate +very large IR systems with quite modest resources, and tools that assist in +this have value. +

    + +
  2. Despite the errors they can be seen to make, algorithmic stemmers still +give good practical results. As Krovetz (1995) says in surprise of the +algorithmic stemmer, ‘Why does it do so well?’ (page 89). +

    + +
  3. Dictionary-based stemmers require dictionary maintenance, to keep up +with an ever-changing language, and this is actually quite a problem. It +is not just that a dictionary created to assist stemming today will +probably require major updating in a few years time, but that a dictionary +in use for this purpose today may already be several years out of date. +

    +
+ +

+We can hazard an answer to Krovetz’s question, as to why algorithmic
+stemmers perform as well as they do, when they reveal so many cases of
+under-, over- and mis-stemming. Under-stemming is a fault, but by itself
+it will not degrade the performance of an IR system. Because of
+under-stemming words may fail
+to conflate that ought to have conflated, but you are, in a sense, no
+worse off than you were before. Mis-stemming is more serious, but again
+mis-stemming does not really matter unless it leads to false conflations,
+and that frequently does not happen. For example, removing the ate
+ending in English, can result in useful conflations (luxury,
+luxuriate; affection, affectionate), but very often produces
+stems that are not English words
+(enerv-ate, accommod-ate,
+deliber-ate etc). In the literature, these are normally
+classed as stemming errors — over-stemming — although in our nomenclature
+they are examples of mis-stemming.
+However these residual stems,
+enerv, accommod,
+deliber ... do not conflate with other word forms, and so behave in
+an IR system in the same way as if they still retained their ate
+ending. No false conflations arise, and so there is no over-stemming here.

+ +

+To summarise, one can say that just as a word can be over-stemmed +but not mis-stemmed (relativityrelative), so it can be +mis-stemmed but not over-stemmed (enervateenerv). And, of +course, even over-stemming does not matter, if the over-stemmed word falsely +conflates with other words that exist in the language, but are not +encountered in the IR +system which is being used. +

+ +

+Of the three types of error, +over-stemming is the most important, and +using a dictionary does not eliminate all over-stemmings, but does reduce their +incidence. +

+ +

4 Stemming as part of an indexing process

+ +

+Stemming is part of a composite process of extracting words from text and +turning them into index terms in an IR system. Because stemming is somewhat +complex and specialised, it is usually studied in isolation. Even so, it +cannot really be separated from other aspect of the indexing process: +

+ +
    +
  1. What is a word? For indexing purposes, a word in a European language is +a sequence of letters bounded by non-letters. But in English, an internal +apostrophe does not split a word, although it is not classed as a letter. +The treatment of these word boundary characters affects the stemmer. For +example, the Kraaij Pohlmann stemmer for Dutch (Kraaij, 1994, 1995) removes hyphen and +treats apostrophe as part of the alphabet (so ’s, ’tje and ’je are three +of their endings). The Dutch stemmer presented here assumes hyphen and +apostrophe have already been removed from the word to be stemmed. +

    + +
  2. What is a letter? Clearly letters define words, but different languages +use different letters, much confusion coming from the varied use of +accented Roman letters. +

    + +

    +English speakers, perhaps influenced by the ASCII character set, typically regard +their alphabet of a to z as the norm, and other forms (for example, Danish +å and ø, or German ß) as somewhat abnormal. But this is +an insular point of view. In Italian, for example, the letters +j, k, w, x and y are not part of the alphabet, and are +only seen in foreign words. We also tend to regard other alphabets as only +used for isolated languages, and that is not strictly true. Cyrillic is +used for a range of languages other than Russian, among which additional +letters and accented forms abound. +

    + +

    +In English, a broad definition of letter would be anything that could be +accepted as a pronounceable element of a word. This would include +accented Roman letters (naïve, Fauré), and certain ligature +forms (encyclopædia). It would exclude letters +of foreign alphabets, such as Greek and Cyrillic. +The a to z alphabet is one of those where letters come in +two styles, upper and lower case, which historically correspond (very roughly) to the +shapes you get if you use a chisel or a pen. Across all languages, the +exact relation of upper to lower case is not so easy to define. In Italian, +for example, an accented lower case letter is sometimes represented in +upper case by the unaccented letter followed by an apostrophe. (I have +seen this convention used in modern Italian news stories in machine +readable form.) +

    + +

    +In fact the Porter stemmer (which is for English) assumes the word being stemmed is +unaccented and in lower case. More exactly, a, e, i, o, +u, +and sometimes y, are +treated as vowels, and any other character gets treated as a consonant. +Each stemmer presented here assumes some degree of normalisation before it +receives the word, which is roughly (a) put all letters into lower case, +and (b) remove accents from letter-accent combinations that do not form +part of the alphabet of the language. Each stemmer declares the +letter-accent combinations for its language, and this can be used as a +guide for the normalisation, but even so, we can see from +the discussion above that (a) and (b) are not trivial +operations, and need to be done with care. +

    + +

    +(Incidentally, because the stemmers work on lower case words, turning +letters to upper case is sometimes used internally for flagging purposes.) +

    + +
  3. Identifying stopwords. Invariant stopwords are more easily found before +stemming is applied, but inflecting stopwords (for example, German kein, keine, keinem, +keinen ... ) may be easier to find after — because there are fewer forms. +There is a case for building stopword identification into the stemming +process. See section 7. +

    + +
  4. Conflating irregular forms. More will be said on this in section 6. +

    +
+ +

5 The use of stemmed words

+ +

+The idea of how stemmed words might be employed in an IR system has +evolved slightly over the years. The Lovins stemmer (Lovins 1968) was +developed not for indexing document texts, but the subject terms attached +to them. With queries stemmed in the same way, the user needed no special +knowledge of the form of the subject terms. Rijsbergen (1979, Chapter 2) +assumes document text analysis: stopwords are removed, the remaining words +are stemmed, and the resulting set of stemmed word constitute the IR index +(and this style of use is widespread today). More flexibility however is +obtained by indexing all words in a text in an unstemmed form, and +keeping a separate two-column relation which connects the words to their +stemmed equivalents. The relation can be denoted by R(s, w), which means +that s is the stemmed form of word w. From the relation we can get, for +any word w, its unique stemmed form, stem(w), and for any stem s, the set +of words, words(s), that stem to s. +

+ +

+The user should not have to see the stemmed form of a word. If a list of +stems is to be presented back for query expansion, in place of +a stem, s, the user should be shown a single representative from the set +words(s), the one of highest frequency perhaps. The user should also +be able to choose for the whole query, or at a lower level for each word +in a query, whether or not it should be stemmed. In the absence of such +choices, the system can make its own +decisions. +Perhaps single word queries would not undergo +stemming; long queries would; stopwords would be removed +except in phrases. In query expansion, the system would work with stemmed +forms, ignoring stopwords. +

+ +

+Query expansion with stemming results in a much cleaner vocabulary list +than without, and this is a main strength of using a stemming process. +

+ +

+A question arises: if the user never sees the stemmed form, does its +appearance matter? The answer must be no, although +the Porter stemmer tries to make the unstemmed forms guessable from the stemmed +forms. For example, from appropri you can guess appropriate. At least, +trying to achieve this effect acts as a useful control. Similarly with the +other stemmers presented here, an attempt has been made to keep the +appearance of the stemmed forms as familiar as possible. +

+ +

6 Irregular grammatical forms

+ +

+All languages contain irregularities, but to what extent should they be +accommodated in a stemming algorithm? An English stemmer, for example, can +convert regular plurals to singular form without difficulty (boys, girls, +hands ...). Should it do the same with irregular plurals (men, children, +feet, ...)? Here we have irregular cases with i-suffixes, but there are +irregularities with d-suffixes, which Lovins calls ‘spelling exceptions’. +absorb/absorption and conceive/conception are examples of this. +Etymologically, the explanation of the first is that the Latin root, +sorbere, is an irregular verb, and of the second that the word +conceive comes to us from the French rather than straight from the Latin. +It is interesting that, even with no knowledge of the etymology, we do +recognise the connection between the words. +

+ +

+Lovins tries to solve spelling exceptions by formulating general respelling +rules (turn rpt into rb for example), but it might be easier to have +simply a list of exceptional stems. +

+ +

+The Porter stemmer does not handle irregularities at all, but from the +author’s own experience, this has never been an area of complaint. +Complaints in fact are always about false conflations, for example new +and news. +

+ +

+Possibly Lovins was right in wanting to resolve d-suffix irregularities, +and not being concerned about i-suffix irregularities. i-suffix +irregularities in English go with short, old words, that are either in very +common use (man/men, woman/women, see/saw ...) or are used only rarely +(ox/oxen, louse/lice, forsake/forsook ...). The latter class can be +ignored, and the former has its own problems which are not always solved +by stemming. For example man is a verb, and saw can mean a cutting +instrument, or, as a verb, can mean to use such an instrument. Conflation +of these forms frequently leads to an error like mis-stemming therefore. +

+ +

+An algorithmic stemmer really needs holes where the irregular forms can be +plugged in as necessary. This is more serviceable than attempting to +embed special lists of these irregular forms into software. +

+ +

7 Stopwords

+ +

+We have suggested that stemming stopwords is not useful. There is a +grammatical connection between being and be, but conflation of the two +forms has little use in IR because they have no shared meaning that would +entitle us to think of them as synonyms. being and be have a +morphological connection as well, but that is not true of am and was, +although they have a grammatical connection. Generally speaking, +inflectional stopwords exhibit many irregularities, which means that +stemming is not only not useful, but not possible, unless one builds into +the stemmer tables of exceptions. +

+ +

+Switching from English to French, consider être, the equivalent form +of be. It has about 40 different forms, including, +

+ +
+ suis   es   sommes   serez   étaient   fus   furent   sois   été +
+ +

+(and suis incidentally is a homonym, as part of the verb suivre.) +Passing all forms through a rule-based stemmer creates something of a +mess. An alternative approach is to recognise this group of words, and +other groups, and take special action. The recognition could take place +inside the stemmer, or be done before the stemmer is called. One special +action would be to stem (perhaps one should say ‘map’) all the forms to a +standard form, ETRE, to indicate that they are parts of the verb être. +Deciding what to do with the term ETRE, and it would probably be to +discard it, would be done outside the stemming process. Another special +action would be to recognize a whole class of stopwords and simply discard +them. +

+ +

+The strategy adopted will depend upon the underlying IR model, so what one +needs is the flexibility to create modified forms of a standard stemmer. +Usually we present Snowball stemmers in their unadorned form. Thereafter, +the addition of stopword tables is quite easy. +

+ +

8 Rare forms

+ +

+Stemmers do not need to handle linguistic forms that turn up only very +rarely, but in practice it is hard to design a stemmer with all rare forms +eliminated without there appearing to be some gaps in the thinking. For +this reason one should not worry too much about their occasional presence. +For example, in contemporary Portuguese, use of the second person plural +form of verbs has almost completely disappeared. Even so, endings for +those forms are included in the Portuguese stemmer. They appear in all the +grammar books, and will in any case be found in older texts. The habit of +putting in rare forms to ‘complete the picture’ is well established, and +usually passes unnoticed. An example is the list of English stopwords in +van Rijsbergen (1979). This includes yourselves, by analogy with +himself, herself etc., although yourselves is actually quite a rare +word in English. +

+ +

References

+ +

+Farber DJ, Griswold RE and Polonsky IP (1964) SNOBOL, a string manipulation +language. Journal of the Association for Computing Machinery, 11: 21-30. +

+ +

+Griswold RE, Poage JF and Polonsky IP (1968) The SNOBOL4 programming +language. Prentice-Hall, New Jersey. +

+ +

+Harman D (1991) How effective is suffixing? Journal of the American +Society for Information Science, 42: 7-15. +

+ +

+Jesperson O (1922) Language, its nature, origin and development. George
+Allen & Unwin, London.

+ +

+Kraaij W and Pohlmann R (1994) Porter’s stemming algorithm for Dutch. In
+Noordman LGM and de Vroomen WAM, eds. Informatiewetenschap 1994:
+Wetenschappelijke bijdragen aan de derde STINFON Conferentie, Tilburg,
+1994. pp. 167-180.

+ +

+Kraaij W and Pohlmann R (1995) Evaluation of a Dutch stemming algorithm.
+Rowley J, ed. The New Review of Document and Text Management, volume 1,
+Taylor Graham, London, 1995. pp. 25-43.

+ +

+Krovetz B (1995) Word sense disambiguation for large text databases. PhD +Thesis. Department of Computer Science, University of Massachusetts +Amherst. +

+ +

+Lennon M, Pierce DS, Tarry BD and Willett P (1981) An evaluation of some +conflation algorithms for information retrieval. Journal of Information +Science, 3: 177-183. +

+ +

+Lovins JB (1968) Development of a stemming algorithm. Mechanical +Translation and Computational Linguistics, 11: 22-31. +

+ +

+Palmer FR (1965) A linguistic study of the English verb. Longmans, London. +

+ +

+Popovic M and Willett P (1990) Processing of documents and queries in a +Slovene language free text retrieval system. Literary and Linguistic +Computing, 5: 182-190. +

+ +

+Porter MF (1980) An algorithm for suffix stripping. Program, 14: 130-137. +

+ +

+Rijsbergen CJ (1979) Information retrieval. Second edition. Butterworths, +London. +

+ +

+Savoy J (1993) Stemming of French words based on grammatical categories. +Journal of the American Society for Information Science, 44: 1-9. +

+ +

+Schinke R, Greengrass M, Robertson AM and Willett P (1996) A stemming +algorithm for Latin text databases. Journal of Documentation, 52: +172-187. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/texts/introduction.tt b/texts/introduction.tt new file mode 100644 index 0000000..16b7727 --- /dev/null +++ b/texts/introduction.tt @@ -0,0 +1,923 @@ +[% header('Snowball: A language for stemming algorithms') %] + +

Links

+ + + +

+M.F. Porter
+October 2001 +

+ +

Summary

+ +

+ Algorithmic stemmers continue to have great utility in IR, despite the + promise of out-performance by dictionary-based stemmers. Nevertheless, + there are few algorithmic descriptions of stemmers, and even when they + exist they are liable to misinterpretation. Here we look at the ideas + underlying stemming, and on this website define a language, Snowball, + in which stemmers can be exactly defined, and from which fast stemmer + programs in ANSI C or Java can be generated. A range of stemmers is presented + in parallel algorithmic and Snowball form, including the original + Porter stemmer for English. +

+ +

1 Introduction

+ +

+There are two main reasons for creating Snowball. One is the lack of +readily available stemming algorithms for languages +other than English. The other is the consciousness of a certain failure on +my part in promoting exact implementations of the stemming +algorithm described in (Porter 1980), which has come to be called the +Porter stemming algorithm. The first point needs some qualification: a +great deal of work has been done on stemmers in a wide range of natural +languages, both in their development and evaluation, (a complete +bibliography cannot be attempted here). But it is rare to see a stemmer +laid out in an unambiguous algorithmic form from which encodings in C, +Java, Perl etc might easily be made. When exact descriptions are +attempted, it is often with approaches to stemming that are +relatively simple, for example the Latin stemmer of Schinke (Schinke 1996), +or the Slovene stemmer of Popovic (Popovic 1990). A more complex, and +therefore more characteristic stemmer is the Kraaij-Pohlmann stemmer for +Dutch (Kraaij 1994), which is presented as open source code in ANSI C. To +extract an algorithmic description of their stemmer from the source code +proves to be quite hard. +

+ +

+The disparity between the Porter stemmer definition and many of its +purported implementations is much wider than is generally realised in the +IR community. Three problems seem to compound: one is a misunderstanding +of the meaning of the original algorithm, another is bugs in the +encodings, and a third is the almost irresistible urge of programmers +to add improvements. +

+ +

+For example, a Perl script advertised on the Web as an +implementation of the Porter algorithm was tested in October 2001, and it was +found that 14 percent of words were stemmed incorrectly when given a large sample +vocabulary. Most words of English have +very simple endings, so this means that it was effectively getting everything +wrong. At certain points on the Web are demonstrations of the Porter stemmer. +You type some English into a box and the stemmed words are displayed. These +are frequently faulty. (A good test is to type in agreement. It should stem +to agreement — the same word. If it stems to agreem there is an +error.) Researchers frequently pick up faulty versions of the stemmer and +report that they have applied ‘Porter stemming’, with the result that their +experiments are not quite repeatable. Researchers who work on stemming will +sometimes give incorrect examples of the behaviour of the Porter stemmer in +their published works. +

+ +

+To address all these problems I have tried to develop a rigorous system +for defining stemming algorithms. A language, Snowball, has been invented, +in which the rules of stemming algorithms can be expressed in a natural +way. Snowball is quite small, and can be learned by an experienced +programmer in an hour or so. On this website a number of foreign language +stemmers is presented (a) in Snowball, and (b) in a less formal +English-language description. (b) can be thought of as the program +comments for (a). A Snowball compiler translates each Snowball +definition into (c) an equivalent program in ANSI C or Java. Finally (d) +standard vocabularies of words and their stemmed equivalents are provided +for each stemmer. The combination of (a), (b), (c) and (d) +can be used to pin down the definition of a stemmer exactly, and it is +hoped that Snowball itself will be a useful resource in creating stemmers +in the future. +

+ +

2 Some ideas underlying stemming

+ +

+Work in stemming has produced a number of different approaches, albeit tied +together by a number of common assumptions. It is worthwhile looking at some +of them to see exactly where Snowball fits into the whole picture. +

+ +

+A point tacitly assumed in almost all of the stemming literature is that +stemmers are based upon the written, and not the spoken, form of the +language. This is also the assumption here. Historically, +grammarians often regarded the written language as the real language and +the spoken as a mere derivative form. Almost in reaction, many modern +linguists have taken a precisely opposite view (Palmer, 1965 pp 2-3). A +more balanced position is that the two languages are distinct though +connected, and require separate treatment. One can in fact imagine parallel +stemming algorithms for the spoken language, or rather for the phoneme +sequence into which the spoken language is transformed. Stress and +intonation could be used as clues for an indexing process in the same way +that punctuation and capitalisation are used as clues in the written +language. But currently stemmers work on the written language for the good +reason that there is so much of it available in machine readable form from +which to build our IR systems. Inevitably therefore the stemmers get +caught up in accidental details of orthography. In English, removing the +ing from rotting should be followed by undoubling the tt, +whereas in rolling we do not undouble the ll. In French, removing +the er from ennuyer should be followed by changing the y to +i, so that the resulting word conflates with ennui, and so on. +

+ +

+The idea of stemming is to improve IR performance generally by bringing +under one heading variant forms of a word which share a common meaning. +Harman (1991) was first to present compelling evidence that it may not do +so, when her experiments discovered no significant improvement with the +use of stemming. +Similarly Lennon (1981) discovered no appreciable difference between different +stemmers running on a constant collection. +Later work has modified this position however. Krovetz +(1995) found significant, although sometimes small, improvements across a +range of test collections. What he did discover is that the degree of +improvement varies considerably between different collections. +These tests were however done on collections in +English, and the reasonable assumption of IR researchers has always been that for +languages that are more highly inflected than English (and nearly all +are), greater improvements will be observed when stemming is applied. My +own view is that stemming helps regularise the +vocabulary of an IR system, and this leads to advantages that are not +easily quantifiable through standard IR experiments. For example, it helps +in presenting lists of terms associated with the query back to the IR user +in a relevance feedback cycle, which is one of the underlying ideas of the +probabilistic model. More will be said on the use of a stemmed vocabulary +in section 5. +

+ +

+Stemming is not a concept applicable to all languages. It is not, for +example, applicable in Chinese. But to languages of the Indo-European (*) +group (and most of the stemmers on this site are for Indo-European +languages), a common +pattern of word structure does emerge. Assuming words are written left to +right, the stem, or root of a word is on the left, and zero or more +suffixes may be added on the right. If the root is modified by this +process it will normally be at its right hand end. And also prefixes may +be added on the left. So unhappiness has a prefix un, a suffix +ness, and the y of happy has become i with the addition of +the suffix. Usually, prefixes alter meaning radically, so they are best +left in place (German and Dutch ge is an exception here). But suffixes +can, in certain circumstances, be removed. So for example happy and +happiness have closely related meanings, and we may wish to stem both +forms to happy, or happi. Infixes can occur, although rarely: +ge in German and Dutch, and zu in German. +

+ +

+One can make some distinction between root and stem. Lovins (1968) +sees the root as the stem minus any prefixes. But here we will +think of the stem as the residue of the stemming process, and the root as the +inner word from which the stemmed word derives, so we think of root to +some extent in an etymological way. It must be admitted that when you +start thinking hard about these concepts root, stem, suffix, +prefix ... they turn out to be very difficult indeed to define. +Nor do definitions, even if we arrive at them, help us much. After all, suffix +stripping is a practical aid in IR, not an exercise in linguistics or +etymology. This is especially true of the central concept of root. We +think of the etymological root of a word as something we can discover with +certainty from a dictionary, forgetting that etymology itself is a subject +with its own doubts and controversies (Jesperson 1922, Chapter XVI). +Indeed, Jesperson goes so far as to say that +

+ +
+

+ ‘It is of course impossible to say how great a proportion of the + etymologies given in dictionaries should strictly be classed under + each of the following heads: (1) certain, (2) probable, (3) + possible, (4) improbable, (5) impossible — but I am afraid the + first two classes would be the least numerous.’ +

+
+ +

+Here we will simply assume a common sense understanding of +the basic idea of stem and suffix, and hope that this proves sufficient +for designing and discussing stemming algorithms. +

+ +

+We can separate suffixes out into three basic classes, which will be +called d-, i- and a-suffixes. +

+ +

+An a-suffix, or attached suffix, is a particle word attached to another +word. (In the stemming literature they sometimes get referred to as +‘enclitics’.) In Italian, for example, personal pronouns attach to +certain verb forms: +

+ +
+
mandargli = mandare + gli = to send + to him +
mandarglielo = mandare + gli + lo = to send + it + to him +
+ +

+a-suffixes appear in Italian and Spanish, and also in Portuguese, although +in Portuguese they are separated by hyphen from the preceding word, which +makes them easy to eliminate. +

+ +

+An i-suffix, or inflectional suffix, forms part of the basic grammar of a +language, and is applicable to all words of a certain grammatical type, +with perhaps a small number of exceptions. In English for example, the past +of a verb is formed by adding ed. Certain modifications may be required +in the stem: +

+ +
+
fit + ed fitted (double t) +
love + ed loved (drop the final e of love) +
+ +

+but otherwise the rule applies in a regular way to all verbs in +contemporary English, with about 150 (Palmer, 1965) exceptional forms, +

+ +
+
bear beat become begin bend .... +
bore beat became began bent +
+ +

+A d-suffix, or derivational suffix, enables a new word, often with a +different grammatical category, or with a different sense, to be built from +another word. Whether a d-suffix can be attached is discovered not from +the rules of grammar, but by referring to a dictionary. So in English, +ness can be added to certain adjectives to form corresponding nouns +(littleness, kindness, foolishness ...) but not to all adjectives (not for +example, to big, cruel, wise ...) d-suffixes can be used to change +meaning, often in rather exotic ways. So in Italian astro means a sham +form of something else: +

+ +
+
medico + astro = medicastro = quack doctor +
poeta + astro = poetastro = poetaster +
+ +

+Generally i-suffixes follow d-suffixes. i-suffixes can precede d-suffixes, +for example lovingly, devotedness, but such cases are exceptional. To +be a little more precise, d-suffixes can sometimes be added to +participles. devoted, used adjectivally, is a participle derived from the +verb devote, and ly can be added to turn the adjective into an adverb, +or ness to turn it into a noun. The same feature occurs in other +Indo-European languages. +

+ +

+Sometimes it is hard to say whether a suffix is a d-suffix or i-suffix, +the comparative and superlative endings er, est of English for example. +

+ +

+A d-suffix can serve more than one function. In English, for example, +ly standardly turns an adjective into an adverb (greatly), but it +can also turn a noun into an adjective (kingly). In French, ement +also standardly turns an adjective into an adverb (grandement), but it +can also turn a verb into a noun (rapprochement). (Referring to the +French stemmer, this double use is ultimately why ement is tested for +being in the RV rather than the R2 region of the word being +stemmed.) +

+ +

+It is quite common for an i-suffix to serve more than one function. +In English, s can either be (1) a verb ending attached to third person +singular forms (runs, sings), (2) a noun ending indicating the plural +(dogs, cats) or (3) a noun ending indicating the possessive +(boy’s, girls’). By an orthographic convention now several hundred +years old, the possessive is written with an apostrophe, but +nowadays this is +frequently omitted in familiar phrases (a girls school). (Usage (3) is +relatively rare compared with (1) and (2): there are only nine uses of +’s in this document.) +

+ +

+Since the normal order of suffixes is d, i and a, we +can expect them to be removed +from the right in the order a, i and d. Usually we want to remove +all a- and i-suffixes, and some of the d-suffixes. +

+ +

+If the stemming process reduces two words to the same stem, they are said +to be conflated. +

+ +

3 Stemming errors, and the use of dictionaries

+ +

+One way of thinking of the relation between terms and documents in an IR +system is to see the documents as being about concepts, and the terms as +words that describe the concepts. Then, of course, one word can cover many +concepts, so pound can mean a unit of currency, a weight, an enclosure, +or a beating. Pound is a homonym. And one concept can be described by +many words, as with money, capital, cash, currency. These words +are synonyms. There is a many-many mapping therefore between the set of +terms and the set of concepts. Stemming is a process that transforms this +mapping to advantage, on the whole reducing the number of synonyms, but +occasionally creating new homonyms. It is worth remembering that what are +called stemming errors are usually just the introduction of new homonyms into +vocabularies that already contain very large numbers of homonyms. +

+ +

+Words which have no place in this term-concept mapping are those which +describe no concepts. The particle words of grammar, the, of, +and +..., known in IR as stopwords, fall into this category. Stopwords can be +useful for retrieval but only in searching for phrases, ‘to be or not to +be’, ‘do as you would be done by’ etc. This suggests that stemming +stopwords is not useful. More will be said on stopwords in section 7. +

+ +

+In the literature, a distinction is often made between +under-stemming, which is the error of taking off too small a suffix, and +over-stemming, which is the error of taking off too much. In French, for +example, croûtons is the plural of croûton, ‘a crust’, so to remove +ons would be over-stemming, while croulons is a verb form of crouler, +‘to totter’, so to remove s would be under-stemming. We would like to +introduce a further distinction between mis-stemming and over-stemming. +Mis-stemming is taking off what looks like an ending, but is really part +of the stem. Over-stemming is taking off a true ending which results in +the conflation of words of different meanings. +

+ +

+So for example ly can be removed from cheaply, but not from reply,
+because in reply ly is not a suffix. If it was removed, reply would
+conflate with rep (the commonly used short form of representative).
+Here we have a case of mis-stemming.

+ +

+To illustrate over-stemming, look at these four words, +

+ +
+
verb adjective + +
First pair: prove provable +
Second pair: probe probable +
+ +

+Morphologically, the two pairs are exactly parallel (in the written, if not +the spoken language). They also have a common etymology. All four words +derive from the Latin probare, ‘to prove or to test’, and the idea of +testing connects the meanings of the words. But the meanings are not parallel. +provable means ‘able to be proved’; probable does not mean ‘able to be +probed’. Most people would judge conflation of the first pair as correct, +and of the second pair, incorrect. In other words, to remove able from +probable is a case of over-stemming. +

+ +

+We can try to avoid mis-stemming and over-stemming by using a dictionary. +The dictionary can tell us that reply does not derive from rep, and +that the meanings of probe and probable are well separated in modern +English. It is important to realise however that a dictionary does not give +a complete solution here, but can be a tool to improve the conflation +process. +

+ +

+In Krovetz’s dictionary experiments (Krovetz 1995), he noted that in +looking up a past participle like suited, one is led either to suit or +to suite as plausible infinitive forms. suite can be rejected, +however, because the dictionary tells us that +although it is a word of English +it is not a verb form. Cases +like this (and Krovetz found about 60) had to be treated as exceptions. But +the form routed could +either derive from the verb rout or the verb route: +

+ +

+ At Waterloo Napoleon’s forces were routed
+ The cars were routed off the motorway +

+ +

+Such cases in English are extremely rare, but they are commoner in more +highly inflected languages. In French for example, affiliez can either be +the verb affiler, to sharpen, with imperfect ending iez, or the verb +affilier, to affiliate, with present indicative ending ez: +

+ +
+
vous affiliez = vous affil-iez = you sharpened +
vous affiliez = vous affili-ez = you affiliate +
+ +

+If the second is intended, removal of iez is mis-stemming. +

+ +

+With over-stemming we must rely upon the dictionary to separate meanings. +There are different ways of doing this, but all involve some degree of +reliance upon the lexicographers. Krovetz’s methods are no doubt best, +because the most objective: he uses several measures, but they are based on +the idea of measuring the similarity in +meaning of two words by the degree of overlap among the words used to define +them, and this is at a good remove from a lexicographer’s subjective +judgement about semantic similarity. +

+ +

+There is an interesting difference between mis-stemming and over-stemming
+to do with language history. The morphology of a language changes less
+rapidly than the meanings of the words in it. When extended to include a
+few archaic endings, such as ick as an alternative to ic, a stemmer for
+contemporary English can be applied to the English of 300 years ago.
+Mis-stemmings will be roughly the same, but the pattern of over-stemming will
+be different because of the changing meaning of words in the language. For
+example, relativity in the 19th century merely meant ‘the condition of
+being relative to’. With that meaning, it is acceptable to conflate it
+with relative.
+But with the 20th century meaning brought to it by
+Einstein, stemming to relativ is over-stemming.
+Here we see the word with the suffix changing its meaning, but it can happen
+the other way round. transpire has come to mean ‘happen’, and its old
+meaning of ‘exhalation’ or ‘breathing out’ is now effectively lost.
+(That is the bitter reality, although dictionaries still try to persuade us
+otherwise). But transpiration still carries the earlier meaning.
+So what was formerly an acceptable stemming may be judged now as
+an over-stemming, not because the word being stemmed has changed its meaning,
+but because some cognate word has changed its meaning.

+ +

+In these examples we are presenting words as if they had single meanings, but +the true picture is more complicated. Krovetz uses a model of word +meanings which is extremely helpful here. He makes a distinction between +homonyms and polysemes. The meaning of homonyms are quite unrelated. +For example, ground in the sense of ‘earth’, and ‘ground’ as the past +participle of ‘grind’ are homonyms. Etymologically homonyms have different +stories, and they usually have separate entries in a dictionary. But each +homonym form can have a range of polysemic forms, corresponding to different +shades of meaning. So ground can mean the earth’s surface, or the bottom +of the sea, or soil, or any base, and so the basis of an argument, and so on. +Over time new polysemes appear and old ones die. At any moment, the use of a +word will be common in some polysemic forms and rare in others. If a suffix is +attached to a word the new word will get a different set of polysemes. For +example, grounds = ground + s acquires the sense of ‘dregs’ and +‘estate lands’, loses the sense of ‘earth’, and shares the sense of +‘basis’. +

+ +

+Consider the conflation of mobility with mobile. mobile has +acquired two new polysemes not shared with mobility. One is the ‘mobile +art object’, common in the nursery. This arrived in the 1960s, and is +still in use. The other is the ‘mobile phone’ which is now very dominant, +although it may decline in the future when it has been replaced by some new +gadget with a different name. We might draw a graph of the degree of +separation of the meanings of mobility and mobile against time, +which would depend upon the number of polysemes and the intensity of their +use. What seemed like a valid conflation of the two words in 1940 may seem +to be invalid today. +

+ +

+In general therefore one can say that judgements about whether words are +over-stemmed change with time as the meanings of words in the language +change. +

+ +

+The use of a dictionary should reduce errors of mis-stemming and errors of +over-stemming. And, for English at least, the mis-stemming errors should +reduce well, even if there are problems with over-stemming errors. Of +course, it depends on the quality of the dictionary. A dictionary will need +to be very comprehensive, fully up-to-date, and with good word definitions +to achieve the best results. +

+ +

+Historically, stemmers have often been thought of as either +dictionary-based or algorithmic. The presentation of studies of stemming +in the literature has perhaps helped to create this division. In the +Lovins’ stemmer the algorithmic description is central. In accounts of +dictionary-based stemmers the emphasis tends to be on dictionary content +and structure, and IR effectiveness. Savoy’s French stemmer (Savoy, 1993) +is a good example of this. But the two approaches are not really distinct. +An algorithmic stemmer can include long exception lists that are +effectively mini-dictionaries, and a dictionary-based stemmer usually +needs a process for removing at least i-suffixes to make the look-up +in the dictionary possible. In fact in a language in which proper names +are inflected (Latin, Finnish, Russian ...), a dictionary-based stemmer +will need to remove i-suffixes independently of dictionary look-up, +because the proper names will not of course be in the dictionary. +

+ +

+The stemmers available on the Snowball website are all purely +algorithmic. They can be extended to include built-in exception lists, they +could be used in combination with a full dictionary, but they are still +presented here in their simplest possible form. Being purely algorithmic, +they are, or ought to be, inferior to the performance of well-constructed +dictionary-based stemmers. But they are still very useful, for the +following reasons: +

+ +
    +
  1. Algorithmic stemmers are (or can be made) very lean and very fast. The +stemmers presented here generate code that will process about a million +words in six seconds on a conventional 500MHz PC. Nowadays we can generate +very large IR systems with quite modest resources, and tools that assist in +this have value. +

    + +
  2. Despite the errors they can be seen to make, algorithmic stemmers still +give good practical results. As Krovetz (1995) says in surprise of the +algorithmic stemmer, ‘Why does it do so well?’ (page 89). +

    + +
  3. Dictionary-based stemmers require dictionary maintenance, to keep up +with an ever-changing language, and this is actually quite a problem. It +is not just that a dictionary created to assist stemming today will +probably require major updating in a few years time, but that a dictionary +in use for this purpose today may already be several years out of date. +

    +
+ +

+We can hazard an answer to Krovetz’s question, as to why algorithmic
+stemmers perform as well as they do, when they reveal so many cases of
+under-, over- and mis-stemming. Under-stemming is a fault, but by itself
+it will not degrade the performance of an IR system. Because of
+under-stemming words may fail
+to conflate that ought to have conflated, but you are, in a sense, no
+worse off than you were before. Mis-stemming is more serious, but again
+mis-stemming does not really matter unless it leads to false conflations,
+and that frequently does not happen. For example, removing the ate
+ending in English, can result in useful conflations (luxury,
+luxuriate; affection, affectionate), but very often produces
+stems that are not English words
+(enerv-ate, accommod-ate,
+deliber-ate etc). In the literature, these are normally
+classed as stemming errors — over-stemming — although in our nomenclature
+they are examples of mis-stemming.
+However these residual stems,
+enerv, accommod,
+deliber ... do not conflate with other word forms, and so behave in
+an IR system in the same way as if they still retained their ate
+ending. No false conflations arise, and so there is no over-stemming here.

+ +

+To summarise, one can say that just as a word can be over-stemmed +but not mis-stemmed (relativityrelative), so it can be +mis-stemmed but not over-stemmed (enervateenerv). And, of +course, even over-stemming does not matter, if the over-stemmed word falsely +conflates with other words that exist in the language, but are not +encountered in the IR +system which is being used. +

+ +

+Of the three types of error, +over-stemming is the most important, and +using a dictionary does not eliminate all over-stemmings, but does reduce their +incidence. +

+ +

4 Stemming as part of an indexing process

+ +

+Stemming is part of a composite process of extracting words from text and
+turning them into index terms in an IR system. Because stemming is somewhat
+complex and specialised, it is usually studied in isolation. Even so, it
+cannot really be separated from other aspects of the indexing process:

+ +
    +
  1. What is a word? For indexing purposes, a word in a European language is +a sequence of letters bounded by non-letters. But in English, an internal +apostrophe does not split a word, although it is not classed as a letter. +The treatment of these word boundary characters affects the stemmer. For +example, the Kraaij Pohlmann stemmer for Dutch (Kraaij, 1994, 1995) removes hyphen and +treats apostrophe as part of the alphabet (so ’s, ’tje and ’je are three +of their endings). The Dutch stemmer presented here assumes hyphen and +apostrophe have already been removed from the word to be stemmed. +

    + +
  2. What is a letter? Clearly letters define words, but different languages +use different letters, much confusion coming from the varied use of +accented Roman letters. +

    + +

    +English speakers, perhaps influenced by the ASCII character set, typically regard +their alphabet of a to z as the norm, and other forms (for example, Danish +å and ø, or German ß) as somewhat abnormal. But this is +an insular point of view. In Italian, for example, the letters +j, k, w, x and y are not part of the alphabet, and are +only seen in foreign words. We also tend to regard other alphabets as only +used for isolated languages, and that is not strictly true. Cyrillic is +used for a range of languages other than Russian, among which additional +letters and accented forms abound. +

    + +

    +In English, a broad definition of letter would be anything that could be +accepted as a pronounceable element of a word. This would include +accented Roman letters (naïve, Fauré), and certain ligature +forms (encyclopædia). It would exclude letters +of foreign alphabets, such as Greek and Cyrillic. +The a to z alphabet is one of those where letters come in +two styles, upper and lower case, which historically correspond (very roughly) to the +shapes you get if you use a chisel or a pen. Across all languages, the +exact relation of upper to lower case is not so easy to define. In Italian, +for example, an accented lower case letter is sometimes represented in +upper case by the unaccented letter followed by an apostrophe. (I have +seen this convention used in modern Italian news stories in machine +readable form.) +

    + +

    +In fact the Porter stemmer (which is for English) assumes the word being stemmed is +unaccented and in lower case. More exactly, a, e, i, o, +u, +and sometimes y, are +treated as vowels, and any other character gets treated as a consonant. +Each stemmer presented here assumes some degree of normalisation before it +receives the word, which is roughly (a) put all letters into lower case, +and (b) remove accents from letter-accent combinations that do not form +part of the alphabet of the language. Each stemmer declares the +letter-accent combinations for its language, and this can be used as a +guide for the normalisation, but even so, we can see from +the discussion above that (a) and (b) are not trivial +operations, and need to be done with care. +

    + +

    +(Incidentally, because the stemmers work on lower case words, turning +letters to upper case is sometimes used internally for flagging purposes.) +

    + +
  3. Identifying stopwords. Invariant stopwords are more easily found before +stemming is applied, but inflecting stopwords (for example, German kein, keine, keinem, +keinen ... ) may be easier to find after — because there are fewer forms. +There is a case for building stopword identification into the stemming +process. See section 7. +

    + +
  4. Conflating irregular forms. More will be said on this in section 6. +

    +
+ +

5 The use of stemmed words

+ +

+The idea of how stemmed words might be employed in an IR system has
+evolved slightly over the years. The Lovins stemmer (Lovins 1968) was
+developed not for indexing document texts, but the subject terms attached
+to them. With queries stemmed in the same way, the user needed no special
+knowledge of the form of the subject terms. Rijsbergen (1979, Chapter 2)
+assumes document text analysis: stopwords are removed, the remaining words
+are stemmed, and the resulting set of stemmed words constitutes the IR index
+(and this style of use is widespread today). More flexibility however is
+obtained by indexing all words in a text in an unstemmed form, and
+keeping a separate two-column relation which connects the words to their
+stemmed equivalents. The relation can be denoted by R(s, w), which means
+that s is the stemmed form of word w. From the relation we can get, for
+any word w, its unique stemmed form, stem(w), and for any stem s, the set
+of words, words(s), that stem to s.

+ +

+The user should not have to see the stemmed form of a word. If a list of +stems is to be presented back for query expansion, in place of +a stem, s, the user should be shown a single representative from the set +words(s), the one of highest frequency perhaps. The user should also +be able to choose for the whole query, or at a lower level for each word +in a query, whether or not it should be stemmed. In the absence of such +choices, the system can make its own +decisions. +Perhaps single word queries would not undergo +stemming; long queries would; stopwords would be removed +except in phrases. In query expansion, the system would work with stemmed +forms, ignoring stopwords. +

+ +

+Query expansion with stemming results in a much cleaner vocabulary list +than without, and this is a main strength of using a stemming process. +

+ +

+A question arises: if the user never sees the stemmed form, does its +appearance matter? The answer must be no, although +the Porter stemmer tries to make the unstemmed forms guessable from the stemmed +forms. For example, from appropri you can guess appropriate. At least, +trying to achieve this effect acts as a useful control. Similarly with the +other stemmers presented here, an attempt has been made to keep the +appearance of the stemmed forms as familiar as possible. +

+ +

6 Irregular grammatical forms

+ +

+All languages contain irregularities, but to what extent should they be +accommodated in a stemming algorithm? An English stemmer, for example, can +convert regular plurals to singular form without difficulty (boys, girls, +hands ...). Should it do the same with irregular plurals (men, children, +feet, ...)? Here we have irregular cases with i-suffixes, but there are +irregularities with d-suffixes, which Lovins calls ‘spelling exceptions’. +absorb/absorption and conceive/conception are examples of this. +Etymologically, the explanation of the first is that the Latin root, +sorbere, is an irregular verb, and of the second that the word +conceive comes to us from the French rather than straight from the Latin. +It is interesting that, even with no knowledge of the etymology, we do +recognise the connection between the words. +

+ +

+Lovins tries to solve spelling exceptions by formulating general respelling +rules (turn rpt into rb for example), but it might be easier to have +simply a list of exceptional stems. +

+ +

+The Porter stemmer does not handle irregularities at all, but from the +author’s own experience, this has never been an area of complaint. +Complaints in fact are always about false conflations, for example new +and news. +

+ +

+Possibly Lovins was right in wanting to resolve d-suffix irregularities, +and not being concerned about i-suffix irregularities. i-suffix +irregularities in English go with short, old words, that are either in very +common use (man/men, woman/women, see/saw ...) or are used only rarely +(ox/oxen, louse/lice, forsake/forsook ...). The latter class can be +ignored, and the former has its own problems which are not always solved +by stemming. For example man is a verb, and saw can mean a cutting +instrument, or, as a verb, can mean to use such an instrument. Conflation +of these forms frequently leads to an error like mis-stemming therefore. +

+ +

+An algorithmic stemmer really needs holes where the irregular forms can be +plugged in as necessary. This is more serviceable than attempting to +embed special lists of these irregular forms into software. +

+ +

7 Stopwords

+ +

+We have suggested that stemming stopwords is not useful. There is a +grammatical connection between being and be, but conflation of the two +forms has little use in IR because they have no shared meaning that would +entitle us to think of them as synonyms. being and be have a +morphological connection as well, but that is not true of am and was, +although they have a grammatical connection. Generally speaking, +inflectional stopwords exhibit many irregularities, which means that +stemming is not only not useful, but not possible, unless one builds into +the stemmer tables of exceptions. +

+ +

+Switching from English to French, consider être, the equivalent form +of be. It has about 40 different forms, including, +

+ +
+ suis   es   sommes   serez   étaient   fus   furent   sois   été +
+ +

+(and suis incidentally is a homonym, as part of the verb suivre.)
+Passing all forms through a rule-based stemmer creates something of a
+mess. An alternative approach is to recognise this group of words, and
+other groups, and take special action. The recognition could take place
+inside the stemmer, or be done before the stemmer is called. One special
+action would be to stem (perhaps one should say ‘map’) all the forms to a
+standard form, ETRE, to indicate that they are parts of the verb être.
+Deciding what to do with the term ETRE, and it would probably be to
+discard it, would be done outside the stemming process. Another special
+action would be to recognise a whole class of stopwords and simply discard
+them.

+ +

+The strategy adopted will depend upon the underlying IR model, so what one +needs is the flexibility to create modified forms of a standard stemmer. +Usually we present Snowball stemmers in their unadorned form. Thereafter, +the addition of stopword tables is quite easy. +

+ +

8 Rare forms

+ +

+Stemmers do not need to handle linguistic forms that turn up only very +rarely, but in practice it is hard to design a stemmer with all rare forms +eliminated without there appearing to be some gaps in the thinking. For +this reason one should not worry too much about their occasional presence. +For example, in contemporary Portuguese, use of the second person plural +form of verbs has almost completely disappeared. Even so, endings for +those forms are included in the Portuguese stemmer. They appear in all the +grammar books, and will in any case be found in older texts. The habit of +putting in rare forms to ‘complete the picture’ is well established, and +usually passes unnoticed. An example is the list of English stopwords in +van Rijsbergen (1979). This includes yourselves, by analogy with +himself, herself etc., although yourselves is actually quite a rare +word in English. +

+ +

References

+ +

+Farber DJ, Griswold RE and Polonsky IP (1964) SNOBOL, a string manipulation +language. Journal of the Association for Computing Machinery, 11: 21-30. +

+ +

+Griswold RE, Poage JF and Polonsky IP (1968) The SNOBOL4 programming +language. Prentice-Hall, New Jersey. +

+ +

+Harman D (1991) How effective is suffixing? Journal of the American +Society for Information Science, 42: 7-15. +

+ +

+Jesperson O (1922) Language, its nature, origin and development. George
+Allen & Unwin, London.

+ +

+Kraaij W and Pohlmann R (1994) Porter’s stemming algorithm for Dutch. In
+Noordman LGM and de Vroomen WAM, eds. Informatiewetenschap 1994:
+Wetenschappelijke bijdragen aan de derde STINFON Conferentie, Tilburg,
+1994. pp. 167-180.

+ +

+Kraaij W and Pohlmann R (1995) Evaluation of a Dutch stemming algorithm.
+Rowley J, ed. The New Review of Document and Text Management, volume 1,
+Taylor Graham, London, 1995. pp. 25-43.

+ +

+Krovetz B (1995) Word sense disambiguation for large text databases. PhD +Thesis. Department of Computer Science, University of Massachusetts +Amherst. +

+ +

+Lennon M, Pierce DS, Tarry BD and Willett P (1981) An evaluation of some +conflation algorithms for information retrieval. Journal of Information +Science, 3: 177-183. +

+ +

+Lovins JB (1968) Development of a stemming algorithm. Mechanical +Translation and Computational Linguistics, 11: 22-31. +

+ +

+Palmer FR (1965) A linguistic study of the English verb. Longmans, London. +

+ +

+Popovic M and Willett P (1990) Processing of documents and queries in a +Slovene language free text retrieval system. Literary and Linguistic +Computing, 5: 182-190. +

+ +

+Porter MF (1980) An algorithm for suffix stripping. Program, 14: 130-137. +

+ +

+Rijsbergen CJ (1979) Information retrieval. Second edition. Butterworths, +London. +

+ +

+Savoy J (1993) Stemming of French words based on grammatical categories. +Journal of the American Society for Information Science, 44: 1-9. +

+ +

+Schinke R, Greengrass M, Robertson AM and Willett P (1996) A stemming +algorithm for Latin text databases. Journal of Documentation, 52: +172-187. +

+ +[% footer %] diff --git a/texts/r1r2.html b/texts/r1r2.html new file mode 100644 index 0000000..89d3cda --- /dev/null +++ b/texts/r1r2.html @@ -0,0 +1,142 @@ + + + + + + + + + + Defining R1 and R2 - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Defining R1 and R2

+ + +

+Most of the stemmers make use of at least one of the region definitions R1 and +R2. They are defined as follows: +

+ +

+R1 is the region after the first non-vowel following a vowel, or is the null +region at the end of the word if there is no such non-vowel. +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or is +the null region at the end of the word if there is no such non-vowel. +

+ +

+The definition of vowel varies from language to language. In French, for +example, é is a vowel, and in Italian i between two other vowels is not a +vowel. The class of letters that constitute vowels is made clear in each stemmer. +

+ +

+Below, R1 and R2 are shown for a number of English words, +

+ +
+    b   e   a   u   t   i   f   u   l
+                      |<------------->|    R1
+                              |<----->|    R2
+
+ +

+Letter t is the first non-vowel following a vowel in beautiful, so R1 +is iful. In iful, the letter f is the first non-vowel following a +vowel, so R2 is ul. +

+ +
+    b   e   a   u   t   y
+                      |<->|    R1
+                        ->|<-  R2
+
+ +

+In beauty, the last letter y is classed as a vowel. Again, letter t is +the first non-vowel following a vowel, so R1 is just the last letter, y. +R1 contains no non-vowel, so R2 is the null region at the end of the word. +

+ +
+    b   e   a   u
+                ->|<-  R1
+                ->|<-  R2
+
+In beau, R1 and R2 are both null. +

+ +

+Other examples: +

+ +
+    a   n   i   m   a   d   v   e   r   s   i   o   n
+          |<----------------------------------------->|    R1
+                  |<--------------------------------->|    R2
+
+    s   p   r   i   n   k   l   e   d
+                      |<------------->|    R1
+                                    ->|<-  R2
+
+    e   u   c   h   a   r   i   s   t
+              |<--------------------->|    R1
+                          |<--------->|    R2
+
+ +
+
+
+ +
+ +
+ + + + diff --git a/texts/r1r2.tt b/texts/r1r2.tt new file mode 100644 index 0000000..6b2df04 --- /dev/null +++ b/texts/r1r2.tt @@ -0,0 +1,78 @@ +[% header('Defining R1 and R2') %] + +

+Most of the stemmers make use of at least one of the region definitions R1 and +R2. They are defined as follows: +

+ +

+R1 is the region after the first non-vowel following a vowel, or is the null +region at the end of the word if there is no such non-vowel. +

+ +

+R2 is the region after the first non-vowel following a vowel in R1, or is +the null region at the end of the word if there is no such non-vowel. +

+ +

+The definition of vowel varies from language to language. In French, for +example, é is a vowel, and in Italian i between two other vowels is not a +vowel. The class of letters that constitute vowels is made clear in each stemmer. +

+ +

+Below, R1 and R2 are shown for a number of English words, +

+ +
+    b   e   a   u   t   i   f   u   l
+                      |<------------->|    R1
+                              |<----->|    R2
+
+ +

+Letter t is the first non-vowel following a vowel in beautiful, so R1 +is iful. In iful, the letter f is the first non-vowel following a +vowel, so R2 is ul. +

+ +
+    b   e   a   u   t   y
+                      |<->|    R1
+                        ->|<-  R2
+
+ +

+In beauty, the last letter y is classed as a vowel. Again, letter t is +the first non-vowel following a vowel, so R1 is just the last letter, y. +R1 contains no non-vowel, so R2 is the null region at the end of the word. +

+ +
+    b   e   a   u
+                ->|<-  R1
+                ->|<-  R2
+
+In beau, R1 and R2 are both null. +

+ +

+Other examples: +

+ +
+    a   n   i   m   a   d   v   e   r   s   i   o   n
+          |<----------------------------------------->|    R1
+                  |<--------------------------------->|    R2
+
+    s   p   r   i   n   k   l   e   d
+                      |<------------->|    R1
+                                    ->|<-  R2
+
+    e   u   c   h   a   r   i   s   t
+              |<--------------------->|    R1
+                          |<--------->|    R2
+
+ +[% footer %] diff --git a/texts/vowelmarking.html b/texts/vowelmarking.html new file mode 100644 index 0000000..3485517 --- /dev/null +++ b/texts/vowelmarking.html @@ -0,0 +1,138 @@ + + + + + + + + + + Marking vowels as consonants - Snowball + + + + + + + + + + +
+
+
+ +
+
+

Marking vowels as consonants

+ + +

+Some of the algorithms begin with a step which puts letters which are +normally classed as vowels into upper case to indicate that they are to be +treated as consonants (the assumption being that the words are presented to +the stemmers in lower case). Upper case therefore acts as a flag indicating a +consonant. +

+ +

+For example, the English stemmer begins with the step +

+ Set initial y, or y after a vowel, to Y, +
+giving rise to the following changes, +

+ +
+
youth   →   Youth +
boy   →   boY +
boyish   →   boYish +
fly   →   fly +
flying   →   flying +
syzygy   →   syzygy +
+ +

+This process works from left to right, and +if a word contains Vyy, where V is a vowel, the first y is put +into upper case, but the second y is left alone, since it is preceded by +upper case Y which is a consonant. A sequence Vyyyyy... would be +changed to VYyYyY.... +

+ +

+The combination yy never occurs in English, although it might appear in +foreign words: +

+ +
+
sayyid   →   saYyid +
+ +

+(A sayyid, my dictionary tells me, is a descendant of Mohammed's daughter +Fatima.) But the left-to-right process is significant in other languages, for +example French. In French the rule for marking vowels as consonants is, +

+ +
+ Put into upper case u or i preceded and followed by a vowel, and + y preceded or followed by a vowel. Put u after q into upper + case. +
+ +

+which gives rise to, +

+ +
+
ennuie   →   ennuIe +
inquiétude   →   inqUiétude +
+ +

+In the first word, i is put into upper case since it has a vowel on both +sides of it. +In the second word, u after q is put into upper case, and again the +following i is left alone, since it is preceded by upper case U which +is a consonant. +

+ +
+
+
+ +
+ +
+ + + + diff --git a/texts/vowelmarking.tt b/texts/vowelmarking.tt new file mode 100644 index 0000000..13a24d2 --- /dev/null +++ b/texts/vowelmarking.tt @@ -0,0 +1,74 @@ +[% header('Marking vowels as consonants') %] + +

+Some of the algorithms begin with a step which puts letters which are +normally classed as vowels into upper case to indicate that they are to be +treated as consonants (the assumption being that the words are presented to +the stemmers in lower case). Upper case therefore acts as a flag indicating a +consonant. +

+ +

+For example, the English stemmer begins with the step +

+ Set initial y, or y after a vowel, to Y, +
+giving rise to the following changes, +

+ +
+
youth   →   Youth +
boy   →   boY +
boyish   →   boYish +
fly   →   fly +
flying   →   flying +
syzygy   →   syzygy +
+ +

+This process works from left to right, and +if a word contains Vyy, where V is a vowel, the first y is put +into upper case, but the second y is left alone, since it is preceded by +upper case Y which is a consonant. A sequence Vyyyyy... would be +changed to VYyYyY.... +

+ +

+The combination yy never occurs in English, although it might appear in +foreign words: +

+ +
+
sayyid   →   saYyid +
+ +

+(A sayyid, my dictionary tells me, is a descendant of Mohammed's daughter +Fatima.) But the left-to-right process is significant in other languages, for +example French. In French the rule for marking vowels as consonants is, +

+ +
+ Put into upper case u or i preceded and followed by a vowel, and + y preceded or followed by a vowel. Put u after q into upper + case. +
+ +

+which gives rise to, +

+ +
+
ennuie   →   ennuIe +
inquiétude   →   inqUiétude +
+ +

+In the first word, i is put into upper case since it has a vowel on both +sides of it. +In the second word, u after q is put into upper case, and again the +following i is left alone, since it is preceded by upper case U which +is a consonant. +

+ +[% footer %]