diff --git a/algorithms/estonian/stemmer.html b/algorithms/estonian/stemmer.html index 62f5154..658f077 100644 --- a/algorithms/estonian/stemmer.html +++ b/algorithms/estonian/stemmer.html @@ -639,6 +639,277 @@

Links to resources

The full algorithm in Snowball

+
/* Estonian stemmer version 1.3
+
+Made by Linda Freienthal in January 2019.
+
+*/
+
+routines (
+    mark_regions
+    LONGV
+    special_noun_endings
+    case_ending
+    emphasis
+    plural_three_first_cases
+    undouble_kpt
+    i_plural
+    degrees
+    substantive
+    verb_exceptions
+    verb
+    nu
+)
+
+stringescapes {}
+
+stringdef a" '{U+00E4}' //a-umlaut ä
+stringdef o" '{U+00F6}' //o-umlaut ö
+stringdef o~ '{U+00F5}' //o with tilde õ
+stringdef u" '{U+00FC}' //u-umlaut ü
+stringdef s* '{U+0161}' //s-caron š
+stringdef z* '{U+017E}' //z-caron ž
+
+externals ( stem )
+integers ( p1 )
+groupings ( V1 RV KI GI)
+
+define V1 'aeiou{o~}{a"}{o"}{u"}'
+define RV 'aeiuo'
+define KI 'kptgbdshf{s*}z{z*}'
+define GI 'cjlmnqrvwxaeiou{o~}{a"}{o"}{u"}'
+
+define mark_regions as (
+
+    $p1 = limit
+
+    goto V1 gopast non-V1  setmark p1
+)
+
+
+backwardmode (
+
+    define emphasis as (
+        setlimit tomark p1 for ([substring])
+        test hop 4 //kingi -> kingi
+        among(
+            'gi' ((GI and not LONGV) delete) //jookse-me-gi, bioloogi -> bioloogi
+            'ki' (KI delete) //kookki -> kook
+        )
+
+    )
+
+    // Signals t is a replacement was made; f otherwise.
+    define verb as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'nuksin' 'nuksime' 'nuksid' 'nuksite' (delete) //seleta-nuksite
+            'ksin' 'ksid' 'ksime' 'ksite' (delete) //personal conditional: rõõmusta-ksin
+            'mata' (delete)
+            'takse' 'dakse' (delete) //impersonal: laul-dakse, luba-takse
+            'taks' 'daks' (delete)   //impersonal conditional: laul-daks, saade-taks
+            'akse' (<-'a')           //impersonal: tulla-kse, süüa-kse (-> söö), teha-kse (-> tegi), püüta-kse, leita-kse
+            'sime' (delete)          //pl1pst: saat-sime
+            'site' (delete)          //pl2pst: saat-site
+            'sin' (delete)           //sg1pst: laul-sin, saat-sin
+            'me' (V1 delete)         //pl1prs: laula-me, tule-me
+            'da' (V1 delete)         //da-infinitive: luba-da
+            'n' (V1 delete)          //sg1prs: kirjuta-n
+            'b' (V1 delete)          //sg3prs: laula-b
+        )
+    )
+
+    define LONGV as
+        among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}' '{u"}{u"}' '{o~}{o~}')
+
+    define i_plural as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'i' (RV) //raama-tu-i, lapsiku-i
+        )
+        delete
+    )
+
+    define special_noun_endings as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'lasse' (<- 'lase') //teadlasse -> teadlase
+            'last' (<- 'lase') //teadlast -> teadlase
+            'lane' (<- 'lase') //teadlane -> teadlase
+            'lasi'(<- 'lase') //teadlasi -> teadlase
+            'misse' (<- 'mise') //tegemisse -> tegemise
+            'mist' (<- 'mise') //kasutamist -> kasutamise
+            'mine' (<- 'mise') //tegemine -> tegemise
+            'misi' (<- 'mise') //kasutamisi -> kasutamise
+            'lisse' (<- 'lise') //rohelisse -> rohelise
+            'list' (<- 'lise') //tavalist -> tavalise
+            'line' (<- 'lise') //roheline -> rohelise
+            'lisi' (<- 'lise') //tavalisi -> tavalise
+        )
+    )
+
+    define case_ending as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'sse' (RV or LONGV) //illative: saapa-sse
+            'st' (RV or LONGV) //elative: saapa-st and kapsas-t
+            'le' (RV or LONGV) //allative: raama-tu-le
+            'lt' (RV or LONGV) //ablative: raama-tu-lt
+            'ga' (RV or LONGV) //komitatiive: õpetaja-ga
+            'ks' (RV or LONGV) //translative: õpetaja-ks
+            'ta' (RV or LONGV) //abessive and da-infinitive: õpetaja-ta and hüpa-ta
+            't'  (test hop 4)  //partitiiv, raamatu-t
+            's'  (RV or LONGV) //inessive and sg3pst: raama-tu-s and sõiti-s
+            'l'  (RV or LONGV) //adessive: raama-tu-l and kapsa-l.
+        )
+        delete
+    )
+
+
+    define plural_three_first_cases as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'ikkude' (<-'iku') //plural genitive: õnnelikkude -> õnneliku
+            'ikke' (<-'iku') //plural partitive: rahulikke -> rahuliku
+            'ike' (<-'iku') //plural genitive: ohtlike -> ohtliku
+            'sid' (not LONGV delete) //plural partitive and sg2pst and pl3pst: auto-sid and laul-sid (exludes plural nominative with words like gaasid, roosid)
+            // plural genitive and pl2: ministri-te, oluliste -> olulise and saada-te, laula-te;
+            // also torte -> tort (if not in compound word) and kokkuvõtte -> kokkuvõte and roheliste -> rohelise, tegemiste -> tegemise, teadlaste -> teadlase
+            'te' (
+                (test hop 4
+                  among (
+                    'mis' 'las' 'lis' (<- 'e')
+                    't' ()
+                    '' (delete)
+                  )
+                ) or <- 't'
+            )
+            'de' ((RV or LONGV) delete) //plural genitive: lauda-de
+            'd' ((RV or LONGV) delete) //plural nominative: voodi-d, rattai-d (rata), lapsiku-i-d
+        )
+    )
+
+    define nu as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'nu' //haka-nu(-te-ga)
+            'tu' //luba-tu(-d)
+            'du' //laul-du(-te-st)
+            'va' //laul-va(-te-le)
+        )
+        delete
+    )
+
+    define undouble_kpt as (
+        // undouble '-C1C1V' where C1 is k, p or t:
+        // mõtte(-le) -> mõte, hakka(-n) -> haka
+        //
+        // We only undouble if the vowel is in R1 to avoid modifying short
+        // non-words (mostly to avoid modifying acronyms/initialisms which such
+        // as "PPE").
+        V1 $(p1 <= cursor)
+        [substring] among(
+            'kk' (<- 'k')
+            'pp' (<- 'p')
+            'tt' (<- 't')
+        )
+    )
+
+    define degrees as (
+        setlimit tomark p1 for ([substring])
+        among(
+            'mai' (RV delete) //heleda-mai(-le)
+            'ma' (delete)  //tuge-va-ma(-le) and ma-infinitive: sõit-ma
+            'm' (RV delete) //kauge-i-m, rõõmsa-m
+        )
+    )
+
+    define substantive as (
+        do special_noun_endings
+        do case_ending
+        do plural_three_first_cases
+        do degrees
+        do i_plural
+        do nu
+    )
+)
+
+
+define verb_exceptions as (
+    [substring] atlimit
+    among(
+        'joon' 'jood' 'joob' 'joote' 'joome' 'joovad' (<-'joo')
+        'j{o~}in' 'j{o~}id' 'j{o~}i' 'j{o~}ime' 'j{o~}ite'  (<-'joo')
+        'joomata' 'juuakse' 'joodakse' 'juua' 'jooma' (<- 'joo')
+        'saan' 'saad' 'saab' 'saate' 'saame' 'saavad' (<-'saa')
+        'saaksin' 'saaksid' 'saaks' 'saaksite' 'saaksime' (<-'saa')
+        'sain' 'said' 'sai' 'saite' 'saime' (<-'saa')
+        'saamata' 'saadakse' 'saadi' 'saama' 'saada' (<-'saa')
+        'viin' 'viid' 'viib' 'viite' 'viime' 'viivad' (<-'viima')
+        'viiksin' 'viiksid' 'viiks' 'viiksite' 'viiksime' (<-'viima')
+        'viisin' 'viisite' 'viisime' (<-'viima')
+        'viimata' 'viiakse' 'viidi' 'viima' 'viia' (<-'viima')
+        'keen' 'keeb' 'keed' 'kees' 'keeme' 'keete' 'keevad' (<-'keesi')
+        'keeksin' 'keeks' 'keeksid' 'keeksime' 'keeksite' (<-'keesi')
+        'keemata' 'keema' 'keeta' 'keedakse' (<-'keesi')
+        'l{o"}{o"}n' 'l{o"}{o"}d' 'l{o"}{o"}b' 'l{o"}{o"}me' 'l{o"}{o"}te' 'l{o"}{o"}vad'  (<-'l{o"}{o"}')
+        'l{o"}{o"}ksin' 'l{o"}{o"}ksid' 'l{o"}{o"}ks' 'l{o"}{o"}ksime' 'l{o"}{o"}ksite' (<-'l{o"}{o"}')
+        'l{o"}{o"}mata' 'l{u"}{u"}akse' 'l{o"}{o"}dakse' 'l{o"}{o"}di' 'l{o"}{o"}ma' 'l{u"}{u"}a' (<-'l{o"}{o"}')
+        // Both looma and lööma have these same past tense forms
+        'l{o~}in' 'l{o~}id' 'l{o~}i' 'l{o~}ime' 'l{o~}ite' (<-'l{o~}i')
+        'loon' 'lood' 'loob' 'loome' 'loote' 'loovad' (<-'loo')
+        'looksin' 'looksid' 'looks' 'looksime' 'looksite' (<-'loo')
+        'loomata' 'luuakse' 'loodi' 'luua' 'looma' (<-'loo')
+        'k{a"}in' 'k{a"}ib' 'k{a"}id' 'k{a"}is' 'k{a"}ime' 'k{a"}ite' 'k{a"}ivad' (<-'k{a"}isi')
+        'k{a"}iksin' 'k{a"}iks' 'k{a"}iksid' 'k{a"}iksime' 'k{a"}iksite' (<-'k{a"}isi')
+        'k{a"}imata' 'k{a"}iakse' 'k{a"}idi' 'k{a"}ia' 'k{a"}ima' (<-'k{a"}isi')
+        's{o"}{o"}n' 's{o"}{o"}b' 's{o"}{o"}d' 's{o"}{o"}me' 's{o"}{o"}te' 's{o"}{o"}vad' (<-'s{o"}{o"}')
+        's{o"}{o"}ksin' 's{o"}{o"}ks' 's{o"}{o"}ksid' 's{o"}{o"}ksime' 's{o"}{o"}ksite' (<-'s{o"}{o"}')
+        's{o~}in' 's{o~}i' 's{o~}id' 's{o~}ime' 's{o~}ite' (<-'s{o"}{o"}')
+        's{o"}{o"}mata' 's{u"}{u"}akse' 's{o"}{o"}dakse' 's{o"}{o"}di' 's{o"}{o"}ma' 's{u"}{u"}a' (<-'s{o"}{o"}')
+        'toon' 'tood' 'toob' 'toote' 'toome' 'toovad' (<-'too')
+        'tooksin' 'tooksid' 'tooks' 'tooksite' 'tooksime' (<-'too')
+        't{o~}in' 't{o~}id' 't{o~}i' 't{o~}ime' 't{o~}ite' (<-'too')
+        'toomata' 'tuuakse' 'toodi' 'tooma' 'tuua' (<-'too')
+        'v{o~}in' 'v{o~}id' 'v{o~}ib' 'v{o~}ime' 'v{o~}is' 'v{o~}ite' 'v{o~}ivad' (<-'v{o~}isi')
+        'v{o~}iksin' 'v{o~}iksid' 'v{o~}iks' 'v{o~}iksime' 'v{o~}iksite' (<-'v{o~}isi')
+        'v{o~}imata' 'v{o~}idakse' 'v{o~}idi' 'v{o~}ida' 'v{o~}ima' (<-'v{o~}isi')
+        'j{a"}{a"}n' 'j{a"}{a"}d' 'j{a"}{a"}b' 'j{a"}{a"}me' 'j{a"}{a"}te' 'j{a"}{a"}vad' (<-'j{a"}{a"}ma')
+        'j{a"}{a"}ksin' 'j{a"}{a"}ksid' 'j{a"}{a"}ks' 'j{a"}{a"}ksime' 'j{a"}{a"}ksite' (<-'j{a"}{a"}ma')
+        'j{a"}ime' 'j{a"}ite' 'j{a"}in' 'j{a"}id' 'j{a"}i' (<-'j{a"}{a"}ma')
+        'j{a"}{a"}mata' 'j{a"}{a"}dakse' 'j{a"}{a"}da' 'j{a"}{a"}ma' 'j{a"}{a"}di' (<-'j{a"}{a"}ma')
+        'm{u"}{u"}n' 'm{u"}{u"}d' 'm{u"}{u"}b' 'm{u"}{u"}s' 'm{u"}{u"}me' 'm{u"}{u"}te' 'm{u"}{u"}vad' (<-'m{u"}{u"}si')
+        'm{u"}{u"}ksin' 'm{u"}{u"}ksid' 'm{u"}{u"}ks' 'm{u"}{u"}ksime' 'm{u"}{u"}ksite' (<-'m{u"}{u"}si')
+        'm{u"}{u"}mata' 'm{u"}{u"}akse' 'm{u"}{u"}di' 'm{u"}{u"}a' 'm{u"}{u"}ma' (<-'m{u"}{u"}si')
+        'loeb' 'loen' 'loed' 'loeme' 'loete' 'loevad' (<- 'luge')
+        'loeks' 'loeksin' 'loeksid' 'loeksime' 'loeksite' (<- 'luge')
+        'p{o~}en' 'p{o~}eb' 'p{o~}ed' 'p{o~}eme' 'p{o~}ete' 'p{o~}evad' (<- 'p{o~}de')
+        'p{o~}eksin' 'p{o~}eks' 'p{o~}eksid' 'p{o~}eksime' 'p{o~}eksite' (<- 'p{o~}de')
+        'laon' 'laob' 'laod' 'laome' 'laote' 'laovad' (<- 'ladu')
+        'laoksin' 'laoks' 'laoksid' 'laoksime' 'laoksite' (<- 'ladu')
+        'teeksin' 'teeks' 'teeksid' 'teeksime' 'teeksite' (<- 'tegi')
+        'teen' 'teeb' 'teed' 'teeme' 'teete' 'teevad' (<- 'tegi')
+        'tegemata' 'tehakse' 'tehti' 'tegema' 'teha' (<-'tegi')
+        'n{a"}en' 'n{a"}eb' 'n{a"}ed' 'n{a"}eme' 'n{a"}ete' 'n{a"}evad' (<-'n{a"}gi')
+        'n{a"}eksin' 'n{a"}eks' 'n{a"}eksid' 'n{a"}eksime' 'n{a"}eksite' (<-'n{a"}gi')
+        'n{a"}gemata' 'n{a"}hakse' 'n{a"}hti' 'n{a"}ha' 'n{a"}gema' (<-'n{a"}gi')
+    )
+)
+
+
+define stem as (
+    not verb_exceptions
+    // p1 isn't used by verb_exceptions
+    do mark_regions
+    backwards (
+        do emphasis
+        do ( verb or substantive )
+        do undouble_kpt
+
+    )
+)
+
+