Paste: porter-stemmer evil universe

Author: ..
Mode: factor
Date: Thu, 9 May 2024 19:54:26
Plain Text |
USING: combinators english kernel math parser sequences
splitting ;
IN: porter-stemmer

: bounds-check-of? ( seq n -- ? ) swap bounds-check? ; inline

: consonant? ( str i -- ? )
    2dup nth-of dup vowel? [
        3drop f
    ] [
        CHAR: y = [
            dup zero?
            [ 2drop t ] [ 1 - consonant? not ] if
        ] [
            2drop t
        ] if
    ] if ;

: skip-vowels ( str i -- str i )
    2dup bounds-check-of? [
        2dup consonant? [ 1 + skip-vowels ] unless
    ] when ;

: skip-consonants ( str i -- str i )
    2dup bounds-check-of? [
        2dup consonant? [ 1 + skip-consonants ] when
    ] when ;

: (consonant-seq) ( n str i -- n )
    skip-vowels
    2dup bounds-check-of? [
        [ 1 + ] [ ] [ 1 + ] tri* skip-consonants 1 +
        (consonant-seq)
    ] [
        2drop
    ] if ;

: consonant-seq ( str -- n )
    [ 0 ] dip 0 skip-consonants (consonant-seq) ;

: stem-vowel? ( str -- ? )
    dup length <iota> [ consonant? ] with all? not ;

: double-consonant? ( str i -- ? )
    dup 1 < [
        2drop f
    ] [
        2dup nth-of [ dup 1 - pick nth ] dip = [
            consonant?
        ] [
            2drop f
        ] if
    ] if ;

: consonant-end? ( seq n -- ? )
    over length swap - consonant? ;

: last-is? ( str possibilities -- ? ) [ last ] dip member? ;

: cvc? ( str -- ? )
    {
        { [ dup length 3 < ] [ drop f ] }
        { [ dup 1 consonant-end? not ] [ drop f ] }
        { [ dup 2 consonant-end? ] [ drop f ] }
        { [ dup 3 consonant-end? not ] [ drop f ] }
        [ "wxy" last-is? not ]
    } cond ;

: r ( str oldsuffix newsuffix -- str )
    pick consonant-seq 0 > [ nip ] [ drop ] if append ;

: step1a ( str -- newstr )
    dup last CHAR: s = [
        {
            { [ "sses" ?tail ] [ "ss" append ] }
            { [ "ies" ?tail ] [ "i" append ] }
            { [ dup "ss" tail? ] [ ] }
            { [ "s" ?tail ] [ ] }
            [ ]
        } cond
    ] when ;

: -eed ( str -- str )
    dup consonant-seq 0 > "ee" "eed" ? append ;

: -ed ( str -- str ? )
    dup stem-vowel? [ [ "ed" append ] unless ] keep ;

: -ing ( str -- str ? )
    dup stem-vowel? [ [ "ing" append ] unless ] keep ;

: -ed/ing ( str -- str )
    {
        { [ "at" ?tail ] [ "ate" append ] }
        { [ "bl" ?tail ] [ "ble" append ] }
        { [ "iz" ?tail ] [ "ize" append ] }
        {
            [ dup dup length 1 - double-consonant? ]
            [ dup "lsz" last-is? [ but-last-slice ] unless ]
        }
        {
            [ t ]
            [
                dup consonant-seq 1 = over cvc? and
                [ "e" append ] when
            ]
        }
    } cond ;

: step1b ( str -- newstr )
    {
        { [ "eed" ?tail ] [ -eed ] }
        {
            [
                {
                    { [ "ed" ?tail ] [ -ed ] }
                    { [ "ing" ?tail ] [ -ing ] }
                    [ f ]
                } cond
            ] [ -ed/ing ]
        }
        [ ]
    } cond ;

: step1c ( str -- newstr )
    dup but-last-slice stem-vowel? [
        "y" ?tail [ "i" append ] when
    ] when ;

: step2 ( str -- newstr )
    {
        { [ "ational" ?tail ] [ "ational" "ate"  r ] }
        { [ "tional"  ?tail ] [ "tional"  "tion" r ] }
        { [ "enci"    ?tail ] [ "enci"    "ence" r ] }
        { [ "anci"    ?tail ] [ "anci"    "ance" r ] }
        { [ "izer"    ?tail ] [ "izer"    "ize"  r ] }
        { [ "bli"     ?tail ] [ "bli"     "ble"  r ] }
        { [ "alli"    ?tail ] [ "alli"    "al"   r ] }
        { [ "entli"   ?tail ] [ "entli"   "ent"  r ] }
        { [ "eli"     ?tail ] [ "eli"     "e"    r ] }
        { [ "ousli"   ?tail ] [ "ousli"   "ous"  r ] }
        { [ "ization" ?tail ] [ "ization" "ize"  r ] }
        { [ "ation"   ?tail ] [ "ation"   "ate"  r ] }
        { [ "ator"    ?tail ] [ "ator"    "ate"  r ] }
        { [ "alism"   ?tail ] [ "alism"   "al"   r ] }
        { [ "iveness" ?tail ] [ "iveness" "ive"  r ] }
        { [ "fulness" ?tail ] [ "fulness" "ful"  r ] }
        { [ "ousness" ?tail ] [ "ousness" "ous"  r ] }
        { [ "aliti"   ?tail ] [ "aliti"   "al"   r ] }
        { [ "iviti"   ?tail ] [ "iviti"   "ive"  r ] }
        { [ "biliti"  ?tail ] [ "biliti"  "ble"  r ] }
        { [ "logi"    ?tail ] [ "logi"    "log"  r ] }
        [ ]
    } cond ;

: step3 ( str -- newstr )
    {
        { [ "icate" ?tail ] [ "icate" "ic" r ] }
        { [ "ative" ?tail ] [ "ative" ""   r ] }
        { [ "alize" ?tail ] [ "alize" "al" r ] }
        { [ "iciti" ?tail ] [ "iciti" "ic" r ] }
        { [ "ical"  ?tail ] [ "ical"  "ic" r ] }
        { [ "ful"   ?tail ] [ "ful"   ""   r ] }
        { [ "ness"  ?tail ] [ "ness"  ""   r ] }
        [ ]
    } cond ;

: -ion ( str -- newstr )
    [
        "ion"
    ] [
        dup "st" last-is? [ "ion" append ] unless
    ] if-empty ;

: step4 ( str -- newstr )
    dup {
        { [ "al"    ?tail ] [ ] }
        { [ "ance"  ?tail ] [ ] }
        { [ "ence"  ?tail ] [ ] }
        { [ "er"    ?tail ] [ ] }
        { [ "ic"    ?tail ] [ ] }
        { [ "able"  ?tail ] [ ] }
        { [ "ible"  ?tail ] [ ] }
        { [ "ant"   ?tail ] [ ] }
        { [ "ement" ?tail ] [ ] }
        { [ "ment"  ?tail ] [ ] }
        { [ "ent"   ?tail ] [ ] }
        { [ "ion"   ?tail ] [ -ion ] }
        { [ "ou"    ?tail ] [ ] }
        { [ "ism"   ?tail ] [ ] }
        { [ "ate"   ?tail ] [ ] }
        { [ "iti"   ?tail ] [ ] }
        { [ "ous"   ?tail ] [ ] }
        { [ "ive"   ?tail ] [ ] }
        { [ "ize"   ?tail ] [ ] }
        [ ]
    } cond dup consonant-seq 1 > [ nip ] [ drop ] if ;

: remove-e? ( str -- ? )
    dup consonant-seq dup 1 >
    [ 2drop t ]
    [ 1 = [ but-last-slice cvc? not ] [ drop f ] if ] if ;

: remove-e ( str -- newstr )
    dup last CHAR: e = [
        dup remove-e? [ but-last-slice ] when
    ] when ;

: ll->l ( str -- newstr )
    {
        { [ dup last CHAR: l = not ] [ ] }
        { [ dup dup length 1 - double-consonant? not ] [ ] }
        { [ dup consonant-seq 1 > ] [ but-last-slice ] }
        [ ]
    } cond ;

: step5 ( str -- newstr ) remove-e ll->l ;

: stem ( str -- newstr )
    dup length 2 <= [
        step1a step1b step1c step2 step3 step4 step5 "" like
    ] unless ;

New Annotation

Summary:
Author:
Mode:
Body: