Paste: porter-stemmer evil universe
Author: | .. |
Mode: | factor |
Date: | Thu, 9 May 2024 19:54:26 |
Plain Text |
USING: combinators english kernel math parser sequences
splitting ;
IN: porter-stemmer
: bounds-check-of? ( seq n -- ? ) swap bounds-check? ; inline
: consonant? ( str i -- ? )
2dup nth-of dup vowel? [
3drop f
] [
CHAR: y = [
dup zero?
[ 2drop t ] [ 1 - consonant? not ] if
] [
2drop t
] if
] if ;
: skip-vowels ( str i -- str i )
2dup bounds-check-of? [
2dup consonant? [ 1 + skip-vowels ] unless
] when ;
: skip-consonants ( str i -- str i )
2dup bounds-check-of? [
2dup consonant? [ 1 + skip-consonants ] when
] when ;
: (consonant-seq) ( n str i -- n )
skip-vowels
2dup bounds-check-of? [
[ 1 + ] [ ] [ 1 + ] tri* skip-consonants 1 +
(consonant-seq)
] [
2drop
] if ;
: consonant-seq ( str -- n )
[ 0 ] dip 0 skip-consonants (consonant-seq) ;
: stem-vowel? ( str -- ? )
dup length <iota> [ consonant? ] with all? not ;
: double-consonant? ( str i -- ? )
dup 1 < [
2drop f
] [
2dup nth-of [ dup 1 - pick nth ] dip = [
consonant?
] [
2drop f
] if
] if ;
: consonant-end? ( seq n -- ? )
over length swap - consonant? ;
: last-is? ( str possibilities -- ? ) [ last ] dip member? ;
: cvc? ( str -- ? )
{
{ [ dup length 3 < ] [ drop f ] }
{ [ dup 1 consonant-end? not ] [ drop f ] }
{ [ dup 2 consonant-end? ] [ drop f ] }
{ [ dup 3 consonant-end? not ] [ drop f ] }
[ "wxy" last-is? not ]
} cond ;
: r ( str oldsuffix newsuffix -- str )
pick consonant-seq 0 > [ nip ] [ drop ] if append ;
: step1a ( str -- newstr )
dup last CHAR: s = [
{
{ [ "sses" ?tail ] [ "ss" append ] }
{ [ "ies" ?tail ] [ "i" append ] }
{ [ dup "ss" tail? ] [ ] }
{ [ "s" ?tail ] [ ] }
[ ]
} cond
] when ;
: -eed ( str -- str )
dup consonant-seq 0 > "ee" "eed" ? append ;
: -ed ( str -- str ? )
dup stem-vowel? [ [ "ed" append ] unless ] keep ;
: -ing ( str -- str ? )
dup stem-vowel? [ [ "ing" append ] unless ] keep ;
: -ed/ing ( str -- str )
{
{ [ "at" ?tail ] [ "ate" append ] }
{ [ "bl" ?tail ] [ "ble" append ] }
{ [ "iz" ?tail ] [ "ize" append ] }
{
[ dup dup length 1 - double-consonant? ]
[ dup "lsz" last-is? [ but-last-slice ] unless ]
}
{
[ t ]
[
dup consonant-seq 1 = over cvc? and
[ "e" append ] when
]
}
} cond ;
: step1b ( str -- newstr )
{
{ [ "eed" ?tail ] [ -eed ] }
{
[
{
{ [ "ed" ?tail ] [ -ed ] }
{ [ "ing" ?tail ] [ -ing ] }
[ f ]
} cond
] [ -ed/ing ]
}
[ ]
} cond ;
: step1c ( str -- newstr )
dup but-last-slice stem-vowel? [
"y" ?tail [ "i" append ] when
] when ;
: step2 ( str -- newstr )
{
{ [ "ational" ?tail ] [ "ational" "ate" r ] }
{ [ "tional" ?tail ] [ "tional" "tion" r ] }
{ [ "enci" ?tail ] [ "enci" "ence" r ] }
{ [ "anci" ?tail ] [ "anci" "ance" r ] }
{ [ "izer" ?tail ] [ "izer" "ize" r ] }
{ [ "bli" ?tail ] [ "bli" "ble" r ] }
{ [ "alli" ?tail ] [ "alli" "al" r ] }
{ [ "entli" ?tail ] [ "entli" "ent" r ] }
{ [ "eli" ?tail ] [ "eli" "e" r ] }
{ [ "ousli" ?tail ] [ "ousli" "ous" r ] }
{ [ "ization" ?tail ] [ "ization" "ize" r ] }
{ [ "ation" ?tail ] [ "ation" "ate" r ] }
{ [ "ator" ?tail ] [ "ator" "ate" r ] }
{ [ "alism" ?tail ] [ "alism" "al" r ] }
{ [ "iveness" ?tail ] [ "iveness" "ive" r ] }
{ [ "fulness" ?tail ] [ "fulness" "ful" r ] }
{ [ "ousness" ?tail ] [ "ousness" "ous" r ] }
{ [ "aliti" ?tail ] [ "aliti" "al" r ] }
{ [ "iviti" ?tail ] [ "iviti" "ive" r ] }
{ [ "biliti" ?tail ] [ "biliti" "ble" r ] }
{ [ "logi" ?tail ] [ "logi" "log" r ] }
[ ]
} cond ;
: step3 ( str -- newstr )
{
{ [ "icate" ?tail ] [ "icate" "ic" r ] }
{ [ "ative" ?tail ] [ "ative" "" r ] }
{ [ "alize" ?tail ] [ "alize" "al" r ] }
{ [ "iciti" ?tail ] [ "iciti" "ic" r ] }
{ [ "ical" ?tail ] [ "ical" "ic" r ] }
{ [ "ful" ?tail ] [ "ful" "" r ] }
{ [ "ness" ?tail ] [ "ness" "" r ] }
[ ]
} cond ;
: -ion ( str -- newstr )
[
"ion"
] [
dup "st" last-is? [ "ion" append ] unless
] if-empty ;
: step4 ( str -- newstr )
dup {
{ [ "al" ?tail ] [ ] }
{ [ "ance" ?tail ] [ ] }
{ [ "ence" ?tail ] [ ] }
{ [ "er" ?tail ] [ ] }
{ [ "ic" ?tail ] [ ] }
{ [ "able" ?tail ] [ ] }
{ [ "ible" ?tail ] [ ] }
{ [ "ant" ?tail ] [ ] }
{ [ "ement" ?tail ] [ ] }
{ [ "ment" ?tail ] [ ] }
{ [ "ent" ?tail ] [ ] }
{ [ "ion" ?tail ] [ -ion ] }
{ [ "ou" ?tail ] [ ] }
{ [ "ism" ?tail ] [ ] }
{ [ "ate" ?tail ] [ ] }
{ [ "iti" ?tail ] [ ] }
{ [ "ous" ?tail ] [ ] }
{ [ "ive" ?tail ] [ ] }
{ [ "ize" ?tail ] [ ] }
[ ]
} cond dup consonant-seq 1 > [ nip ] [ drop ] if ;
: remove-e? ( str -- ? )
dup consonant-seq dup 1 >
[ 2drop t ]
[ 1 = [ but-last-slice cvc? not ] [ drop f ] if ] if ;
: remove-e ( str -- newstr )
dup last CHAR: e = [
dup remove-e? [ but-last-slice ] when
] when ;
: ll->l ( str -- newstr )
{
{ [ dup last CHAR: l = not ] [ ] }
{ [ dup dup length 1 - double-consonant? not ] [ ] }
{ [ dup consonant-seq 1 > ] [ but-last-slice ] }
[ ]
} cond ;
: step5 ( str -- newstr ) remove-e ll->l ;
: stem ( str -- newstr )
dup length 2 <= [
step1a step1b step1c step2 step3 step4 step5 "" like
] unless ;
New Annotation