USING: accessors assocs fry general-utils grouping kernel locals math.order regexp sequences sorting splitting ;
IN: sentences

! Sentence splitting that tolerates dotted abbreviations.
!
! Strategy: find abbreviations such as a.m, N.B., N.B.A or N.B.A., mask
! their periods with the ideographic full stop "。" so the plain
! punctuation splitter ignores them, split, then restore the periods
! inside every resulting sentence.
!
! NOTE(review): trim-spaces, replace1 and replace-subseq are not defined
! in this file; presumably they come from general-utils -- confirm.

! Pattern for dotted abbreviations: a.m as well as N.B. and N.B.A or N.B.A.
! NOTE(review): the literal ends with a space before the closing "/".
! That space looks like part of the second alternative (which would then
! require a following space); `abbreviations` trims it off -- confirm
! against the regexp literal syntax of the Factor version in use.
: abbrev-pattern ( -- regexp )
    R/ ((\w\.)+\w)|((\w\.){2,}) / ;

! Order slices by their start offset (`from>>`) in the underlying sequence.
: sort-slices-by-from ( slices -- slices' )
    [ [ from>> ] bi@ <=> ] sort ;

! Split on runs of . ? ! : without any abbreviation handling.
! The pattern matches one arbitrary character followed by one or more
! terminators. `re-split` yields the text between matches and
! `all-matching-slices` yields the matches themselves; both lists are
! merged and ordered by start offset so that each text piece is followed
! by its terminator. `2 group` pairs them up (a trailing piece without a
! terminator ends up alone in the last group), each pair is concatenated
! and trimmed, and empty results are dropped.
: naive-sentence-split ( str -- lines )
    R/ .[.?!:]+/
    [ re-split ] [ all-matching-slices ] bi-curry bi append
    sort-slices-by-from
    2 group [ concat trim-spaces ] map harvest ;

! Collect every substring matched by abbrev-pattern, trimmed.
: abbreviations ( string -- abbrevs )
    abbrev-pattern all-matching-subseqs [ trim-spaces ] map ;

! Order strings by descending length, so that an abbreviation which is a
! prefix of a longer one (U.S vs. U.S.A.) is substituted after the longer
! one and cannot damage it.
: longest-first ( strings -- strings' )
    [ [ length ] bi@ swap <=> ] sort ;

! A regexp alone cannot cope with periods inside sentences, so each
! abbreviation is paired with a masked copy of itself in which "." has
! been replaced by "。". Result: a sequence of { abbrev masked } pairs,
! longest abbreviation first.
: (abbrev-replace-pairs) ( abbrevs -- pairs )
    longest-first dup [ { CHAR: . } "。" replace1 ] map zip ;

! Swap both sides of every pair: { a b } becomes { b a }.
: reverse-pairs ( pairs -- sriap )
    [ values ] [ keys ] bi zip ;

! Apply every { old new } pair to the text in order.
! NOTE(review): assumes replace-subseq has the effect
! ( text old new -- text' ) -- confirm against its definition.
: replace-all ( text pairs -- text' )
    [ first2 replace-subseq ] each ;

! Split text into sentences while keeping dotted abbreviations intact:
! mask the abbreviations, split naively, then unmask each sentence.
:: sentence-split ( text -- sentences )
    text abbreviations (abbrev-replace-pairs) :> pairs
    text pairs replace-all naive-sentence-split
    pairs reverse-pairs '[ _ replace-all ] map ;