! Copyright (C) 2008 Marc Fauconneau. ! See http://factorcode.org/license.txt for BSD license. USING: kernel arrays math accessors sequences math.vectors math.order sorting binary-search sets assocs fry ; IN: suffix-array ! this suffix array is a sorted array of suffixes ! query is efficient through binary searches : suffixes ( string -- suffixes-seq ) dup length [ tail-slice ] with map ; : new-sa ( strings -- sa ) [ suffixes ] map concat [ <=> ] sort ; : prefix<=> ( seq begin -- <=> ) [ swap <=> ] [ head? ] 2bi [ drop +eq+ ] [ ] if ; : find-index ( sa begin -- index ) '[ _ prefix<=> ] search drop ; ! note, slices are [m,n) and we want (m,n) so we increment : from-to ( index sa begin -- from to ) '[ _ head? not ] [ find-last-from drop 1+ ] [ find-from drop ] 3bi ; : query-sa ( begin sa -- matches ) [ swap [ find-index ] 2keep from-to ] keep [ seq>> ] map prune ; ! to search on word names : new-word-sa ( words -- sa ) [ name>> ] map new-sa ; : name-word-map ( words -- map ) dup [ name>> V{ } clone ] H{ } map>assoc [ '[ dup name>> _ at push ] each ] keep ; : name>words ( map string -- words ) swap at ; : query-word-sa ( map begin sa -- matches ) query-sa [ name>words ] with map concat ; ! usage example : ! clear all-words 100 head dup name-word-map "test" rot new-word-sa query-sa .