! factor-finder: finds runs of consecutive words ("phrases") that occur
! more than once in a body of text and reports each with its count.
!
! NOTE: `!` comments extend to the end of the physical line, so every
! definition below must stay on its own lines; joining them onto one line
! comments out everything that follows the first `!`.
IN: factor-finder
USING: kernel ui.tools.search io.encodings.utf8 regexp splitting
grouping fry sets io.pathnames sequences assocs arrays math
math.order sorting io.files ;

! Tally the occurrences of each element of seq.
! Returns an array of { element count } pairs.
: count-dups ( seq -- seq )
    H{ } clone [ '[ _ inc-at ] each ] keep { } assoc-clone-like ;

! Keep only the { element count } pairs whose count is at least 2.
: just-2+ ( seq -- seq )
    [ second 2 >= ] filter ;

! Order { element count } pairs by count, largest count first.
: sort-by-count ( seq -- seq )
    [ [ second ] bi@ <=> ] sort reverse ;

! Read every path in seq as UTF-8 text and join the contents with
! single spaces into one string.
: read-as-one-string ( seq -- str )
    [ utf8 file-contents ] map " " join ;

! Replace each newline, carriage return, tab or space with a single
! space. Runs of whitespace therefore become runs of spaces, which
! split-to-words discards. The character class deliberately contains no
! `|`: inside [...] a `|` is a literal, so listing it would also erase
! pipe characters from the text.
: consistent-space ( str -- str )
    "[\n\r\t\s]" " " re-replace ;

! Split on spaces, dropping the empty strings produced by adjacent spaces.
: split-to-words ( str -- seq )
    " " split harvest ;

! Produce the clump sizes to try for seq, capped at n.
! With k = min( length of seq, n ) the result is { 2 .. k } when k > 1,
! { 1 } when k = 1 and { } when seq is empty.
: (prepare-clumps) ( seq n -- seq )
    ! Take the minimum first so no array the size of the whole input
    ! is allocated only to be truncated.
    [ length ] dip min >array
    [ 1 + ] map
    dup length 1 > [ rest ] when ;

! For each size in count-array, collect every overlapping run of that
! many consecutive words.
: make-words-groups ( count-array words -- groups )
    [ swap clump ] curry map ;

! Turn each run of words into a single space-separated phrase string.
: words-to-phrases ( seq -- seq )
    [ [ " " join ] map ] map ;

! Return { phrase count } pairs for every phrase of 2-10 consecutive
! words that occurs at least twice in long-string, most frequent first.
: find-all-duplicates ( long-string -- seq )
    consistent-space split-to-words
    ! 2-10 consecutive words
    [ 10 (prepare-clumps) ] keep
    make-words-groups words-to-phrases
    [ count-dups just-2+ ] map concat
    sort-by-count ;

! Run the duplicate search over everything all-source-files returns,
! skipping paths that contain "-docs". The result is left on the stack,
! hence the ( -- seq ) effect.
! NOTE(review): all-source-files is defined outside this file
! (presumably in ui.tools.search) - confirm it yields file paths.
: run-factor-finder ( -- seq )
    all-source-files
    [ "-docs" swap subseq? not ] filter
    ! Disabled limit, kept from the original:
    ! 10 short head
    read-as-one-string find-all-duplicates ;

! Run the duplicate search over the single UTF-8 file at path.
: find-phrases ( path -- seq )
    utf8 file-contents find-all-duplicates ;