Paste: draft

! Split a sentence into its words.
! A separator is any run of characters that are neither word characters
! nor hyphens, so hyphenated terms such as "well-known" stay in one piece.
! re-split yields an empty piece when the sentence starts or ends with a
! separator (e.g. a trailing "."); those are dropped with harvest so that
! they are not treated as words by callers.
: >words ( sentence -- words )
    R/ [^-\w]+/ re-split harvest [ >string ] map ;

! List every run of consecutive words in the sentence: for each run
! length from 1 up to the number of words, take all clumps of that
! length, then flatten the per-length lists into one sequence.
: maybe-phrases ( sentence -- seq )
    >words [ length [1,b] ] keep
    [ swap <clumps> ] curry map concat ;
    
! Candidate phrases (see maybe-phrases) that occur in both sentences.
: common2 ( sentence1 sentence2 -- common-parts )
    [ maybe-phrases ] dip maybe-phrases intersect ;
    
    
! The input is a set of sentences that each contain the translated word or phrase.
! The idea is to find the word sequences shared by every pair of sentences, then sort them by how often they occur.
! The point is to guess the right translation for a word from the sentences in which the word/phrase appeared.

! Count how often each shared phrase turns up across all pairs of sentences.
! Every 2-element combination of the input is reduced to its common phrases,
! the non-empty results are flattened, and each phrase is joined back into a
! space-separated string before counting.
! NOTE: despite the name "hash" in the stack effect, the result is the
! sorted-histogram output reversed, i.e. an ordered sequence of pairs.
: matching-subs ( sentences -- hash )
    2 [ first2 common2 ] map-combinations
    harvest concat
    [ " " join ] map
    sorted-histogram reverse ;