#!/usr/bin/perl -w
#
# Markov-style text babbler.
#
# Usage: script [corpus-file]        (corpus defaults to "markov.dat")
#
# Reads the whole corpus into memory, then emits up to $MAX_PARA randomly
# generated sentences.  Each sentence starts from $ORDER consecutive corpus
# words picked at random and is extended one word at a time: the next word
# is chosen at random among all words that follow the current seed anywhere
# in the corpus (case-insensitively).  After the initial seed, only the most
# recently emitted word is used as the seed.
#
# Environment:
#   TWITTER_MODE  if true, print a single sentence, stop before it reaches
#                 140 characters, and exit.
#
# Dies if the corpus cannot be opened or holds too few words.
use strict;
use List::Util qw{shuffle};
use List::MoreUtils qw{uniq};

my $MAX_PARA     = 30;      # maximum number of sentences to emit
my $MAX_SENTENCE = 50;      # maximum number of words appended per sentence
my $ORDER        = 4;       # number of corpus words in the initial seed
my $DEBUG        = undef;   # set true to trace seed words on STDERR
my $TWITTER_MODE = $ENV{TWITTER_MODE};

my $filename = shift(@ARGV) || "markov.dat";

# Slurp in the entire corpus.  Three-arg open with low-precedence "or" so a
# failed open is actually reported ("||" would bind to $filename instead).
my $corpus = do {
    open my $h, '<', $filename or die "$filename: $!\n";
    local $/;
    <$h>;
};

# split ' ' skips leading whitespace, so no empty first "word" is produced.
my @corpuswords = split ' ', $corpus;
my $count       = scalar(@corpuswords);
die "Your puny corpus cannot satisfy my cavernous maw\n"
    unless $count > $MAX_SENTENCE * 10;

# Index of the last sentence actually printed; decides the final newline.
my $last_para = 0;

# Generate a "paragraph"
PARA:
for my $para (1 .. $MAX_PARA) {

    # 5% chance of ending the chain here (never before the 4th sentence)
    last PARA if int(rand(100)) < 5 and $para > 3;

    # Pick a starting point, leaving room so the seed is followed by words.
    my $seedindex = int(rand($count - $MAX_SENTENCE));
    my @sentence  = @corpuswords[$seedindex .. $seedindex + $ORDER - 1];

    # The seed is a regex fragment: the quoted words joined by \s+.
    my $seedword    = join('\s+', map { quotemeta } @sentence);
    my $lettercount = length(join(" ", @sentence)) + 1;
    print STDERR "Seed words $seedword\n" if $DEBUG;

    # Build up a sentence word-by-word
    WORD:
    for my $word (1 .. $MAX_SENTENCE) {

        # 10% chance of stopping once a few words have been appended
        last WORD if int(rand(100)) < 10 and $word > 7;

        # Collect every word that follows the seed and pick one at random.
        # (?<!\S) anchors the seed at the start of a token so that, e.g.,
        # the seed "a" does not match the tail of "banana".
        my ($new_seedword)
            = shuffle($corpus =~ m/(?<!\S)$seedword\s+\K\S+/gi);

        # The seed may be the final word of the corpus: nothing follows it,
        # so end the sentence instead of pushing undef.
        last WORD unless defined $new_seedword;

        $lettercount += length($new_seedword) + 1;
        push @sentence, $new_seedword;
        $seedword = quotemeta($new_seedword);
        print STDERR "New seed word $seedword\n" if $DEBUG;

        # that last one pushed us over the edge. drop it and bail
        if ($TWITTER_MODE && $lettercount >= 140) {
            pop @sentence;
            print join(" ", @sentence);
            exit;
        }
    }

    # Every fifth sentence closes a paragraph with a blank line.
    print join(" ", @sentence), $para % 5 == 0 ? "\n\n" : " ";
    $last_para = $para;
    exit if $TWITTER_MODE;
}

# Finish with a newline unless the last printed sentence already closed a
# paragraph (the loop may have ended early, so test the real last index).
print "\n" unless $last_para % 5 == 0;