Paste: regex markov chain

#!/usr/bin/perl -w
use strict;
use List::Util qw{shuffle};
use List::MoreUtils qw{uniq};

# Tunables for the generator.
my $MAX_PARA     = 30;    # upper bound on the number of chains ("paragraphs") per run
my $MAX_SENTENCE = 50;    # upper bound on the number of words appended to one chain
my $ORDER        = 4;     # how many consecutive corpus words form the initial seed
my $DEBUG        = undef; # set to a true value to trace seed words on STDERR
my $TWITTER_MODE = $ENV{TWITTER_MODE}; # when true, emit a single chain cut off near 140 characters
my $filename     = shift @ARGV || "markov.dat";

# Slurp in the entire corpus, gimme the words, etc.
# Three-argument open with the low-precedence "or": with "||" the die was
# attached to $filename (always true), so a failed open went unreported.
my $corpus = do {
    open my $h, '<', $filename or die "$filename: $!\n";
    local $/;    # undef record separator: read the whole file in one go
    <$h>;
};

# split ' ' behaves like /\s+/ but also skips leading whitespace, so a corpus
# that starts with a blank line does not produce an empty first "word".
my @corpuswords = split(' ', $corpus);
my $count = scalar(@corpuswords);

# The seed picker needs room to choose a start index, so refuse tiny inputs.
die "Your puny corpus cannot satisfy my cavernous maw\n" unless $count > $MAX_SENTENCE*10;

# Generate up to $MAX_PARA "paragraphs". Each one is a chain of words grown
# from a random run of $ORDER consecutive corpus words; chains are separated
# by a space, with a blank line after every fifth chain.
my $last_para_printed = 0;    # number of the last chain actually written to STDOUT
PARA: for my $para (1 .. $MAX_PARA) {
    # 5% chance of ending the run here, but only once three chains are out
    last PARA if int(rand(100)) < 5 and $para > 3;

    # Pick a starting point and take $ORDER consecutive words as the seed.
    my $seedindex = int(rand($count - $MAX_SENTENCE));
    my @sentence  = @corpuswords[$seedindex .. $seedindex + $ORDER - 1];

    # The seed is kept as a regex source string: literal words joined by \s+.
    my $seedword = join('\s+', map { quotemeta } @sentence);

    my $lettercount = length(join(" ", @sentence)) + 1;
    print STDERR "Seed words $seedword\n" if $DEBUG;

    # Build up a sentence word-by-word
    WORD: for my $word (1 .. $MAX_SENTENCE) {
        # 10% chance of stopping, but only after seven words have been tried
        last WORD if int(rand(100)) < 10 and $word > 7;

        # Collect every word that follows the seed anywhere in the corpus.
        # (?<!\S) pins the seed to the start of a word so that e.g. "he" does
        # not match the tail of "the"; \K drops the seed from the match so
        # only the following word is returned.
        my @candidates = $corpus =~ m/(?<!\S)$seedword\s+\K\S+/gi;

        # Dead end (the seed only occurs at the very end of the corpus):
        # stop this chain instead of pushing undef and continuing with an
        # empty seed that would match every word.
        last WORD unless @candidates;

        # Uniform random pick; equivalent to taking the head of a shuffle.
        my $new_seedword = $candidates[rand @candidates];

        $lettercount += length($new_seedword) + 1;
        push @sentence, $new_seedword;

        # NOTE(review): from here on the seed is only the single word just
        # added, so $ORDER affects the initial seed alone.
        $seedword = quotemeta($new_seedword);

        print STDERR "New seed word $seedword\n" if $DEBUG;

        # that last one pushed us over the edge. drop it and bail
        if ($TWITTER_MODE && $lettercount >= 140) {
            pop @sentence;
            print join(" ", @sentence);
            exit;
        }
    }

    print join(" ", @sentence), $para % 5 == 0 ? "\n\n" : " ";
    $last_para_printed = $para;
    exit if $TWITTER_MODE;
}

# Finish with a newline unless the last chain printed already ended with a
# blank line. This is keyed on the chain actually printed, not on $MAX_PARA,
# because the loop above may stop early.
print "\n" unless $last_para_printed % 5 == 0;