Paste: regex markov chain
Author: | atax1a |
Mode: | perl |
Date: | Fri, 16 Dec 2022 07:41:24 |
Plain Text |
use strict;
use List::Util qw{shuffle};
use List::MoreUtils qw{uniq};
use warnings;

# Regex-driven Markov text generator.
#
# Usage:  script [corpus-file]        (corpus file defaults to "markov.dat")
#
# The whole corpus is held in memory as one string.  Each sentence starts
# from $ORDER consecutive corpus words picked at random; after that, every
# next word is chosen at random from all the words that follow the current
# seed somewhere in the corpus (found with a case-insensitive regex).
# Once the first word has been appended, the seed is just the most recently
# chosen word.
#
# If the TWITTER_MODE environment variable is true, at most one sentence is
# printed and it is cut off before its running length reaches 140.

my $MAX_PARA     = 30;      # upper bound on sentences ("paragraphs") emitted
my $MAX_SENTENCE = 50;      # upper bound on words appended to a seed
my $ORDER        = 4;       # number of corpus words used as the initial seed
my $DEBUG        = undef;   # set true to trace seed words on STDERR
my $TWITTER_MODE = $ENV{TWITTER_MODE};
my $filename     = shift(@ARGV) || "markov.dat";

# Slurp in the entire corpus, gimme the words, etc.
my $corpus = do {
    # Three-argument open with low-precedence "or": the filename is never
    # interpreted as a mode or pipe, and a failed open really does die.
    open my $h, '<', $filename or die "$filename: $!\n";
    local $/;               # undef record separator => read everything at once
    <$h>;
};
my @corpuswords = split(/\s+/, $corpus);
my $count       = scalar(@corpuswords);
die "Your puny corpus cannot satisfy my cavernous maw\n"
    unless $count > $MAX_SENTENCE * 10;

# Number of the last paragraph that was actually printed; decides whether
# the output still needs a terminating newline once the loop is done.
my $last_para = 0;

# Generate a "paragraph"
PARA: for my $para (1 .. $MAX_PARA) {
    # 5% chance of ending the chain here (never before the 4th paragraph)
    last PARA if int(rand(100)) < 5 and $para > 3;

    # Pick a starting point and take $ORDER consecutive words as the seed.
    my $seedindex = int(rand($count - $MAX_SENTENCE));
    my @sentence  = @corpuswords[$seedindex .. $seedindex + $ORDER - 1];

    # The seed as a regex: literal words separated by any run of whitespace.
    my $seedword = join('\s+', map { quotemeta } @sentence);

    # Running length of the sentence, counting one separator per word.
    my $lettercount = length(join(" ", @sentence)) + 1;
    print STDERR "Seed words $seedword\n" if $DEBUG;

    # Build up a sentence word-by-word
    WORD: for my $word (1 .. $MAX_SENTENCE) {
        # 10% chance of stopping once more than 7 words have been tried
        last WORD if int(rand(100)) < 10 and $word > 7;

        # Collect every word that follows the seed anywhere in the corpus
        # (\K drops the seed itself from the match) and pick one at random.
        my ($new_seedword) = shuffle($corpus =~ m/$seedword\s+\K\S+/gi);

        # Dead end: nothing follows this seed.  Stop here instead of pushing
        # undef and continuing with an empty pattern that matches anything.
        last WORD unless defined $new_seedword;

        $lettercount += length($new_seedword) + 1;
        push @sentence, $new_seedword;
        $seedword = quotemeta($new_seedword);
        print STDERR "New seed word $seedword\n" if $DEBUG;

        # that last one pushed us over the edge. drop it and bail
        if ($TWITTER_MODE && $lettercount >= 140) {
            pop @sentence;
            print join(" ", @sentence);
            exit;
        }
    }

    # Every fifth paragraph is followed by a blank line, the others by a space.
    print join(" ", @sentence), $para % 5 == 0 ? "\n\n" : " ";
    $last_para = $para;
    exit if $TWITTER_MODE;
}

# Finish the line unless the last printed paragraph already ended with "\n\n".
print "\n" unless $last_para % 5 == 0;
New Annotation