#! /usr/bin/perl -w

# Copyright (C) 2008 Paul Kuliniewicz <paul@kuliniewicz.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02111-1301, USA.

use strict;

use Getopt::Std;

# Parse the command line.
#
#   -n N   number of preceding tokens that make up a state (default 1;
#          0 also selects the default)
#   -w N   keep generating whole paragraphs until the running token
#          count reaches N
#   -p N   number of paragraphs to generate when -w is absent or 0
#          (default 1)
#
# Unknown flags and values that are not non-negative integers are
# rejected up front: a negative or non-numeric -n would leave the begin
# state empty and break the fixed-size state window used below.
my %opts;
getopts('n:w:p:', \%opts)
	or die "Usage: $0 [-n states] [-w words] [-p paragraphs] [file ...]\n";
foreach my $flag (qw(n w p))
{
	next unless defined $opts{$flag};
	die "$0: option -$flag requires a non-negative integer, not '$opts{$flag}'\n"
		unless $opts{$flag} =~ /\A\d+\z/;
}
my $states = $opts{'n'} || 1;
my $goal_words = $opts{'w'} || 0;
my $goal_paragraphs = $opts{'p'} || 0;

# Initialize state.
#
# %markov maps a state (the preceding tokens joined by single spaces)
# to a table of next-token => times-seen counts.  @begin_state is the
# state every paragraph starts from: one '<begin>' marker per token of
# context.

my %markov;
my @begin_state = map { '<begin>' } 1 .. $states;

# Build the Markov matrix.
#
# Input comes line by line from the files named on the command line (or
# standard input).  A line holding no tokens ends the current paragraph;
# every other line contributes its whitespace-separated tokens.

my @state = @begin_state;
my $in_paragraph = 0;	# true once the current paragraph has a token
while (my $line = <>)
{
	# Telling the difference between apostrophes and single quotes
	# is very problematic, so just pretend the issue doesn't exist:
	# tokens are whatever is left after splitting on whitespace, so
	# apostrophes, dashes and other punctuation stay attached to
	# their words.  A tokenizer that split punctuation off into
	# standalone tokens was tried and is left here, disabled:
	#s/(-{2,}|[^\w\s'-]+)/ $1 /g;

	# split ' ' skips leading whitespace and swallows the line
	# terminator, so no chomp is needed.
	my @words = split (' ', $line);

	if (!@words)
	{
		# Blank or whitespace-only line; this also covers the stray
		# "\r" that CRLF input leaves behind.  Only close a paragraph
		# that actually has tokens, so runs of blank lines do not
		# teach the model to produce empty paragraphs.
		if ($in_paragraph)
		{
			++$markov{join (' ', @state)}{'<end>'};
			@state = @begin_state;
			$in_paragraph = 0;
		}
		next;
	}

	foreach my $word (@words)
	{
		++$markov{join (' ', @state)}{$word};

		# Slide the fixed-size context window forward by one token.
		shift @state;
		push @state, $word;
	}
	$in_paragraph = 1;
}

# Input that does not finish with a blank line still ends a paragraph.
# Without this transition the final state would have no successor.
if ($in_paragraph)
{
	++$markov{join (' ', @state)}{'<end>'};
}

# Dump

#foreach my $state_vec (sort keys %markov)
#{
#	print "$state_vec:\n";
#	foreach my $dest (sort keys %{$markov{$state_vec}})
#	{
#		print "\t$dest: $markov{$state_vec}{$dest}\n";
#	}
#}

# Compute the lengths of each entry.
#
# For every state, %markov_lengths holds the sum of its successor
# counts, i.e. how many transitions out of that state were observed.

my %markov_lengths = ();
while (my ($state_vec, $successors) = each %markov)
{
	my $total = 0;
	$total += $_ foreach values %$successors;
	$markov_lengths{$state_vec} = $total;
}

# Dump

#foreach (sort keys %markov_lengths)
#{
#	print "$_: $markov_lengths{$_}\n";
#}
#exit 0;

# Generate the output.
#
# Without a word goal, emit a fixed number of paragraphs (one if no
# paragraph goal was given).  With a word goal, keep emitting whole
# paragraphs until the counts returned by generate_paragraph add up to
# the goal; the paragraph goal is not consulted in that case.

if (!$goal_words)
{
	generate_paragraph () foreach 1 .. ($goal_paragraphs || 1);
}
else
{
	my $emitted = 0;
	$emitted += generate_paragraph () while $emitted < $goal_words;
}

exit 0;


# Print one paragraph produced by a random walk over the Markov model.
#
# The walk starts from @begin_state.  At each step the next token is
# drawn from the current state's successors with probability
# proportional to how often each was seen.  Every word is printed
# followed by a space; the paragraph is closed with a blank line when
# the '<end>' token is drawn.
#
# A state with no recorded successors also closes the paragraph, after
# a diagnostic on STDERR, instead of printing an '<error>' marker into
# the generated text.
#
# Reads the file-level @begin_state, %markov and %markov_lengths; takes
# no arguments.
#
# Returns the number of tokens drawn.  The terminating token is
# included, so a paragraph of n words yields n + 1.
sub generate_paragraph
{
	my @current = @begin_state;
	my $count = 0;

	while (1)
	{
		my $state_vec = join (' ', @current);

		# Plain lookups, so an unknown state is neither autovivified
		# into %markov nor passed to rand as an undefined value.
		my $successors = $markov{$state_vec};
		my $total = $markov_lengths{$state_vec};

		# If nothing can be chosen, the paragraph simply ends here.
		my $word = '<end>';
		if ($successors && $total)
		{
			# Weighted pick: walk the successors, subtracting each
			# weight until the random offset falls inside one.
			my $which = int (rand $total);
			foreach my $candidate (keys %$successors)
			{
				my $weight = $successors->{$candidate};
				if ($which < $weight)
				{
					$word = $candidate;
					last;
				}
				$which -= $weight;
			}
		}
		else
		{
			warn "No known successor for state '$state_vec'; ending paragraph.\n";
		}

		++$count;

		if ($word eq '<end>')
		{
			print "\n\n";
			last;
		}

		print "$word ";

		# Slide the fixed-size context window forward by one token.
		shift @current;
		push @current, $word;
	}

	return $count;
}
