#!/usr/bin/python

import sys, re, string, whrandom
from math import *
from cPickle import dump, load

SWAPLIST = {"i": "you",
	"me": "you",
	"mine": "yours",
	"my": "your",
	"myself": "yourself",
	"no": "yes",
	"why": "because",
	"yes": "no",
	"you": "i",
	"you": "me",
	"your": "my",
	"yours": "mine",
	"yourself": "myself"}

STOPLIST = [
	"s",
	"t",
	"a",
	"ability",
	"able",
	"about",
	"absolute",
	"absolutely",
	"across",
	"actual",
	"actually",
	"after",
	"afternoon",
	"again",
	"against",
	"ago",
	"agree",
	"all",
	"almost",
	"along",
	"already",
	"although",
	"always",
	"am",
	"an",
	"and",
	"another",
	"any",
	"anyhow",
	"anything",
	"anyway",
	"are",
	"aren",
	"around",
	"as",
	"at",
	"away",
	"back",
	"bad",
	"be",
	"been",
	"before",
	"behind",
	"being",
	"believe",
	"belong",
	"best",
	"better",
	"between",
	"big",
	"bigger",
	"biggest",
	"bit",
	"both",
	"buddy",
	"but",
	"by",
	"call",
	"called",
	"calling",
	"came",
	"can",
	"cannot",
	"care",
	"caring",
	"case",
	"catch",
	"caught",
	"certain",
	"certainly",
	"change",
	"close",
	"closer",
	"come",
	"coming",
	"common",
	"constant",
	"constantly",
	"could",
	"current",
	"day",
	"days",
	"derived",
	"describe",
	"describes",
	"determine",
	"determines",
	"did",
	"didn",
	"do",
	"does",
	"doesn",
	"doing",
	"don",
	"done",
	"doubt",
	"down",
	"each",
	"earlier",
	"early",
	"else",
	"enjoy",
	"especially",
	"even",
	"ever",
	"every",
	"everybody",
	"everyone",
	"everything",
	"fact",
	"fair",
	"fairly",
	"far",
	"fellow",
	"few",
	"find",
	"fine",
	"for",
	"form",
	"found",
	"from",
	"full",
	"further",
	"gave",
	"get",
	"getting",
	"give",
	"given",
	"giving",
	"go",
	"going",
	"gone",
	"good",
	"got",
	"gotten",
	"great",
	"had",
	"has",
	"hasn",
	"have",
	"haven",
	"having",
	"held",
	"here",
	"high",
	"hold",
	"holding",
	"how",
	"if",
	"in",
	"indeed",
	"inside",
	"instead",
	"into",
	"is",
	"isn",
	"it",
	"it",
	"its",
	"just",
	"keep",
	"kind",
	"knew",
	"know",
	"known",
	"large",
	"larger",
	"largets",
	"last",
	"late",
	"later",
	"least",
	"less",
	"let",
	"let",
	"level",
	"likes",
	"little",
	"long",
	"longer",
	"look",
	"looked",
	"looking",
	"looks",
	"low",
	"made",
	"make",
	"making",
	"many",
	"mate",
	"may",
	"maybe",
	"mean",
	"meet",
	"mention",
	"mere",
	"might",
	"moment",
	"more",
	"morning",
	"most",
	"move",
	"much",
	"must",
	"near",
	"nearer",
	"never",
	"next",
	"nice",
	"nobody",
	"none",
	"noon",
	"noone",
	"not",
	"note",
	"nothing",
	"now",
	"obvious",
	"of",
	"off",
	"on",
	"once",
	"only",
	"onto",
	"opinion",
	"or",
	"other",
	"our",
	"out",
	"over",
	"own",
	"part",
	"particular",
	"particularly",
	"perhaps",
	"person",
	"piece",
	"place",
	"pleasant",
	"please",
	"popular",
	"prefer",
	"pretty",
	"put",
	"quite",
	"real",
	"really",
	"receive",
	"received",
	"recent",
	"recently",
	"related",
	"result",
	"resulting",
	"results",
	"said",
	"same",
	"saw",
	"say",
	"saying",
	"see",
	"seem",
	"seemed",
	"seems",
	"seen",
	"seldom",
	"sense",
	"set",
	"several",
	"shall",
	"short",
	"shorter",
	"should",
	"show",
	"shows",
	"simple",
	"simply",
	"small",
	"so",
	"some",
	"someone",
	"something",
	"sometime",
	"sometimes",
	"somewhere",
	"sort",
	"sorts",
	"spend",
	"spent",
	"still",
	"stuff",
	"such",
	"suggest",
	"suggestion",
	"suppose",
	"sure",
	"surely",
	"surround",
	"surrounds",
	"take",
	"taken",
	"taking",
	"tell",
	"than",
	"thank",
	"thanks",
	"that",
	"that",
	"thats",
	"the",
	"their",
	"them",
	"then",
	"there",
	"therefore",
	"these",
	"they",
	"thing",
	"things",
	"this",
	"those",
	"though",
	"thoughts",
	"thouroughly",
	"through",
	"tiny",
	"to",
	"today",
	"together",
	"told",
	"tomorrow",
	"too",
	"total",
	"totally",
	"touch",
	"try",
	"twice",
	"under",
	"understand",
	"understood",
	"until",
	"up",
	"us",
	"used",
	"using",
	"usually",
	"various",
	"very",
	"want",
	"wanted",
	"wants",
	"was",
	"watch",
	"way",
	"ways",
	"we",
	"re",
	"well",
	"went",
	"were",
	"what",
	"what",
	"whatever",
	"whats",
	"when",
	"where",
	"where",
	"which",
	"while",
	"whilst",
	"who",
	"who",
	"whom",
	"will",
	"wish",
	"with",
	"within",
	"wonder",
	"wonderful",
	"worse",
	"worst",
	"would",
	"wrong",
	"yesterday",
	"yet"]

class Entry:
    def __init__(self):
        self.freqs = {}
	self.count = 0

    def add_symbol(self, symbol):
        try: self.freqs[symbol] = self.freqs[symbol] + 1
	except KeyError: self.freqs[symbol] = 1
	self.count = self.count + 1
	
    def remove_symbol(self, symbol):
        """Remove a symbol from this entry, returning the remaining count"""
        try: freq = self.freqs[symbol]
	except KeyError: pass
	else: 
	    del self.freqs[symbol]
	    self.count = self.count - freq
	    return self.count

    def choose(self):
        """Pick a random symbol from the list of symbols with the correct probability."""
        total = 0
	n = whrandom.randint(1, self.count)
	for symbol, freq in self.freqs.items():
	    total = total + freq
	    if n <= total: return symbol

    def prob(self, symbol):
        return float(self.freqs[symbol])/float(self.count)


class Brain:
    split_re = re.compile('(\W+)')
    whitespace_re = re.compile('\s+')

    def __init__(self):
        self.markov = {}
 
    def parse(self, sentence):
        """Convert all whitespace to a single space, split up the sentence into alternating words and non-words, and remove any empty symbols."""
        return map(intern, filter(lambda s: s, self.split_re.split(self.whitespace_re.sub(' ', sentence.strip()).lower())))

    def train(self, sentence):
	# Split up the sentence and add sentinels at the beginning and end
        sentence = [None, None, None, None, None] + self.parse(sentence) + [None, None, None, None]
	for i in range(len(sentence) - 5):
	    symbol, sym1, sym2, sym3, sym4, sym5 = sentence[i:i+6]

            key = sym1, sym2, sym3, sym4
	    try: leaders, followers = self.markov[key]
	    except KeyError: 
	        leaders = Entry()
		followers = Entry()
		self.markov[key] = (leaders, followers)

	    leaders.add_symbol(symbol)
	    followers.add_symbol(sym5)

    def train_from_file(self, file):
        if type(file) == type(''): file = open(file)
	line = file.readline()
	while line:
	    if line[0] != '#': self.train(line)
	    line = file.readline()

    def generate_sentence(self, sentence):
        """Destructively modifies the seed!"""
	while sentence[-1]: 
	    symbol = self.markov[tuple(sentence[-4:])][1].choose()
	    sentence.append(symbol)
	
	# Generate the beginning of the sentence.
	while sentence[0]:
	    symbol = self.markov[tuple(sentence[:4])][0].choose()
	    sentence = [symbol] + sentence

        # Strip sentinels
	while not sentence[0]: del sentence[0] 
	while not sentence[-1]: del sentence[-1]

	return sentence
        
    def reply(self, sentence):
        keywords = []
	for k in self.parse(sentence):
	    if k[0] in string.letters and k not in \
	        STOPLIST:
		try: k = SWAPLIST[k]
		except KeyError: pass
		if k not in keywords: keywords.append(k)
	    
        keys = self.markov.keys()
	candidates = []
	for keyword in keywords:
	    seeds = filter(lambda s, k=keyword: s[3] == k, keys)
	    if seeds:
	        # generate 10 replies per keyword
	        for i in range(10):
	            candidate = self.generate_sentence(list(whrandom.choice(seeds)))
		    # Calculate the probability of each keywords' appearing
		    # in its position
		    # This is a total hack.
		    total_logprob = 0
		    num_keywords = 0
		    key = [None, None, None, None]
		    for symbol in candidate:
		        if symbol in keywords:
			    total_logprob = total_logprob - \
			        log(self.markov[tuple(key)][1].prob(symbol))
			    num_keywords = num_keywords + 1

			del key[0]
			key.append(symbol)

		    # Insert them in random order.
		    candidates.insert(whrandom.randint(0, len(candidates)), (total_logprob/num_keywords, candidate))


	if not candidates:
	    return 0.0, ''.join(self.generate_sentence(list(whrandom.choice(keys)))).capitalize()

	candidates.sort(lambda a, b: cmp(a[0], b[0]))
	avg_logprob, sentence = candidates[-1]
	return avg_logprob, ''.join(sentence).capitalize()
        
    def save(self, file):
        if type(file) == type(''): file = open(file, 'w')
	dump(self, file, 1)


def load_brain(file):
    if type(file) == type(''): file = open(file, 'r')
    return load(file)


def main(argv):
    import microhal
    b = microhal.Brain()
    print >> sys.stderr, 'Training from %s...' % argv[1]
    b.train_from_file(argv[1])
    print >> sys.stderr, 'Saving to %s...' % argv[2]
    b.save(argv[2])
    print >> sys.stderr, 'Done.'

if __name__ == '__main__': main(sys.argv) 

