-
Notifications
You must be signed in to change notification settings - Fork 0
/
markov.py
executable file
·200 lines (151 loc) · 6.94 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python
#################################################################################
#
# Author: Morgen Peschke
# Date: Dec 15th, 2013
#
# Description: This is a random text generator that uses the Markov Chaining
# technique to create text with passing similarity to valid English
# text. This is also an exploration of the idea of graceful
# degradation, by attempting to provide a match of the requested
# order, but returning matches of lower orders when that is not
# possible. In this manner the algorithm attempts to recover from
# an otherwise fatal dead end chain.
#
# Usage: ./markov.py <file 1> <file 2> ... <file n>
#
# The script will read in and will attempt to create a string of random
# words of the specified length and order. Customizing the default
# behavior is handled in the constructor. For more control the script
# can be imported an used as a library.
#
# Acknowledgments: Although the code has massively diverged from the sample
# code, and has been basically been rebuild from scratch, the
# genesis of this script came from the sample code provided at
# http://xkcd.com/markov.py.txt as part of the XKCD "What if?"
# titled "Phone Keypad" [http://what-if.xkcd.com/75/].
#
# The article provided the impetus for the creation of this
# script, and as such, I would be remiss to neglect giving
# credit where it is due.
#
import random
import sys
import copy
class markov_text_generator:
def __init__ (self, **kwargs):
self.min_sentence_length = kwargs.get ('length') or 10
self.order = kwargs.get ('order') or 3
self.source_file = kwargs.get ('file')
self.dictionaries = None
if self.order < 2: self.order = 2
if self.source_file:
self.gen_dictionaries ()
# generator to read file as words
@staticmethod
def read_words (fileobj):
for line in fileobj:
for word in line.split():
yield word
# set input file
def set_file (self, file_path):
self.source_file = file_path
self.dictionaries = None
return self
# clears out the old dictionaries and sets up for a new generation run
def _clear_dictionaries (self):
self.dictionaries = []
# Load the dictionaries list with an empty dict for each order 2 and
# above
for i in range (1, self.order):
self.dictionaries.append ({})
return self
# generate the dictionaries
def _gen_dictionaries (self):
with open (self.source_file) as fh:
# set up required to read the file word-wise
words = markov_text_generator.read_words (fh)
self._clear_dictionaries()
# build a list of the same length as the order + 1, with 'None' as
# the 0 index value
tmp = [None]
for i in range (self.order):
tmp.append (words.next())
# read until the end of the file and add each set to the appropriate
# dictionary
for w in words:
tmp = tmp[1:] + [ w ]
# build and add the sequences
for i in range (1, self.order):
key = tuple (tmp[:i + 1])
if not self.dictionaries[i - 2].has_key (key):
self.dictionaries[i - 2][key] = []
self.dictionaries[i - 2][key].append(tmp[i + 1])
# finish processing the last few words
while len (tmp) >= self.order:
tmp = tmp[1:]
# build and add the sequences
for i in range (1, len (tmp) - 1):
key = tuple (tmp[:i + 1])
if not self.dictionaries[i - 2].has_key (key):
self.dictionaries[i - 2][key] = []
self.dictionaries[i - 2][key].append (tmp[i + 1])
# remove empty orders
self.dictionaries = [ x for x in self.dictionaries if len (x) ]
return self
# Generate a candidate sentence
def _make_sentence (self):
# Cache the highest order
max_order = len (self.dictionaries) - 1
# Set the initial candidate to a random key in the highest order
candidate = list (random.choice (self.dictionaries[-1].keys()))
# Start looking at the highest order
cur_order = max_order
# Generate the longest possible sentence
while True:
# The length of the current key is needed to generate a valid key
# from the current candidate sentence
key_len = len(self.dictionaries[cur_order].keys()[0])
try:
options = self.dictionaries[cur_order][tuple(candidate[-key_len:])]
except KeyError:
return candidate
options.sort (key = lambda x: len(x) + random.random () * 5.0,
reverse = True)
# Try for the best possible option
found = False
for o in options:
# A best option:
# - is part of another key (not a dead end, higher order is better)
# - is not already in the candidate sentence (avoid endless
# loops)
new_key = tuple(candidate[-(key_len - 1):] + [o])
#print " new_key: %s" % (str(new_key))
if (self.dictionaries[cur_order].has_key (new_key)
and o not in candidate):
candidate.append(o)
cur_order = max_order
found = True
break
# If a best option is not found and we've bottomed out the potential
# Markov orders, just add anything and terminate the chain
if not found and cur_order == 0:
candidate.append (random.choice (options))
return candidate
# If a best option is not found at this level, try at a lower level
elif not found:
cur_order -= 1
return candidate
# Attempt to generate a sentence of a given length by generating random length
# sentences until one of the appropriate length pops out
def generate (self):
if self.dictionaries == None:
self._gen_dictionaries()
while (True):
candidate = self._make_sentence()
if len(candidate) >= self.min_sentence_length:
return ' '.join(candidate)
if __name__ == '__main__':
gen = markov_text_generator ()
for i in sys.argv[1:]:
print gen.set_file (i).generate()