blob: 7a0720fa7c34d845a43a3e2cbcd0ced8390347bd [file] [log] [blame]
Georg Brandl856898b2010-12-30 22:11:50 +00001#!/usr/bin/env python3
2
3"""
4Markov chain simulation of words or characters.
5"""
Guido van Rossum6930b3d1993-12-14 10:08:02 +00006
7class Markov:
Tim Peterse6ddc8b2004-07-18 05:56:09 +00008 def __init__(self, histsize, choice):
9 self.histsize = histsize
10 self.choice = choice
11 self.trans = {}
Georg Brandl2cbd09c2009-10-11 08:42:09 +000012
Tim Peterse6ddc8b2004-07-18 05:56:09 +000013 def add(self, state, next):
Georg Brandl2cbd09c2009-10-11 08:42:09 +000014 self.trans.setdefault(state, []).append(next)
15
Tim Peterse6ddc8b2004-07-18 05:56:09 +000016 def put(self, seq):
17 n = self.histsize
18 add = self.add
19 add(None, seq[:0])
20 for i in range(len(seq)):
21 add(seq[max(0, i-n):i], seq[i:i+1])
22 add(seq[len(seq)-n:], None)
Georg Brandl2cbd09c2009-10-11 08:42:09 +000023
Tim Peterse6ddc8b2004-07-18 05:56:09 +000024 def get(self):
25 choice = self.choice
26 trans = self.trans
27 n = self.histsize
28 seq = choice(trans[None])
Georg Brandl2cbd09c2009-10-11 08:42:09 +000029 while True:
Tim Peterse6ddc8b2004-07-18 05:56:09 +000030 subseq = seq[max(0, len(seq)-n):]
31 options = trans[subseq]
32 next = choice(options)
Georg Brandl2cbd09c2009-10-11 08:42:09 +000033 if not next:
34 break
35 seq += next
Tim Peterse6ddc8b2004-07-18 05:56:09 +000036 return seq
Guido van Rossum6930b3d1993-12-14 10:08:02 +000037
Georg Brandl2cbd09c2009-10-11 08:42:09 +000038
Guido van Rossum6930b3d1993-12-14 10:08:02 +000039def test():
Georg Brandl2cbd09c2009-10-11 08:42:09 +000040 import sys, random, getopt
Tim Peterse6ddc8b2004-07-18 05:56:09 +000041 args = sys.argv[1:]
42 try:
Georg Brandl2cbd09c2009-10-11 08:42:09 +000043 opts, args = getopt.getopt(args, '0123456789cdwq')
Tim Peterse6ddc8b2004-07-18 05:56:09 +000044 except getopt.error:
Georg Brandl2cbd09c2009-10-11 08:42:09 +000045 print('Usage: %s [-#] [-cddqw] [file] ...' % sys.argv[0])
Collin Winter6f2df4d2007-07-17 20:59:35 +000046 print('Options:')
47 print('-#: 1-digit history size (default 2)')
48 print('-c: characters (default)')
49 print('-w: words')
50 print('-d: more debugging output')
51 print('-q: no debugging output')
52 print('Input files (default stdin) are split in paragraphs')
53 print('separated blank lines and each paragraph is split')
54 print('in words by whitespace, then reconcatenated with')
55 print('exactly one space separating words.')
56 print('Output consists of paragraphs separated by blank')
57 print('lines, where lines are no longer than 72 characters.')
Georg Brandl2cbd09c2009-10-11 08:42:09 +000058 sys.exit(2)
Tim Peterse6ddc8b2004-07-18 05:56:09 +000059 histsize = 2
Georg Brandl2cbd09c2009-10-11 08:42:09 +000060 do_words = False
Tim Peterse6ddc8b2004-07-18 05:56:09 +000061 debug = 1
62 for o, a in opts:
Georg Brandl2cbd09c2009-10-11 08:42:09 +000063 if '-0' <= o <= '-9': histsize = int(o[1:])
64 if o == '-c': do_words = False
65 if o == '-d': debug += 1
Tim Peterse6ddc8b2004-07-18 05:56:09 +000066 if o == '-q': debug = 0
Georg Brandl2cbd09c2009-10-11 08:42:09 +000067 if o == '-w': do_words = True
68 if not args:
69 args = ['-']
70
Tim Peterse6ddc8b2004-07-18 05:56:09 +000071 m = Markov(histsize, random.choice)
72 try:
73 for filename in args:
74 if filename == '-':
75 f = sys.stdin
76 if f.isatty():
Collin Winter6f2df4d2007-07-17 20:59:35 +000077 print('Sorry, need stdin from file')
Tim Peterse6ddc8b2004-07-18 05:56:09 +000078 continue
79 else:
80 f = open(filename, 'r')
Collin Winter6f2df4d2007-07-17 20:59:35 +000081 if debug: print('processing', filename, '...')
Tim Peterse6ddc8b2004-07-18 05:56:09 +000082 text = f.read()
83 f.close()
Georg Brandl2cbd09c2009-10-11 08:42:09 +000084 paralist = text.split('\n\n')
Tim Peterse6ddc8b2004-07-18 05:56:09 +000085 for para in paralist:
Collin Winter6f2df4d2007-07-17 20:59:35 +000086 if debug > 1: print('feeding ...')
Georg Brandl2cbd09c2009-10-11 08:42:09 +000087 words = para.split()
Tim Peterse6ddc8b2004-07-18 05:56:09 +000088 if words:
Georg Brandl2cbd09c2009-10-11 08:42:09 +000089 if do_words:
90 data = tuple(words)
91 else:
92 data = ' '.join(words)
Tim Peterse6ddc8b2004-07-18 05:56:09 +000093 m.put(data)
94 except KeyboardInterrupt:
Collin Winter6f2df4d2007-07-17 20:59:35 +000095 print('Interrupted -- continue with data read so far')
Tim Peterse6ddc8b2004-07-18 05:56:09 +000096 if not m.trans:
Collin Winter6f2df4d2007-07-17 20:59:35 +000097 print('No valid input files')
Tim Peterse6ddc8b2004-07-18 05:56:09 +000098 return
Collin Winter6f2df4d2007-07-17 20:59:35 +000099 if debug: print('done.')
Georg Brandl2cbd09c2009-10-11 08:42:09 +0000100
Tim Peterse6ddc8b2004-07-18 05:56:09 +0000101 if debug > 1:
Skip Montanaro1e8ce582007-08-06 21:07:53 +0000102 for key in m.trans.keys():
Tim Peterse6ddc8b2004-07-18 05:56:09 +0000103 if key is None or len(key) < histsize:
Collin Winter6f2df4d2007-07-17 20:59:35 +0000104 print(repr(key), m.trans[key])
105 if histsize == 0: print(repr(''), m.trans[''])
106 print()
Georg Brandl2cbd09c2009-10-11 08:42:09 +0000107 while True:
Tim Peterse6ddc8b2004-07-18 05:56:09 +0000108 data = m.get()
Georg Brandl2cbd09c2009-10-11 08:42:09 +0000109 if do_words:
110 words = data
111 else:
112 words = data.split()
Tim Peterse6ddc8b2004-07-18 05:56:09 +0000113 n = 0
114 limit = 72
115 for w in words:
116 if n + len(w) > limit:
Collin Winter6f2df4d2007-07-17 20:59:35 +0000117 print()
Tim Peterse6ddc8b2004-07-18 05:56:09 +0000118 n = 0
Collin Winter6f2df4d2007-07-17 20:59:35 +0000119 print(w, end=' ')
Georg Brandl2cbd09c2009-10-11 08:42:09 +0000120 n += len(w) + 1
Collin Winter6f2df4d2007-07-17 20:59:35 +0000121 print()
122 print()
Guido van Rossum6930b3d1993-12-14 10:08:02 +0000123
Johannes Gijsbers7a8c43e2004-09-11 16:34:35 +0000124if __name__ == "__main__":
125 test()