#!/usr/bin/env python3

"""
Markov chain simulation of words or characters.
"""

class Markov:
    """Markov chain built from, and generating, sliceable sequences.

    Sequences may be strings (character chains) or tuples (e.g. word
    chains); any type works as long as its slices are hashable and
    support ``+=``.  The transition table ``trans`` maps a history slice
    of at most ``histsize`` items to the list of one-item slices observed
    after it.  Two sentinels are used: the key ``None`` maps to the empty
    sequence(s) a generated chain starts from, and the value ``None``
    marks the end of a sequence.
    """

    def __init__(self, histsize, choice):
        """Create an empty chain.

        histsize -- maximum number of preceding items used as the state.
        choice   -- callable picking one element from a non-empty list
                    (for example ``random.choice``).
        """
        self.histsize = histsize
        self.choice = choice
        self.trans = {}

    def add(self, state, next):
        """Record that *next* was observed immediately after *state*."""
        self.trans.setdefault(state, []).append(next)

    def put(self, seq):
        """Feed one complete training sequence into the transition table."""
        n = self.histsize
        add = self.add
        # seq[:0] is an empty sequence of the same type as seq; get()
        # starts from it so the generated result has the right type.
        add(None, seq[:0])
        for i in range(len(seq)):
            add(seq[max(0, i-n):i], seq[i:i+1])
        # Clamp the start index: for len(seq) < n a negative start would
        # slice from the end and store only a tail of seq, while get()
        # looks up the whole short sequence as its final state.
        add(seq[max(0, len(seq)-n):], None)

    def get(self):
        """Generate and return one random sequence.

        Raises KeyError if nothing has been fed in with put() yet.
        """
        choice = self.choice
        trans = self.trans
        n = self.histsize
        seq = choice(trans[None])
        while True:
            subseq = seq[max(0, len(seq)-n):]
            options = trans[subseq]
            step = choice(options)
            if not step:
                # None is the end-of-sequence marker recorded by put().
                break
            seq += step
        return seq


def test():
    """Command-line driver: train a Markov chain on text and emit output.

    Options are read from sys.argv (see the usage text printed on a bad
    option).  Each input file, or stdin for '-' or when no file is given,
    is split into paragraphs at blank lines; each paragraph is fed to the
    chain either as a tuple of words (-w) or as a single string with the
    words joined by one space (-c, the default).  Generated paragraphs are
    then printed forever, wrapped at 72 columns, until the process is
    interrupted.

    Exits with status 2 on an invalid option; returns early if no input
    could be read.
    """
    import getopt
    import random
    import sys

    args = sys.argv[1:]
    try:
        opts, args = getopt.getopt(args, '0123456789cdwq')
    except getopt.GetoptError:
        print('Usage: %s [-#] [-cddqw] [file] ...' % sys.argv[0])
        print('Options:')
        print('-#: 1-digit history size (default 2)')
        print('-c: characters (default)')
        print('-w: words')
        print('-d: more debugging output')
        print('-q: no debugging output')
        print('Input files (default stdin) are split in paragraphs')
        print('separated blank lines and each paragraph is split')
        print('in words by whitespace, then reconcatenated with')
        print('exactly one space separating words.')
        print('Output consists of paragraphs separated by blank')
        print('lines, where lines are no longer than 72 characters.')
        sys.exit(2)

    histsize = 2
    do_words = False
    debug = 1
    for o, _ in opts:
        if '-0' <= o <= '-9': histsize = int(o[1:])
        if o == '-c': do_words = False
        if o == '-d': debug += 1
        if o == '-q': debug = 0
        if o == '-w': do_words = True
    if not args:
        args = ['-']

    m = Markov(histsize, random.choice)
    try:
        for filename in args:
            if filename == '-' and sys.stdin.isatty():
                print('Sorry, need stdin from file')
                continue
            if debug:
                print('processing', filename, '...')
            try:
                if filename == '-':
                    # Do not close sys.stdin: it is not ours to close.
                    text = sys.stdin.read()
                else:
                    with open(filename, 'r') as f:
                        text = f.read()
            except OSError as err:
                # Skip unreadable inputs; if none can be read, the
                # 'No valid input files' check below reports it.
                print('%s: skipped (%s)' % (filename, err))
                continue
            for para in text.split('\n\n'):
                if debug > 1:
                    print('feeding ...')
                words = para.split()
                if words:
                    if do_words:
                        data = tuple(words)
                    else:
                        data = ' '.join(words)
                    m.put(data)
    except KeyboardInterrupt:
        print('Interrupted -- continue with data read so far')
    if not m.trans:
        print('No valid input files')
        return
    if debug:
        print('done.')

    if debug > 1:
        # Dump the transitions for the start state and for histories
        # shorter than histsize.
        for key in m.trans:
            if key is None or len(key) < histsize:
                print(repr(key), m.trans[key])
        if histsize == 0:
            # With no history the only state is the empty sequence, whose
            # type depends on the mode: a tuple for words, a str otherwise.
            empty = () if do_words else ''
            print(repr(empty), m.trans[empty])
        print()

    # Emit generated paragraphs until interrupted.
    limit = 72
    while True:
        data = m.get()
        if do_words:
            words = data
        else:
            words = data.split()
        n = 0
        for w in words:
            if n + len(w) > limit:
                print()
                n = 0
            print(w, end=' ')
            n += len(w) + 1
        print()
        print()

# Run the command-line demo only when executed as a script, not on import.
if __name__ == "__main__":
    test()