Guido van Rossum | 5dd52d3 | 1995-04-10 11:47:11 +0000 | [diff] [blame] | 1 | # Tools for info file processing. |
| 2 | |
| 3 | # XXX Need to be more careful with reading ahead searching for nodes. |
| 4 | |
| 5 | |
| 6 | import regexp |
| 7 | import string |
| 8 | |
| 9 | |
# Exported exceptions.
# These are plain strings used as exception objects:
# try_open() raises NoSuchFile with the file name as its argument, and
# find_node() raises NoSuchNode with the (lowercased) node name.
#
NoSuchFile = 'no such file'
NoSuchNode = 'no such node'
| 14 | |
| 15 | |
# The search path for info files; this is site-specific.
# Every directory name must end in a pathname delimiter, because
# try_open() simply concatenates it with the (relative) file name.
# The empty string entry makes try_open() try the name exactly as given.
#
#INFOPATH = ['', ':Info.Ibrowse:', ':Info:'] # Mac
INFOPATH = ['', '/usr/local/emacs/info/'] # X11 on UNIX
| 22 | |
| 23 | |
# Tunable constants.
# They are used as sizes for f.read() and in the seek offset arithmetic
# of find_node(), read_node(), backup_node() and make_tags().
#
BLOCKSIZE = 512 # Qty to align reads to, if possible
FUZZ = 2*BLOCKSIZE # Qty to back-up before searching for a node
CHUNKSIZE = 4*BLOCKSIZE # Qty to read at once when reading lots of data
| 29 | |
| 30 | |
# Regular expressions used.
# Note that it is essential that Python leaves unrecognized backslash
# escapes in a string so they can be seen by regexp.compile!
#
# Each name below is the bound match method of a compiled pattern.
# Throughout this file they are called as matcher(text) or
# matcher(text, start); a false result means "not found", otherwise the
# result is unpacked as a sequence of (start, end) index pairs into text:
# first the pair for the whole match, then one pair per parenthesized
# group.  (Presumably the call scans forward from start rather than
# matching only at that position -- verify against the regexp module.)
#
# Start of a node: escape char, optional formfeed, newline, header line
findheader = regexp.compile('\037\014?\n(.*\n)').match
# The escape char (octal 037) on its own
findescape = regexp.compile('\037').match
# The node name inside a header line (same pattern as findnode)
parseheader = regexp.compile('[nN]ode:[ \t]*([^\t,\n]*)').match
# A line up to and including its newline
findfirstline = regexp.compile('^.*\n').match
# The Node, Prev(ious), Next and Up fields; group 1 is the field value
findnode = regexp.compile('[nN]ode:[ \t]*([^\t,\n]*)').match
findprev = regexp.compile('[pP]rev[ious]*:[ \t]*([^\t,\n]*)').match
findnext = regexp.compile('[nN]ext:[ \t]*([^\t,\n]*)').match
findup = regexp.compile('[uU]p:[ \t]*([^\t,\n]*)').match
# The line that introduces a menu
findmenu = regexp.compile('^\* [mM]enu:').match
# A menu item; group 1 is the topic, group 2 the reference
# (a lone ':' for group 2 is replaced by the topic in analyze_node())
findmenuitem = regexp.compile( \
    '^\* ([^:]+):[ \t]*(:|\([^\t]*\)[^\t,\n.]*|[^:(][^\t,\n.]*)').match
# A "*Note" cross reference; groups as for findmenuitem
findfootnote = regexp.compile( \
    '\*[nN]ote ([^:]+):[ \t]*(:|[^:][^\t,\n.]*)').match
# A reference of the form "(FILE)NODE"; group 1 is FILE, group 2 is NODE
parsenoderef = regexp.compile('^\((.*)\)(.*)$').match
| 49 | |
| 50 | |
| 51 | # Get a node and all information pertaining to it. |
| 52 | # This doesn't work if there is an indirect tag table, |
| 53 | # and in general you are better off using icache.get_node() instead. |
| 54 | # Functions get_whole_file() and get_file_node() provide part |
| 55 | # functionality used by icache. |
| 56 | # Raise NoSuchFile or NoSuchNode as appropriate. |
| 57 | # |
def get_node(curfile, ref):
    # Resolve the reference relative to curfile, then fetch either the
    # named node or (for the special node name '*') the entire file.
    # Returns the tuple (file, node, header, menu, footnotes, text).
    fname, nodename = parse_ref(curfile, ref)
    if node_is_whole_file(nodename):
        return get_whole_file(fname)
    return get_file_node(fname, 0, nodename)
#
def node_is_whole_file(nodename):
    # '*' means: treat the whole file as a single node
    return nodename == '*'
| 64 | # |
def get_whole_file(file):
    # Return the complete contents of an info file as one pseudo node.
    # The result has the same shape as that of get_file_node():
    # (file, node, header, menu, footnotes, text), with node '*',
    # an empty header triple and no menu or footnotes.
    # Raise NoSuchFile if the file can't be opened.
    f = try_open(file) # May raise NoSuchFile
    try:
        text = f.read()
    finally:
        # Don't leave the file open, whether or not the read succeeded
        f.close()
    header, menu, footnotes = ('', '', ''), [], []
    return file, '*', header, menu, footnotes, text
| 70 | # |
def get_file_node(file, offset, node):
    # Fetch one node from an info file and analyze it.
    # The offset is a hint for the node's position (pass 0 if unknown).
    # Return (file, node, header, menu, footnotes, text) where node is
    # the name found in the node's header and header is (prev, next, up).
    # Raise NoSuchFile or NoSuchNode as appropriate.
    f = try_open(file) # May raise NoSuchFile
    try:
        text = find_node(f, offset, node) # May raise NoSuchNode
    finally:
        # Close the file even when the node isn't found
        f.close()
    node, header, menu, footnotes = analyze_node(text)
    return file, node, header, menu, footnotes, text
| 76 | |
| 77 | |
| 78 | # Parse a node reference into a file (possibly default) and node name. |
| 79 | # Possible reference formats are: "NODE", "(FILE)", "(FILE)NODE". |
| 80 | # Default file is the curfile argument; default node is Top. |
| 81 | # A node value of '*' is a special case: the whole file should |
| 82 | # be interpreted (by the caller!) as a single node. |
| 83 | # |
def parse_ref(curfile, ref):
    # Split ref into (file, node).  A ref of the form "(FILE)NODE"
    # supplies both parts; anything else is taken as a node name in
    # curfile.  An empty file part falls back to curfile, an empty
    # node part to 'Top'.
    regs = parsenoderef(ref)
    if regs:
        (a, b), (f_lo, f_hi), (n_lo, n_hi) = regs
        fname = ref[f_lo:f_hi]
        nodename = ref[n_lo:n_hi]
    else:
        fname = curfile
        nodename = ref
    if not fname:
        fname = curfile
    if not nodename:
        nodename = 'Top'
    return fname, nodename
| 96 | |
| 97 | |
| 98 | # Extract node name, links, menu and footnotes from the node text. |
| 99 | # |
def analyze_node(text):
    # Return (node, (prev, next, up), menu, footnotes) for a node's text.
    # menu and footnotes are lists of (topic, ref) pairs; header fields
    # that are absent come back as empty strings.
    #
    # Get node name and links from the header line only, so that text
    # such as "Next:" or "Up:" in the node's body is never mistaken
    # for a header link.
    #
    match = findfirstline(text)
    if match:
        (a, b) = match[0]
        line = text[a:b]
    else:
        # No complete first line found: the text is at most a single
        # line (trailing newlines may have been stripped), so all of
        # it is the header.
        line = text
    node = get_it(line, findnode)
    prev = get_it(line, findprev)
    next = get_it(line, findnext)
    up = get_it(line, findup)
    #
    # Get the menu items, if there is a menu
    #
    menu = []
    match = findmenu(text)
    if match:
        (a, b) = match[0]
        # Items are looked for after the "* Menu:" marker only
        menu = _collect_refs(text, findmenuitem, b)
    #
    # Get the footnotes (searched for in the entire text)
    #
    footnotes = _collect_refs(text, findfootnote, 0)
    #
    return node, (prev, next, up), menu, footnotes
#
# Collect (topic, ref) pairs for successive matches (subroutine for
# analyze_node()).  The matcher must yield three index pairs: the whole
# match, the topic and the reference.
#
def _collect_refs(text, matcher, pos):
    refs = []
    while 1:
        match = matcher(text, pos)
        if not match:
            break
        # Continue the next search where this match ended
        (a, pos), (a1, b1), (a2, b2) = match
        topic, ref = text[a1:b1], text[a2:b2]
        if ref == ':':
            # "Topic::" means the reference is the topic itself
            ref = topic
        refs.append((topic, ref))
    return refs
| 146 | # |
def get_it(line, matcher):
    # Apply matcher to line and return the text of its first group,
    # or an empty string if the matcher finds nothing.
    regs = matcher(line)
    if regs:
        (whole_lo, whole_hi), (lo, hi) = regs
        return line[lo:hi]
    return ''
| 154 | |
| 155 | |
| 156 | # Find a node in an open file. |
| 157 | # The offset (from the tags table) is a hint about the node's position. |
| 158 | # Pass zero if there is no tags table. |
| 159 | # Raise NoSuchNode if the node isn't found. |
| 160 | # NB: This seeks around in the file. |
| 161 | # |
| 162 | def find_node(f, offset, node): |
| 163 | node = string.lower(node) # Just to be sure |
| 164 | # |
| 165 | # Position a little before the given offset, |
| 166 | # so we may find the node even if it has moved around |
| 167 | # in the file a little. |
| 168 | # |
| 169 | offset = max(0, ((offset-FUZZ) / BLOCKSIZE) * BLOCKSIZE) |
| 170 | f.seek(offset) |
| 171 | # |
| 172 | # Loop, hunting for a matching node header. |
| 173 | # |
| 174 | while 1: |
| 175 | buf = f.read(CHUNKSIZE) |
| 176 | if not buf: |
| 177 | break |
| 178 | i = 0 |
| 179 | while 1: |
| 180 | match = findheader(buf, i) |
| 181 | if match: |
| 182 | (a,b), (a1,b1) = match |
| 183 | start = a1 |
| 184 | line = buf[a1:b1] |
| 185 | i = b |
| 186 | match = parseheader(line) |
| 187 | if match: |
| 188 | (a,b), (a1,b1) = match |
| 189 | key = string.lower(line[a1:b1]) |
| 190 | if key == node: |
| 191 | # Got it! Now read the rest. |
| 192 | return read_node(f, buf[start:]) |
| 193 | elif findescape(buf, i): |
| 194 | next = f.read(CHUNKSIZE) |
| 195 | if not next: |
| 196 | break |
| 197 | buf = buf + next |
| 198 | else: |
| 199 | break |
| 200 | # |
| 201 | # If we get here, we didn't find it. Too bad. |
| 202 | # |
| 203 | raise NoSuchNode, node |
| 204 | |
| 205 | |
| 206 | # Finish off getting a node (subroutine for find_node()). |
| 207 | # The node begins at the start of buf and may end in buf; |
| 208 | # if it doesn't end there, read additional data from f. |
| 209 | # |
def read_node(f, buf):
    # Return the node's text: buf, extended with data read from f as
    # needed, cut off at the next escape character (or at end of file)
    # and with trailing newlines removed.
    scanned = 0 # Part of buf already known to contain no escape
    while 1:
        regs = findescape(buf, scanned)
        if regs:
            # The node ends where the escape begins
            (end, esc_hi) = regs[0]
            break
        more = f.read(CHUNKSIZE)
        if not more:
            # End of file: the node runs to the end of the data
            end = len(buf)
            break
        scanned = len(buf)
        buf = buf + more
    # Strip trailing newlines
    while end > 0 and buf[end-1] == '\n':
        end = end-1
    return buf[:end]
| 230 | |
| 231 | |
| 232 | # Read reverse starting at offset until the beginning of a node is found. |
| 233 | # Then return a buffer containing the beginning of the node, |
| 234 | # with f positioned just after the buffer. |
| 235 | # The buffer will contain at least the full header line of the node; |
| 236 | # the caller should finish off with read_node() if it is the right node. |
| 237 | # (It is also possible that the buffer extends beyond the node!) |
| 238 | # Return an empty string if there is no node before the given offset. |
| 239 | # |
def backup_node(f, offset):
    # The first window examined is [start, end): it ends at offset and
    # begins about CHUNKSIZE earlier, rounded down to a multiple of
    # BLOCKSIZE but never before the start of the file.
    start = max(0, ((offset-CHUNKSIZE) / BLOCKSIZE) * BLOCKSIZE)
    end = offset
    while start < end:
        f.seek(start)
        buf = f.read(end-start)
        i = 0
        hit = -1 # Start of the last header line found in buf, if any
        while 1:
            match = findheader(buf, i)
            if match:
                # Remember this header; keep looking for a later one
                (a,b), (a1,b1) = match
                hit = a1
                i = b
            elif end < offset and findescape(buf, i):
                # There is an escape but no complete header after it:
                # extend buf, but never read beyond offset
                next = f.read(min(offset-end, BLOCKSIZE))
                if not next:
                    break
                buf = buf + next
                end = end + len(next)
            else:
                break
        if hit >= 0:
            # Return from the last header found to the end of buf
            return buf[hit:]
        # No header in this window; try the CHUNKSIZE before it
        end = start
        start = max(0, end - CHUNKSIZE)
    return ''
| 267 | |
| 268 | |
| 269 | # Make a tag table for the given file by scanning the file. |
| 270 | # The file must be open for reading, and positioned at the beginning |
| 271 | # (or wherever the hunt for tags must begin; it is read till the end). |
| 272 | # |
| 273 | def make_tags(f): |
| 274 | tags = {} |
| 275 | while 1: |
| 276 | offset = f.tell() |
| 277 | buf = f.read(CHUNKSIZE) |
| 278 | if not buf: |
| 279 | break |
| 280 | i = 0 |
| 281 | while 1: |
| 282 | match = findheader(buf, i) |
| 283 | if match: |
| 284 | (a,b), (a1,b1) = match |
| 285 | start = offset+a1 |
| 286 | line = buf[a1:b1] |
| 287 | i = b |
| 288 | match = parseheader(line) |
| 289 | if match: |
| 290 | (a,b), (a1,b1) = match |
| 291 | key = string.lower(line[a1:b1]) |
| 292 | if tags.has_key(key): |
| 293 | print 'Duplicate node:', |
| 294 | print key |
| 295 | tags[key] = '', start, line |
| 296 | elif findescape(buf, i): |
| 297 | next = f.read(CHUNKSIZE) |
| 298 | if not next: |
| 299 | break |
| 300 | buf = buf + next |
| 301 | else: |
| 302 | break |
| 303 | return tags |
| 304 | |
| 305 | |
| 306 | # Try to open a file, return a file object if succeeds. |
| 307 | # Raise NoSuchFile if the file can't be opened. |
| 308 | # Should treat absolute pathnames special. |
| 309 | # |
| 310 | def try_open(file): |
| 311 | for dir in INFOPATH: |
| 312 | try: |
| 313 | return open(dir + file, 'r') |
| 314 | except IOError: |
| 315 | pass |
| 316 | raise NoSuchFile, file |
| 317 | |
| 318 | |
| 319 | # A little test for the speed of make_tags(). |
| 320 | # |
| 321 | TESTFILE = 'texinfo-1' |
| 322 | def test_make_tags(): |
| 323 | import time |
| 324 | f = try_open(TESTFILE) |
| 325 | t1 = time.time() |
| 326 | tags = make_tags(f) |
| 327 | t2 = time.time() |
| 328 | print 'Making tag table for', `TESTFILE`, 'took', t2-t1, 'sec.' |