blob: 28433c740c4ac5ee7d1038f3b400cd4a99296aec [file] [log] [blame]
Fred Drake30a68c71998-11-23 16:59:39 +00001#! /usr/bin/env python
2
"""Generate ESIS events based on a LaTeX source document and
configuration data.

The conversion is not strong enough to work with arbitrary LaTeX
documents; it has only been designed to work with the highly stylized
markup used in the standard Python documentation.  A lot of
information about specific markup is encoded in the control table
passed to the convert() function; changing this table can allow this
tool to support additional LaTeX markups.

The format of the table is largely undocumented; see the TableParser
class for the elements and attributes that are recognized.  main()
always loads the table from the file conversion.xml in the directory
named by sys.path[0]; there is no command-line option to select an
alternate table.
"""
Fred Drake30a68c71998-11-23 16:59:39 +000017
18import errno
Fred Drake96e4a061999-07-29 22:22:13 +000019import getopt
20import os
Fred Drake30a68c71998-11-23 16:59:39 +000021import re
22import string
Fred Drake30a68c71998-11-23 16:59:39 +000023import sys
Fred Drake96e4a061999-07-29 22:22:13 +000024import UserList
Fred Drake691a5a72000-11-22 17:56:43 +000025import xml.sax.saxutils
Fred Drake30a68c71998-11-23 16:59:39 +000026
Fred Drake54fb7fb1999-05-10 19:36:03 +000027from types import ListType, StringType, TupleType
Fred Drakeaeea9811998-12-01 19:04:12 +000028
Fred Drake96e4a061999-07-29 22:22:13 +000029try:
30 from xml.parsers.xmllib import XMLParser
31except ImportError:
32 from xmllib import XMLParser
33
Fred Drake30a68c71998-11-23 16:59:39 +000034
Fred Drake2262a802001-03-23 16:53:34 +000035from esistools import encode
36
37
# Debug level; 0 disables debugging.  main() adds one for each -D/--debug
# option.  Any non-zero value makes dbgmsg() write to stderr and makes
# new_stack() return a type-checking _Stack instead of a plain list.
DEBUG = 0
39
40
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be converted."""
43
44
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open elements.

    Attributes:
        found -- the environment name that was being closed
        stack -- a copy of the element stack at the time of the error
    """

    def __init__(self, found, stack):
        description = (
            "environment close for %s doesn't match;\n stack = %s"
            % (found, stack))
        self.found = found
        # Store a snapshot so later changes to the caller's stack do not
        # alter what the exception reports.
        self.stack = stack[:]
        LaTeXFormatError.__init__(self, description)
52
53
# All of these patterns are applied with .match(), i.e. anchored at the
# start of the text that remains to be converted.

# \begin{name} and \end{name}; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A macro: backslash, letters and an optional "*" (group 1), an optional
# single space, then either an opening brace or trailing whitespace.
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# A comment: one or more "%", an optional space, the rest of the line
# (group 1), the newline, and any spaces/tabs that start the next line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text: anything except "]", "~", "%", "\", "{" and "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# An optional [...] argument, possibly preceded by whitespace; group 1 is
# the text between the brackets (which cannot contain "]").
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# Attribute values matching this are written as TOKEN rather than CDATA
# (see Conversion.dump_attr()).
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Spaces/newlines followed by "{" or "[" respectively.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters that Conversion.subconvert() emits as literal data when they
# directly follow a backslash.
ESCAPED_CHARS = "$%#^ {}&~"
Fred Drake30a68c71998-11-23 16:59:39 +000069
70
def dbgmsg(msg):
    """Write msg and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
74
def pushing(name, point, depth):
    """Emit a debug message that element `name` is being pushed at `point`.

    `depth` is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
Fred Draked7acf021999-01-14 17:38:12 +000077
def popping(name, point, depth):
    """Emit a debug message that element `name` is being popped at `point`.

    `depth` is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
Fred Draked7acf021999-01-14 17:38:12 +000080
81
class _Stack(UserList.UserList):
    """Element stack that only accepts strings.

    new_stack() returns one of these in place of a plain list when DEBUG
    is set, so that pushing anything but a string is reported at once.
    """

    def append(self, entry):
        """Push `entry`; raise LaTeXFormatError if it is not a string."""
        if not isinstance(entry, StringType):
            raise LaTeXFormatError("cannot push non-string on stack: "
                                   + repr(entry))
        self.data.append(entry)

    def pop(self, index=-1):
        """Remove and return the entry at `index` (the top by default).

        The entry must be returned: callers write the popped name into
        the output, exactly as they do with a plain list.
        """
        return self.data.pop(index)

    def __delitem__(self, index):
        """Remove the entry at `index` without returning it."""
        del self.data[index]
Fred Drake96e4a061999-07-29 22:22:13 +000099
100
def new_stack():
    """Return an empty stack: a _Stack when DEBUG is set, else a list."""
    if not DEBUG:
        return []
    return _Stack()
105
106
class Conversion:
    """Convert LaTeX source text into a stream of ESIS events.

    The whole input is read when the instance is created; convert() then
    walks the text and writes the events through ofp.write().  `table`
    maps macro and environment names to TableEntry objects; names that
    are missing from the table get a default entry, which is stored back
    into the table.
    """

    # This is a really limited table of combinations, but it will have
    # to do for now.  Keys are (accent macro, base character); values are
    # the ordinal written by combining_char().
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def __init__(self, ifp, ofp, table):
        """Read all of `ifp`; remember the output file and the table.

        Trailing whitespace is stripped from every input line and the
        lines are re-joined with newlines, so non-empty input always ends
        with a newline.
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        L = [s.rstrip() for s in ifp.readlines()]
        L.append("")
        self.line = "\n".join(L)
        self.preamble = 1

    def convert(self):
        """Convert the complete input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert the text in self.line, including sub-structures.

        Parsing continues until the character `endchar` is found (with
        no open structures), or until the end of the input data if
        `endchar` is None.

        Returns the remaining text, beginning with `endchar` (it is also
        stored in self.line), or None when the input ran out first.
        `depth` is only passed along to recursive calls.

        Raises LaTeXStackError for a mismatched \\end{...} and
        LaTeXFormatError for other markup that cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its effects only: it registers a default
                # entry for an unknown environment and rejects names
                # that are defined as macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                # Implicitly close elements this environment's end is
                # allowed to close.
                while (stack and envname != stack[-1]
                       and stack[-1] in entry.endcloses):
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    if endpos >= len(line):
                        raise LaTeXFormatError(
                            "missing character after \\c")
                    self.combining_char("c", line[endpos])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end is
                    # copied through as data without being parsed.
                    endtag = "\\end{%s}" % macroname
                    pos = line.find(endtag)
                    if pos < 0:
                        # Without this check the -1 from find() would be
                        # used as a slice index and corrupt the output.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment" % endtag)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endtag):]
                    continue
                # Implicitly close open elements that this macro closes.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname and entry.empty:
                    self.write("e\n")
                #
                params, _, empty = self.start_macro(macroname)
                # rip off the macroname; when there are parameters (or
                # the macro is empty) whatever followed the name is left
                # in place for the code below / the next iteration.
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Drop the character that opens the argument
                            # and convert up to the closing brace.
                            self.line = skip_white(line)[1:]
                            rest = self.subconvert(
                                "}", len(stack) + depth + 1)
                            if rest is None:
                                # subconvert() ran out of input.
                                raise LaTeXFormatError(
                                    "input ended inside the %s argument of %s"
                                    % (pentry.name, macroname))
                            line = rest[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # The slice also covers an empty line.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    # With implied content the element stays open until
                    # something closes it later.
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced '}': %s" % repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # A bare group; the empty name marks it on the stack.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                # Leave the underscore for the plain-text case below.
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...

    def combining_char(self, prefix, char):
        """Write the character for accent macro `prefix` applied to `char`.

        Raises LaTeXFormatError when the pair is not in _combinations.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s followed by %s"
                % (prefix, repr(char)))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty) for the macro `name`.

        `optional` is true when the first parameter is optional.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty

    def get_entry(self, name):
        """Return the table entry for `name`.

        An unknown name gets a default macro entry with one content
        parameter; the new entry is added to the table.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%s) failing; building default entry!"
                   % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for the environment `name`.

        An unknown name gets a default environment entry with implied
        content; the new entry is added to the table.  Raises
        LaTeXFormatError if `name` is defined as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for `pentry` with the given `value`.

        Nothing is written when the parameter has no name or the value
        is empty.  The declared type is TOKEN when the value matches
        _token_rx and CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
402
403
def convert(ifp, ofp, table):
    """Convert the LaTeX document read from `ifp`, writing ESIS to `ofp`.

    `table` is the conversion table (see load_table()).  An IOError
    whose errno is EPIPE is ignored; every other IOError is re-raised.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Look at the errno attribute instead of unpacking the exception
        # as a 2-tuple, which fails when it carries a different number of
        # arguments.
        err = sys.exc_info()[1]
        if err.errno != errno.EPIPE:
            raise
411
412
def skip_white(line):
    """Return `line` without its leading whitespace.

    A leading "%" is skipped like whitespace; only the character itself
    is removed, not the text that follows it.
    """
    skippable = " %\n\t\r"
    while True:
        if not line or line[0] not in skippable:
            return line
        # Drop the matched character, then any whitespace right after it.
        line = line[1:].lstrip()
417
418
Fred Drake96e4a061999-07-29 22:22:13 +0000419
class TableEntry:
    """Conversion rule for one LaTeX macro or environment."""

    def __init__(self, name, environment=0):
        # The output element name defaults to the LaTeX name.
        self.name = self.outputname = name
        self.environment = environment
        # Macros start out empty; environments never do.
        self.empty = not environment
        # Flags, all off by default.
        self.has_content = self.verbatim = self.auto_close = 0
        # Parameter objects, in the order they are processed.
        self.parameters = []
        # Names of open elements closed implicitly when this macro starts
        # (closes) or when this environment ends (endcloses).
        self.closes, self.endcloses = [], []
431 self.endcloses = []
432
class Parameter:
    """One item in the parameter list of a TableEntry.

    `type` is a string such as "attribute", "child", "content", "text"
    or "entityref"; `name` and `optional` are stored as given.
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        # Fixed text associated with the parameter, if any.
        self.text = ''
        # Set for content that is not delimited by braces.
        self.implied = 0
439 self.implied = 0
440
441
class TableParser(XMLParser):
    """Build a conversion table from its XML description.

    <environment> and <macro> elements each produce a TableEntry; the
    <attribute>, <entityref>, <child>, <content> and <text> elements
    inside them add Parameter objects to the entry being built.
    """

    def __init__(self, table=None):
        """Start with `table` (a dictionary), or an empty one if omitted."""
        if table is None:
            table = {}
        self.__table = table
        self.__current = None       # entry currently being built
        self.__buffer = ''          # character data collected so far
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, giving content-less environments implied content."""
        for entry in self.__table.values():
            if not entry.environment or entry.has_content:
                continue
            implied = Parameter("content")
            implied.implied = 1
            entry.parameters.append(implied)
            entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        entry = self.__current = TableEntry(attrs["name"], environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        if "outputname" in attrs:
            entry.outputname = attrs.get("outputname")
        entry.endcloses = attrs.get("endcloses", "").split()

    def end_environment(self):
        self.end_macro()

    def start_macro(self, attrs):
        entry = self.__current = TableEntry(attrs["name"])
        entry.closes = attrs.get("closes", "").split()
        if "outputname" in attrs:
            entry.outputname = attrs.get("outputname")

    def end_macro(self):
        # File the finished entry under its LaTeX name.
        finished = self.__current
        self.__table[finished.name] = finished
        self.__current = None

    def start_attribute(self, attrs):
        optional = attrs.get("optional") == "yes"
        # A missing or empty name leaves the parameter unnamed.
        param = Parameter("attribute", attrs.get("name") or None,
                          optional=optional)
        self.__current.parameters.append(param)
        self.__buffer = ''

    def end_attribute(self):
        # The element's character data becomes the attribute's fixed value.
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        self.__current.parameters.append(
            Parameter("entityref", attrs["name"]))

    def start_child(self, attrs):
        entry = self.__current
        entry.parameters.append(
            Parameter("child", attrs["name"], attrs.get("optional") == "yes"))
        entry.empty = 0

    def start_content(self, attrs):
        entry = self.__current
        content = Parameter("content")
        if entry.environment:
            # Environment content is always implied.
            content.implied = 1
        else:
            content.implied = attrs.get("implied") == "yes"
        entry.parameters.append(content)
        entry.has_content = 1
        entry.empty = 0

    def start_text(self, attrs):
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        literal = Parameter("text")
        literal.text = self.__buffer
        self.__current.parameters.append(literal)

    def handle_data(self, data):
        self.__buffer += data
522
523
def load_table(fp, table=None):
    """Parse the XML table description read from `fp`; return the table.

    Entries are added to `table` when one is given, otherwise to a new
    dictionary.
    """
    loader = TableParser(table=table)
    document = fp.read()
    loader.feed(document)
    loader.close()
    return loader.get_table()
529
530
def usage():
    """Write a one-line usage summary to standard error."""
    sys.stderr.write("usage: %s [-D|--debug] [infile [outfile]]\n"
                     % os.path.basename(sys.argv[0]))


def main():
    """Command-line entry point.

    Each -D/--debug option raises the DEBUG level by one.  With no
    arguments the document is read from stdin and written to stdout; one
    argument names the input file; two arguments name the input and
    output files.  More than two arguments print the usage message and
    exit with status 2.  The conversion table is read from
    conversion.xml in the directory named by sys.path[0].
    """
    global DEBUG
    #
    opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) == 0:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(args) == 1:
        # open() needs the file name itself, not the argument list.
        ifp = open(args[0])
        ofp = sys.stdout
    elif len(args) == 2:
        ifp = open(args[0])
        ofp = open(args[1], "w")
    else:
        usage()
        sys.exit(2)

    table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
    try:
        table = load_table(table_fp)
    finally:
        table_fp.close()
    convert(ifp, ofp, table)
Fred Drake30a68c71998-11-23 16:59:39 +0000553
554
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()