#! /usr/bin/env python

"""Generate ESIS events based on a LaTeX source document and
configuration data.

The conversion is not strong enough to work with arbitrary LaTeX
documents; it has only been designed to work with the highly stylized
markup used in the standard Python documentation.  A lot of
information about specific markup is encoded in the control table
passed to the convert() function; changing this table can allow this
tool to support additional LaTeX markups.

The format of the table is largely undocumented; see load_table() and
the TableParser class for the elements that are understood.  main()
always reads the table from the conversion.xml file found in
sys.path[0] (the script's directory when run as a script); there is no
command-line provision to load an alternate table from an external file.
"""
17__version__ = '$Revision$'
18
import errno
import getopt
import os
import re
import string
import sys
import UserList
import xml.sax.saxutils

from types import ListType, StringType, TupleType

try:
    from xml.parsers.xmllib import XMLParser
except ImportError:
    from xmllib import XMLParser
34
Fred Drake30a68c71998-11-23 16:59:39 +000035
# Debugging level; main() adds one for every -D/--debug option.  Any
# non-zero value makes dbgmsg() and Conversion.err_write() write to
# stderr and makes new_stack() return a tracing _Stack.
DEBUG = 0
37
38
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be converted."""
    pass
41
42
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the element stack.

    Attributes:
        found -- name of the environment whose close was encountered.
        stack -- copy of the element stack at the time of the error.
    """

    def __init__(self, found, stack):
        msg = "environment close for %s doesn't match;\n stack = %s" \
              % (found, stack)
        self.found = found
        # Copy the stack so later pushes/pops do not alter the report.
        self.stack = stack[:]
        LaTeXFormatError.__init__(self, msg)
50
def encode(s):
    """Return *s* prepared for use as the data of an output record.

    XML special characters are escaped with xml.sax.saxutils.escape(),
    and every newline is replaced by the two characters backslash-n
    followed by a real newline and a "-" that starts a new data record.
    """
    s = xml.sax.saxutils.escape(s)
    return s.replace("\n", "\\n\n-")
54
Fred Drake96e4a061999-07-29 22:22:13 +000055
Fred Drake30a68c71998-11-23 16:59:39 +000056_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
57_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
Fred Drake0eb7b2a1999-05-19 17:37:37 +000058_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
Fred Drake96c00b01999-05-07 19:59:02 +000059_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
Fred Drake691a5a72000-11-22 17:56:43 +000060_text_rx = re.compile(r"[^]~%\\{}]+")
Fred Drake30a68c71998-11-23 16:59:39 +000061_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
Fred Drakeaeea9811998-12-01 19:04:12 +000062# _parameter_rx is this complicated to allow {...} inside a parameter;
63# this is useful to match tabular layout specifications like {c|p{24pt}}
64_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
Fred Drake30a68c71998-11-23 16:59:39 +000065_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
66_start_group_rx = re.compile("[ \n]*{")
67_start_optional_rx = re.compile("[ \n]*[[]")
68
69
Fred Drake42f52981998-11-30 14:45:24 +000070ESCAPED_CHARS = "$%#^ {}&~"
Fred Drake30a68c71998-11-23 16:59:39 +000071
72
def dbgmsg(msg):
    """Write *msg* and a newline to stderr, but only when DEBUG is set."""
    if DEBUG:
        sys.stderr.write(msg + "\n")
76
def pushing(name, point, depth):
    """Emit a debug message that element *name* is pushed at *point*.

    *depth* is accepted but not used.
    """
    dbgmsg("pushing <%s> at %s" % (name, point))
Fred Draked7acf021999-01-14 17:38:12 +000079
def popping(name, point, depth):
    """Emit a debug message that element *name* is popped at *point*.

    *depth* is accepted but not used.
    """
    dbgmsg("popping </%s> at %s" % (name, point))
Fred Draked7acf021999-01-14 17:38:12 +000082
83
Fred Drake96e4a061999-07-29 22:22:13 +000084class _Stack(UserList.UserList):
Fred Drake96e4a061999-07-29 22:22:13 +000085 def append(self, entry):
Fred Drake4fbdf971999-08-02 14:35:25 +000086 if type(entry) is not StringType:
Fred Drake96e4a061999-07-29 22:22:13 +000087 raise LaTeXFormatError("cannot push non-string on stack: "
88 + `entry`)
89 sys.stderr.write("%s<%s>\n" % (" "*len(self.data), entry))
90 self.data.append(entry)
91
92 def pop(self, index=-1):
93 entry = self.data[index]
94 del self.data[index]
95 sys.stderr.write("%s</%s>\n" % (" "*len(self.data), entry))
96
97 def __delitem__(self, index):
98 entry = self.data[index]
99 del self.data[index]
100 sys.stderr.write("%s</%s>\n" % (" "*len(self.data), entry))
101
102
def new_stack():
    """Return an empty element stack.

    A tracing _Stack is returned when DEBUG is set; otherwise a plain
    list is used.
    """
    if DEBUG:
        return _Stack()
    return []
107
108
class Conversion:
    """Convert one LaTeX document into a stream of output events.

    The whole input is read when the object is created; convert() then
    writes one event per line to the output file: "(name" and ")name"
    open and close an element, "-text" carries character data,
    "Aname type value" is an attribute of the element opened next,
    "&name" is an entity reference and "e" precedes an element that is
    marked empty in the table.

    *table* maps macro and environment names to TableEntry objects;
    names that are not found get a default entry added on first use.
    """

    # This is a really limited table of combinations, but it will have
    # to do for now.  Keys are (accent macro, base character) pairs and
    # values are the ordinals that get written for them.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def __init__(self, ifp, ofp, table):
        """Read all of *ifp*; events will be written to *ofp*."""
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is removed from every line and the lines
        # are re-joined with single newlines; there is no final newline.
        self.line = "\n".join([text.rstrip() for text in ifp.readlines()])
        # Not referenced by any other method of this class.
        self.preamble = 1

    def write_ordinal(self, ordinal):
        """Write a data event holding the character reference *ordinal*."""
        self.write("-\\%%%d;\n" % ordinal)

    def err_write(self, msg):
        """Write str(msg) and a newline to stderr when DEBUG is set."""
        if DEBUG:
            sys.stderr.write(str(msg) + "\n")

    def convert(self):
        """Convert the complete input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input until *endchar* or the end of the data.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        The unconsumed input is returned and stored in self.line.  When
        the scan stopped at *endchar* the returned text still starts
        with that character; at the end of the input the result is the
        empty string.  *depth* is only handed on to nested calls.

        Raises LaTeXFormatError (or its subclass LaTeXStackError) when
        the markup cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its side effects: it registers a default
                # entry for unknown names and rejects macro names.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    # Slicing (not indexing) so that input ending right
                    # after the macro is reported as a format error.
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    if m.group(2) == "{" and line[:1] == "}":
                        # Braced form \c{x}: the pattern consumed the
                        # "{", so drop the matching "}" as well instead
                        # of letting it close an unrelated element.
                        line = line[1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end
                    # is copied as data without being interpreted.
                    endtag = "\\end{%s}" % macroname
                    pos = line.find(endtag)
                    if pos < 0:
                        raise LaTeXFormatError(
                            "missing \\end{%s} for verbatim environment"
                            % macroname)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endtag):]
                    continue
                # Close open elements that this macro implicitly ends.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname and entry.empty:
                    self.write("e\n")
                #
                params, _, empty, _ = self.start_macro(macroname)
                # rip off the macroname; the optional space and brace
                # matched by the pattern stay in the input when there
                # are parameters to parse or the macro is empty
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            om = _optional_rx.match(line)
                            if om and entry.outputname:
                                line = line[om.end():]
                                self.dump_attr(pentry, om.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            pm = _parameter_rx.match(line)
                            if not pm:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, pm.group(1))
                            line = line[pm.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            om = _optional_rx.match(line)
                            if om:
                                line = line[om.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(om.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Skip the opening brace, convert the group
                            # and then skip the closing brace.
                            self.line = skip_white(line)[1:]
                            line = self.subconvert(
                                "}", len(stack) + depth + 1)[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.err_write("--- text: %s\n" % repr(pentry.text))
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError("unbalanced group close: %s"
                                           % repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.get_entry(macroname)
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # A bare group is tracked as an unnamed stack entry.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and len(line) > 1 \
               and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # The input is exhausted: close the elements whose table entry
        # has a non-empty "closes" list; anything else left is an error.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here; return the (empty)
        # remainder so that callers can always slice the result.
        self.line = line
        return line

    def combining_char(self, prefix, char):
        """Write the character formed by accent *prefix* and *char*.

        Raises LaTeXFormatError for combinations that are missing from
        the _combinations table.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write_ordinal(ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for *name*.

        *optional* is the "optional" flag of the first parameter, or
        the empty parameter list itself when there are no parameters.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for *name*, creating one if needed.

        The default entry is a macro with a single "content" parameter;
        it is stored in the table for later lookups.
        """
        entry = self.table.get(name)
        if entry is None:
            self.err_write("get_entry(%s) failing; building default entry!"
                           % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment *name*.

        Unknown names get a default environment entry with implied
        content.  Raises LaTeXFormatError when *name* is defined in the
        table as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for parameter *pentry*.

        Nothing is written when the parameter has no name or *value* is
        empty.  The type is TOKEN when *value* matches _token_rx and
        CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
411
412
def convert(ifp, ofp, table):
    """Convert the LaTeX document read from *ifp*, writing to *ofp*.

    *table* is the conversion table handed to Conversion.  An IOError
    whose errno is EPIPE is silently ignored; every other exception
    propagates to the caller.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Inspect the errno attribute instead of unpacking the exception
        # arguments: unpacking fails for an IOError that does not carry
        # exactly two arguments and would hide the original error.
        err = sys.exc_info()[1]
        if err.errno != errno.EPIPE:
            raise
420
421
def skip_white(line):
    """Return *line* without leading whitespace and "%" characters.

    Only the "%" character itself is dropped; text following it on the
    same line is not treated as a comment and is kept.
    """
    while line and line[0] in " %\n\t\r":
        line = line[1:].lstrip()
    return line
426
427
Fred Drake96e4a061999-07-29 22:22:13 +0000428
class TableEntry:
    """Conversion rules for a single macro or environment."""

    def __init__(self, name, environment=0):
        # Name as written in the LaTeX source.
        self.name = name
        # Name written to the output; defaults to the source name.
        self.outputname = name
        # True for an environment, false for a macro.
        self.environment = environment
        # Macros start out empty; environments do not.
        self.empty = not environment
        self.has_content = 0
        self.verbatim = 0
        self.auto_close = 0
        # Parameter objects, in the order they are processed.
        self.parameters = []
        # Names of open elements closed implicitly when this one starts.
        self.closes = []
        # Names of open elements closed implicitly when this
        # environment ends.
        self.endcloses = []
440 self.endcloses = []
441
class Parameter:
    """Description of one parameter of a macro or environment.

    *type* is one of the kinds handled by Conversion.subconvert():
    "attribute", "child", "content", "text" or "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type = type
        self.name = name
        self.optional = optional
        # Fixed text taken from the conversion table, if any.
        self.text = ''
        # Set for "content" parameters that are not delimited by braces.
        self.implied = 0
448 self.implied = 0
449
450
class TableParser(XMLParser):
    """Build the conversion table from its XML description.

    The handlers below cover the elements "environment", "macro",
    "attribute", "entityref", "child", "content" and "text".  Finished
    entries are stored, keyed by name, in the dictionary passed in as
    *table* (a new one is created when it is omitted).
    """

    def __init__(self, table=None):
        if table is None:
            table = {}
        self.__table = table
        # Entry of the macro/environment element being parsed.
        self.__current = None
        # Character data collected by handle_data().
        self.__buffer = ''
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, completing environment entries first.

        Environments that declared no content receive an implied
        "content" parameter.
        """
        for entry in self.__table.values():
            if entry.environment and not entry.has_content:
                p = Parameter("content")
                p.implied = 1
                entry.parameters.append(p)
                entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        name = attrs["name"]
        entry = TableEntry(name, environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        # TableEntry already defaults outputname to the name.
        entry.outputname = attrs.get("outputname", name)
        entry.endcloses = attrs.get("endcloses", "").split()
        self.__current = entry

    def end_environment(self):
        self.end_macro()

    def start_macro(self, attrs):
        name = attrs["name"]
        entry = TableEntry(name)
        entry.closes = attrs.get("closes", "").split()
        entry.outputname = attrs.get("outputname", name)
        self.__current = entry

    def end_macro(self):
        """Store the finished entry in the table."""
        self.__table[self.__current.name] = self.__current
        self.__current = None

    def start_attribute(self, attrs):
        # A missing or empty name is stored as None.
        name = attrs.get("name") or None
        optional = attrs.get("optional") == "yes"
        p = Parameter("attribute", name, optional=optional)
        self.__current.parameters.append(p)
        self.__buffer = ''

    def end_attribute(self):
        # Character data inside the element is the fixed attribute value.
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        name = attrs["name"]
        p = Parameter("entityref", name)
        self.__current.parameters.append(p)

    def start_child(self, attrs):
        name = attrs["name"]
        p = Parameter("child", name, attrs.get("optional") == "yes")
        self.__current.parameters.append(p)
        self.__current.empty = 0

    def start_content(self, attrs):
        p = Parameter("content")
        p.implied = attrs.get("implied") == "yes"
        # The content of an environment is always implied.
        if self.__current.environment:
            p.implied = 1
        self.__current.parameters.append(p)
        self.__current.has_content = 1
        self.__current.empty = 0

    def start_text(self, attrs):
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        p = Parameter("text")
        p.text = self.__buffer
        self.__current.parameters.append(p)

    def handle_data(self, data):
        self.__buffer = self.__buffer + data
531
532
def load_table(fp, table=None):
    """Parse the XML conversion table read from *fp* and return it.

    Entries are added to *table* when one is given; otherwise a new
    dictionary is created by the parser.
    """
    parser = TableParser(table=table)
    parser.feed(fp.read())
    parser.close()
    return parser.get_table()
538
539
def usage():
    """Write a short usage summary to stderr."""
    sys.stderr.write("usage: %s [-D | --debug] [infile [outfile]]\n"
                     % os.path.basename(sys.argv[0]))


def main():
    """Command-line entry point.

    Each -D/--debug option raises the DEBUG level by one.  With no
    arguments the document is read from stdin and written to stdout;
    one argument names the input file; two arguments name the input and
    output files.  Anything else prints the usage and exits with
    status 2.  The conversion table is read from conversion.xml in
    sys.path[0].
    """
    global DEBUG
    #
    try:
        opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    except getopt.error:
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) == 0:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(args) == 1:
        # open() needs the file name, not the argument list.
        ifp = open(args[0])
        ofp = sys.stdout
    elif len(args) == 2:
        ifp = open(args[0])
        ofp = open(args[1], "w")
    else:
        usage()
        sys.exit(2)

    try:
        table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
        try:
            table = load_table(table_fp)
        finally:
            table_fp.close()
        convert(ifp, ofp, table)
    finally:
        # Close only the files opened here, never the standard streams.
        if ifp is not sys.stdin:
            ifp.close()
        if ofp is not sys.stdout:
            ofp.close()
Fred Drake30a68c71998-11-23 16:59:39 +0000562
563
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()