blob: d4bfa3f8162d540feaca60571f86a5991844251f [file] [log] [blame]
Fred Drake30a68c71998-11-23 16:59:39 +00001#! /usr/bin/env python
2
Fred Drake0eb7b2a1999-05-19 17:37:37 +00003"""Generate ESIS events based on a LaTeX source document and
4configuration data.
5
6The conversion is not strong enough to work with arbitrary LaTeX
7documents; it has only been designed to work with the highly stylized
8markup used in the standard Python documentation. A lot of
9information about specific markup is encoded in the control table
10passed to the convert() function; changing this table can allow this
11tool to support additional LaTeX markups.
12
The format of the table is largely undocumented; main() loads it from
the XML file 'conversion.xml' found in the script's directory (see
load_table() and TableParser for the elements that are recognized).
There is no provision to load an alternate table from a different file.
Fred Drake30a68c71998-11-23 16:59:39 +000016"""
Fred Drake30a68c71998-11-23 16:59:39 +000017
18import errno
Fred Drake96e4a061999-07-29 22:22:13 +000019import getopt
20import os
Fred Drake30a68c71998-11-23 16:59:39 +000021import re
22import string
Fred Drake30a68c71998-11-23 16:59:39 +000023import sys
Fred Drake96e4a061999-07-29 22:22:13 +000024import UserList
Fred Drake691a5a72000-11-22 17:56:43 +000025import xml.sax.saxutils
Fred Drake30a68c71998-11-23 16:59:39 +000026
Fred Drake54fb7fb1999-05-10 19:36:03 +000027from types import ListType, StringType, TupleType
Fred Drakeaeea9811998-12-01 19:04:12 +000028
Fred Drake96e4a061999-07-29 22:22:13 +000029try:
30 from xml.parsers.xmllib import XMLParser
31except ImportError:
32 from xmllib import XMLParser
33
Fred Drake30a68c71998-11-23 16:59:39 +000034
# Debug verbosity level; main() adds one for every -D/--debug option.
# Any non-zero value enables the dbgmsg()/Conversion.err_write() output on
# stderr and makes new_stack() hand out the tracing _Stack.
DEBUG = 0
36
37
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be processed by the converter."""
40
41
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open element.

    Attributes:
        found -- the environment name whose close was encountered
        stack -- a copy of the element stack at the time of the error
    """

    def __init__(self, found, stack):
        self.found = found
        # Store a copy so later changes to the caller's stack don't show here.
        self.stack = stack[:]
        message = ("environment close for %s doesn't match;\n stack = %s"
                   % (found, stack))
        LaTeXFormatError.__init__(self, message)
49
def encode(s):
    """Return *s* prepared for output as character data.

    The text is first passed through xml.sax.saxutils.escape().  Each
    newline is then replaced by a backslash and the letter "n", followed by
    a real newline and a "-" so that the data continues on a fresh line.
    """
    escaped = xml.sax.saxutils.escape(s)
    line_break = "\\n\n-"
    return line_break.join(escaped.split("\n"))
53
Fred Drake96e4a061999-07-29 22:22:13 +000054
# Patterns used by Conversion.subconvert(); all are applied with match(),
# i.e. anchored at the start of the remaining input.

# "\begin{NAME}"; group 1 is NAME.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
# "\end{NAME}"; group 1 is NAME.
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A macro: backslash, letters and an optional "*" (group 1), an optional
# space, then either "{" or optional whitespace (group 2).
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# A "%" comment up to and including its newline, plus any blanks/tabs that
# start the next line; group 1 is the comment text.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text: anything but "]", "~", "%", backslash, "{", "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# A bracketed optional argument after optional whitespace; group 1 is the
# text between the brackets.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# A complete attribute value made of a letter followed by letters, digits,
# "." or "-"; Conversion.dump_attr() labels such values TOKEN.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Opening "{" / "[" after optional blanks and newlines.
# NOTE(review): neither pattern is referenced elsewhere in the visible code.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters that subconvert() emits literally when preceded by a backslash.
ESCAPED_CHARS = "$%#^ {}&~"
Fred Drake30a68c71998-11-23 16:59:39 +000070
71
def dbgmsg(msg):
    """Write *msg* followed by a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
75
def pushing(name, point, depth):
    """Emit a debug message saying element *name* is pushed at *point*.

    *depth* is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
Fred Draked7acf021999-01-14 17:38:12 +000078
def popping(name, point, depth):
    """Emit a debug message saying element *name* is popped at *point*.

    *depth* is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
Fred Draked7acf021999-01-14 17:38:12 +000081
82
class _Stack(UserList.UserList):
    """Element stack that traces every push and pop on stderr.

    new_stack() returns one of these instead of a plain list when DEBUG is
    set.  Only strings may be pushed.  Each trace line is indented by one
    space per entry on the stack.
    """

    def append(self, entry):
        """Push *entry*; raise LaTeXFormatError if it is not a string."""
        if type(entry) is not StringType:
            raise LaTeXFormatError("cannot push non-string on stack: "
                                   + repr(entry))
        sys.stderr.write("%s<%s>\n" % (" "*len(self.data), entry))
        self.data.append(entry)

    def _remove(self, index):
        # Delete the entry at *index*, trace the removal, and return it.
        entry = self.data[index]
        del self.data[index]
        sys.stderr.write("%s</%s>\n" % (" "*len(self.data), entry))
        return entry

    def pop(self, index=-1):
        """Remove and return the entry at *index* (the top by default).

        The entry must be returned: callers write the popped name to the
        output, exactly as they do with the plain list used when DEBUG is
        off.
        """
        return self._remove(index)

    def __delitem__(self, index):
        """Remove the entry at *index* (supports ``del stack[-1]``)."""
        self._remove(index)
100
101
def new_stack():
    """Return an empty element stack.

    A plain list is used normally; with DEBUG set, a tracing _Stack is
    returned instead.
    """
    if not DEBUG:
        return []
    return _Stack()
106
107
class Conversion:
    """Convert a LaTeX document to a stream of event lines.

    The complete input is read when the object is created.  convert() then
    walks the text and writes one line per event to the output file, driven
    by the conversion table (a mapping from macro/environment name to
    TableEntry).  The lines written start with "(" and ")" for element
    open/close, "A" for attributes, "-" for character data, "&" for entity
    references and "e" for entries flagged as empty.
    """

    # This is a really limited table of combinations, but it will have
    # to do for now.  Keys are (accent command, letter); values are the
    # ordinal written for the combined character.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def __init__(self, ifp, ofp, table):
        """Read all of *ifp*; remember *ofp* and *table* for the conversion."""
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is removed from every input line and the lines
        # are re-joined with single newlines (no newline after the last one).
        self.line = "\n".join([text.rstrip() for text in ifp.readlines()])
        self.preamble = 1

    def err_write(self, msg):
        """Write str(msg) and a newline to stderr when DEBUG is set."""
        if DEBUG:
            sys.stderr.write(str(msg) + "\n")

    def convert(self):
        """Convert the whole input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input starting at self.line.

        Parses content, including sub-structures, until the character
        *endchar* is found (with no open structures), or until the end of
        the input data if *endchar* is None.  *depth* is only passed on to
        nested calls.

        Returns the remaining input, starting with *endchar*, when the end
        character is found (self.line is set to the same text).  Returns an
        empty string when the input is exhausted.

        Raises LaTeXFormatError (or its subclass LaTeXStackError) for markup
        that cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its effects only: it rejects names defined as
                # macros and registers a default entry for unknown names.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                # Close elements that this environment's end implicitly ends.
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    # Slice rather than index so that input ending right
                    # after the command is reported by combining_char().
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end is
                    # copied through as data without further parsing.
                    terminator = "\\end{%s}" % macroname
                    pos = line.find(terminator)
                    if pos < 0:
                        # Without this check the -1 from find() would be
                        # used as a slice index and garble the output.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % terminator)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(terminator):]
                    continue
                # Close open elements that this macro implicitly ends.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname:
                    if entry.empty:
                        self.write("e\n")
                #
                params, optional, empty, environ = self.start_macro(macroname)
                # rip off the macroname; anything matched after the name is
                # kept when parameters follow or the entry is empty.
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Drop the opening character of the group, convert
                            # its content, then drop the closing character.
                            self.line = skip_white(line)[1:]
                            line = self.subconvert(
                                "}", len(stack) + depth + 1)[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # Slice so that exhausted input is reported as
                            # missing content rather than an IndexError.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.err_write("--- text: %s\n" % repr(pentry.text))
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            # (The end-character test at the top of the loop has already
            # been made for this exact line and stack, so it is not repeated.)
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unmatched close brace: " + repr(line[:100]))
                macroname = stack[-1]
                # An empty name marks a bare group, which has no close tag.
                if macroname:
                    conversion = self.get_entry(macroname)
                    if conversion.outputname:
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # bare group: remembered with an empty name
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and len(line) > 1 and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                # Re-scan with a plain underscore as ordinary text.
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever can be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...  Return a string (not
        # None) so callers that slice the result keep working.
        return ""

    def combining_char(self, prefix, char):
        """Write the data line for accent command *prefix* applied to *char*.

        Raises LaTeXFormatError when the pair is not in _combinations.
        """
        ordinal = self._combinations.get((prefix, char))
        if ordinal is None:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for *name*.

        *optional* reflects the first parameter's optional flag (or is the
        empty parameter list itself when there are no parameters).
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for *name*, creating a default if needed.

        The default entry is a macro with a single content parameter; it is
        stored in the table so later lookups find it.
        """
        entry = self.table.get(name)
        if entry is None:
            self.err_write("get_entry(%s) failing; building default entry!"
                           % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment *name*.

        An unknown name gets a default environment entry with implied
        content, which is stored in the table.  Raises LaTeXFormatError when
        *name* is defined in the table as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute line for parameter *pentry* with *value*.

        Nothing is written when the parameter has no name or the value is
        empty.  The value is labelled TOKEN when it matches _token_rx and
        CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
407
408
def convert(ifp, ofp, table):
    """Convert the LaTeX read from *ifp*, writing the result to *ofp*.

    *table* is the conversion table handed to Conversion.  An IOError whose
    errno is EPIPE (the reader of the output went away) ends the conversion
    quietly; any other IOError propagates.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Inspect the errno attribute instead of unpacking the exception
        # into an (err, msg) pair: unpacking fails with a ValueError, hiding
        # the real error, for an IOError that does not carry exactly two
        # arguments.  sys.exc_info() keeps the syntax valid on old and new
        # interpreters alike.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
416
417
def skip_white(line):
    """Return *line* without its leading whitespace and "%" characters."""
    skippable = " %\n\t\r"
    while line and line[0] in skippable:
        rest = line[1:]
        line = rest.lstrip()
    return line
421 return line
422
423
Fred Drake96e4a061999-07-29 22:22:13 +0000424
class TableEntry:
    """Conversion-table record for a single LaTeX macro or environment."""

    def __init__(self, name, environment=0):
        # The output element name starts out equal to the LaTeX name.
        self.name = self.outputname = name
        self.environment = environment
        # Only non-environments start out flagged as empty.
        self.empty = not environment
        # Flags that are off until something turns them on.
        self.has_content = self.verbatim = self.auto_close = 0
        # Each entry owns separate lists: its parameters, and the names of
        # elements it closes when it starts / when it ends.
        self.parameters = []
        self.closes = []
        self.endcloses = []
436 self.endcloses = []
437
class Parameter:
    """One parameter slot of a TableEntry.

    The kinds used in this file are "attribute", "child", "content",
    "text" and "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        # Fixed text attached to the parameter; empty until assigned.
        self.text = ''
        # Set for content that is not delimited in the input.
        self.implied = 0
444 self.implied = 0
445
446
class TableParser(XMLParser):
    """Build the conversion table from its XML description.

    The start_*/end_* methods are presumably invoked by XMLParser for the
    element of the same name -- confirm against the xmllib documentation.
    Entries are collected in a mapping from name to TableEntry.
    """

    def __init__(self, table=None):
        self.__current = None
        self.__buffer = ''
        if table is None:
            table = {}
        self.__table = table
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, giving implied content to bare environments."""
        for entry in self.__table.values():
            if not entry.environment or entry.has_content:
                continue
            implied = Parameter("content")
            implied.implied = 1
            entry.parameters.append(implied)
            entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        name = attrs["name"]
        entry = TableEntry(name, environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        # A new entry's output name already equals its name, so falling back
        # to the name leaves it untouched when no override is given.
        entry.outputname = attrs.get("outputname", name)
        entry.endcloses = attrs.get("endcloses", "").split()
        self.__current = entry

    def end_environment(self):
        self.end_macro()

    def start_macro(self, attrs):
        name = attrs["name"]
        entry = TableEntry(name)
        entry.closes = attrs.get("closes", "").split()
        entry.outputname = attrs.get("outputname", name)
        self.__current = entry

    def end_macro(self):
        # File the finished entry under its name.
        finished = self.__current
        self.__table[finished.name] = finished
        self.__current = None

    def start_attribute(self, attrs):
        is_optional = attrs.get("optional") == "yes"
        # A missing or empty name is recorded as None.
        attr_name = attrs.get("name") or None
        self.__current.parameters.append(
            Parameter("attribute", attr_name, optional=is_optional))
        self.__buffer = ''

    def end_attribute(self):
        # Character data seen inside the element becomes the fixed value.
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        self.__current.parameters.append(
            Parameter("entityref", attrs["name"]))

    def start_child(self, attrs):
        child = Parameter("child", attrs["name"],
                          attrs.get("optional") == "yes")
        self.__current.parameters.append(child)
        self.__current.empty = 0

    def start_content(self, attrs):
        content = Parameter("content")
        # Content of an environment is always implied.
        if self.__current.environment:
            content.implied = 1
        else:
            content.implied = attrs.get("implied") == "yes"
        entry = self.__current
        entry.parameters.append(content)
        entry.has_content = 1
        entry.empty = 0

    def start_text(self, attrs):
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        literal = Parameter("text")
        literal.text = self.__buffer
        self.__current.parameters.append(literal)

    def handle_data(self, data):
        # Accumulate character data for end_attribute()/end_text().
        self.__buffer = self.__buffer + data
526 self.__buffer = self.__buffer + data
527
528
def load_table(fp, table=None):
    """Parse the table description read from *fp* and return the table.

    *table*, when given, is passed to TableParser as the mapping to fill.
    """
    table_parser = TableParser(table=table)
    document = fp.read()
    table_parser.feed(document)
    table_parser.close()
    return table_parser.get_table()
533 return parser.get_table()
534
535
def main():
    """Command-line entry point.

    Usage: [-D | --debug] [infile [outfile]]

    Input defaults to stdin and output to stdout.  Each -D/--debug raises
    the DEBUG level by one.  The conversion table is read from the file
    'conversion.xml' in the directory given by sys.path[0].  More than two
    arguments print a usage message on stderr and exit with status 2.
    """
    global DEBUG
    #
    opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) > 2:
        # The message is written here because no usage() function is
        # defined in this file.
        sys.stderr.write("usage: %s [-D|--debug] [infile [outfile]]\n"
                         % os.path.basename(sys.argv[0]))
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if args:
        # open() needs the file name itself, not the argument list.
        ifp = open(args[0])
    if len(args) == 2:
        ofp = open(args[1], "w")
    try:
        tablefile = open(os.path.join(sys.path[0], 'conversion.xml'))
        try:
            table = load_table(tablefile)
        finally:
            tablefile.close()
        convert(ifp, ofp, table)
    finally:
        # Close only the files opened here; leave the standard streams alone.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()
557 convert(ifp, ofp, table)
Fred Drake30a68c71998-11-23 16:59:39 +0000558
559
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()