#! /usr/bin/env python

"""Generate ESIS events based on a LaTeX source document and
configuration data.

The conversion is not strong enough to work with arbitrary LaTeX
documents; it has only been designed to work with the highly stylized
markup used in the standard Python documentation. A lot of
information about specific markup is encoded in the control table
passed to the convert() function; changing this table can allow this
tool to support additional LaTeX markups.

The format of the table is largely undocumented; main() loads it from
the conversion.xml file in the directory named by sys.path[0], and the
elements that are understood are the ones handled by the TableParser
class. There is no command-line provision to load an alternate table
from a different file.
"""
17
18import errno
19import getopt
20import os
21import re
22import string
23import sys
24import UserList
25import xml.sax.saxutils
26
27from types import ListType, StringType, TupleType
28
29try:
30 from xml.parsers.xmllib import XMLParser
31except ImportError:
32 from xmllib import XMLParser
33
34
35from esistools import encode
36
37
# Debugging level.  main() adds one for every -D/--debug option; any
# non-zero value makes dbgmsg() write to stderr and makes new_stack()
# return the checking _Stack instead of a plain list.
DEBUG = 0
39
40
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be converted."""
43
44
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open elements.

    Attributes:
        found -- the name that was being closed
        stack -- a copy of the element stack at the time of the error
    """

    def __init__(self, found, stack):
        LaTeXFormatError.__init__(
            self,
            "environment close for %s doesn't match;\n stack = %s"
            % (found, stack))
        self.found = found
        # Keep a copy so later changes to the live stack do not show here.
        self.stack = stack[:]
52
53
# All of these patterns are applied with .match(), i.e. anchored at the
# start of the remaining input.

# "\begin{NAME}"; group 1 is NAME.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
# "\end{NAME}"; group 1 is NAME.
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A backslash followed by a macro name made of letters with an optional
# trailing "*" (group 1), an optional single space, and then either an
# opening brace or a run of whitespace (group 2).
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# One or more "%", an optional space, the rest of the line (group 1), the
# newline, and any spaces or tabs that start the following line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of characters containing none of "]", "~", "%", backslash, "{"
# or "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# Optional whitespace followed by "[...]"; group 1 is the bracketed text.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# A letter followed by letters, digits, "." or "-" up to the end of the
# string; dump_attr() uses it to choose between TOKEN and CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Spaces and newlines followed by an opening brace.
_start_group_rx = re.compile("[ \n]*{")
# Spaces and newlines followed by an opening square bracket.
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters that subconvert() emits as literal data when they follow a
# backslash in the input.
ESCAPED_CHARS = "$%#^ {}&~"
69
70
def dbgmsg(msg):
    """Write msg and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
74
def pushing(name, point, depth):
    """Report through dbgmsg() that element name is pushed at point.

    depth is accepted but not used.
    """
    report = "pushing <%s> at %s" % (name, point)
    dbgmsg(report)
77
def popping(name, point, depth):
    """Report through dbgmsg() that element name is popped at point.

    depth is accepted but not used.
    """
    report = "popping </%s> at %s" % (name, point)
    dbgmsg(report)
80
81
82class _Stack(UserList.UserList):
83 def append(self, entry):
84 if type(entry) is not StringType:
85 raise LaTeXFormatError("cannot push non-string on stack: "
86 + `entry`)
87 #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
88 self.data.append(entry)
89
90 def pop(self, index=-1):
91 entry = self.data[index]
92 del self.data[index]
93 #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
94
95 def __delitem__(self, index):
96 entry = self.data[index]
97 del self.data[index]
98 #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
99
100
def new_stack():
    """Return an empty element stack.

    A plain list is used normally; when DEBUG is set the checking _Stack
    is returned instead.
    """
    if not DEBUG:
        return []
    return _Stack()
105
106
class Conversion:
    """Convert LaTeX read from one file into ESIS events on another.

    The whole input is read by the constructor; convert() then walks it
    and writes one event per line through self.write:

        (NAME   element start          )NAME   element end
        -DATA   character data         &NAME   entity reference
        ANAME TYPE VALUE   attribute   e       precedes an empty element

    What is written for each macro or environment is controlled by the
    table, a dictionary mapping names to TableEntry objects.  Names that
    are missing from the table get a default entry added on first use.
    """

    # This is a really limited table of combinations, but it will have
    # to do for now.  Keys are (accent macro, letter); values are the
    # ordinals written by combining_char().
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def __init__(self, ifp, ofp, table):
        """Read all of ifp and remember ofp and the conversion table.

        Trailing whitespace is removed from every input line and the
        lines are re-joined with single newlines.
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        self.line = "\n".join([text.rstrip() for text in ifp.readlines()])
        # Set here but not consulted anywhere else in this class.
        self.preamble = 1

    def convert(self):
        """Convert the complete input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input up to endchar, or to the end of the data.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        When endchar is found, the remaining input -- still starting with
        endchar -- is stored in self.line and returned; the caller is
        responsible for stripping it.  When the input runs out first the
        result is None.  depth is only handed on to nested calls.

        Raises LaTeXStackError when an environment close does not match
        the open elements, and LaTeXFormatError for markup that cannot be
        handled or for elements left open at the end of the input.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its effects only: it registers a default
                # entry for an unknown name and rejects a name that the
                # table defines as a macro.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                # Implicitly close anything this environment's end is
                # allowed to close.
                while (stack and envname != stack[-1]
                       and stack[-1] in entry.endcloses):
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh! This is a combining character...
                    endpos = m.end()
                    # Slice rather than index so that input ending right
                    # after the macro is reported as a format error.
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end is
                    # copied through as character data.
                    terminator = "\\end{%s}" % macroname
                    pos = line.find(terminator)
                    if pos < 0:
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % terminator)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(terminator):]
                    continue
                # Close open elements that this macro implicitly ends.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname and entry.empty:
                    self.write("e\n")
                #
                params, optional, empty, environ = self.start_macro(macroname)
                # rip off the macroname; when there are no parameters and
                # the element is not empty, the rest of the match (the
                # brace or whitespace) is dropped as well.
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Drop the character that opens the group and
                            # convert its content recursively.
                            self.line = skip_white(line)[1:]
                            rest = self.subconvert(
                                "}", len(stack) + depth + 1)
                            if rest is None:
                                # The input ended before the closing "}".
                                raise LaTeXFormatError(
                                    "unterminated %s parameter for %s"
                                    % (pentry.name, macroname))
                            line = rest[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        # With implied content the element stays open
                        # until a later close removes it from the stack.
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unmatched '}': %s" % repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # A bare group is tracked with an empty name.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and len(line) > 1 and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...

    def combining_char(self, prefix, char):
        """Write the character for accent macro prefix applied to char.

        Raises LaTeXFormatError when the pair is not in _combinations,
        which includes the case of char being empty because the input
        ended.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for name.

        optional is the 'optional' flag of the first parameter, or the
        empty parameter list itself when there are no parameters.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for name, adding a default if needed.

        The default is a macro entry with a single "content" parameter.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%s) failing; building default entry!"
                   % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment name.

        An unknown name gets a default environment entry with implied
        content.  Raises LaTeXFormatError when the table defines name as
        a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for parameter pentry with value.

        Nothing is written when the parameter has no name or the value is
        empty.  The type is TOKEN when the value matches _token_rx and
        CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
401
402
def convert(ifp, ofp, table):
    """Convert the LaTeX read from ifp into ESIS events written to ofp.

    table is the conversion table, as returned by load_table().  An
    IOError whose errno is EPIPE is swallowed, so a reader that closes
    the output pipe early ends the conversion quietly; every other
    exception propagates.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Look at the exception object instead of unpacking it in the
        # except clause: unpacking fails with an unrelated error when the
        # IOError does not carry exactly (errno, message).
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
409 raise
410
411
def skip_white(line):
    """Return line without its leading whitespace and "%" characters.

    Each pass drops one leading space, "%", newline, tab or carriage
    return and then strips any further leading whitespace, until the text
    starts with something else or is empty.
    """
    while line and line[0] in " %\n\t\r":
        line = line[1:].lstrip()
    return line
415 return line
416
417
418
class TableEntry:
    """Conversion settings for one macro or environment.

    A new entry is written out under its own name, is "empty" unless it
    is an environment, and starts with no parameters and nothing that it
    closes.
    """

    def __init__(self, name, environment=0):
        # The output name defaults to the LaTeX name.
        self.name = self.outputname = name
        self.environment = environment
        self.empty = not environment
        self.has_content = self.verbatim = self.auto_close = 0
        # Three separate lists; they are filled in independently.
        self.parameters = []
        self.closes = []
        self.endcloses = []
431
class Parameter:
    """One parameter of a macro or environment in the conversion table.

    type is the kind of parameter; the kinds used in this file are
    "attribute", "child", "content", "text" and "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        self.implied = 0
        self.text = ''
439
440
class TableParser(XMLParser):
    """Build the conversion table from its XML description.

    The start_*/end_* methods handle the <environment>, <macro>,
    <attribute>, <entityref>, <child>, <content> and <text> elements
    (presumably dispatched by the XMLParser base class -- confirm against
    xmllib).  Finished entries are stored in the table under their name.
    """

    def __init__(self, table=None):
        """Start with table, or with a new empty dictionary."""
        if table is None:
            table = {}
        self.__table = table
        self.__current = None
        self.__buffer = ''
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, giving content-less environments implied content."""
        for entry in self.__table.values():
            if entry.has_content or not entry.environment:
                continue
            implied = Parameter("content")
            implied.implied = 1
            entry.parameters.append(implied)
            entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        """Begin an environment entry from the element's attributes."""
        entry = TableEntry(attrs["name"], environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        # Without an explicit outputname the entry keeps its own name.
        entry.outputname = attrs.get("outputname", entry.outputname)
        entry.endcloses = attrs.get("endcloses", "").split()
        self.__current = entry

    def end_environment(self):
        """Store the finished environment entry."""
        self.end_macro()

    def start_macro(self, attrs):
        """Begin a macro entry from the element's attributes."""
        entry = TableEntry(attrs["name"])
        entry.closes = attrs.get("closes", "").split()
        # Without an explicit outputname the entry keeps its own name.
        entry.outputname = attrs.get("outputname", entry.outputname)
        self.__current = entry

    def end_macro(self):
        """Store the finished entry in the table under its name."""
        finished = self.__current
        self.__table[finished.name] = finished
        self.__current = None

    def start_attribute(self, attrs):
        """Add an attribute parameter and start collecting its text."""
        is_optional = attrs.get("optional") == "yes"
        # A missing or empty name is recorded as None.
        param = Parameter("attribute", attrs.get("name") or None,
                          optional=is_optional)
        self.__current.parameters.append(param)
        self.__buffer = ''

    def end_attribute(self):
        """Store the collected text as the attribute's fixed value."""
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        """Add an entity reference parameter."""
        self.__current.parameters.append(
            Parameter("entityref", attrs["name"]))

    def start_child(self, attrs):
        """Add a child-element parameter; the entry is then not empty."""
        child = Parameter("child", attrs["name"],
                          attrs.get("optional") == "yes")
        self.__current.parameters.append(child)
        self.__current.empty = 0

    def start_content(self, attrs):
        """Add a content parameter; content of environments is implied."""
        content = Parameter("content")
        if self.__current.environment:
            content.implied = 1
        else:
            content.implied = attrs.get("implied") == "yes"
        self.__current.parameters.append(content)
        self.__current.has_content = 1
        self.__current.empty = 0

    def start_text(self, attrs):
        """Start collecting literal text; the entry is then not empty."""
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        """Add a text parameter holding the collected text."""
        literal = Parameter("text")
        literal.text = self.__buffer
        self.__current.parameters.append(literal)

    def handle_data(self, data):
        """Append character data to the collection buffer."""
        self.__buffer += data
521
522
def load_table(fp, table=None):
    """Parse the table description in fp and return the conversion table.

    Entries are added to table when one is given; otherwise a new
    dictionary is built.
    """
    builder = TableParser(table=table)
    document = fp.read()
    builder.feed(document)
    builder.close()
    return builder.get_table()
528
529
def main():
    """Command-line entry point.

    Usage: [-D|--debug] [infile [outfile]]

    Input defaults to stdin and output to stdout.  Every -D/--debug
    option raises the global DEBUG level by one.  The conversion table is
    read from conversion.xml in the directory named by sys.path[0].
    Invalid options or more than two file arguments print a usage
    message to stderr and exit with status 2.
    """
    global DEBUG
    usage_text = ("usage: %s [-D|--debug] [infile [outfile]]\n"
                  % os.path.basename(sys.argv[0]))
    try:
        opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    except getopt.error:
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        sys.stderr.write(usage_text)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) > 2:
        sys.stderr.write(usage_text)
        sys.exit(2)

    ifp = sys.stdin
    ofp = sys.stdout
    try:
        if args:
            ifp = open(args[0])
        if len(args) == 2:
            ofp = open(args[1], "w")
        tablefp = open(os.path.join(sys.path[0], 'conversion.xml'))
        try:
            table = load_table(tablefp)
        finally:
            tablefp.close()
        convert(ifp, ofp, table)
    finally:
        # Close only what was opened here; leave the standard streams.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()
552
553
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()