blob: 07f46f8cb16beb17f6b9e2269a9ff5b632bee1ce [file] [log] [blame]
William M. Brack68aca052003-10-11 15:22:13 +00001#!/usr/bin/python -u
2#
3# Portions of this script have been (shamelessly) stolen from the
4# prior work of Daniel Veillard (genUnicode.py)
5#
6# I, however, take full credit for any bugs, errors or difficulties :-)
7#
8# William Brack
9# October 2003
10#
11
12import sys
13import string
14import time
15
16#
William M. Brack68aca052003-10-11 15:22:13 +000017# A routine to take a list of yes/no (1, 0) values and turn it
18# into a list of ranges. This will later be used to determine whether
19# to generate single-byte lookup tables, or inline comparisons
20#
21def makeRange(lst):
22 ret = []
23 pos = 0
24 while pos < len(lst):
25 try: # index generates exception if not present
26 s = lst[pos:].index(1) # look for start of next range
27 except:
28 break # if no more, finished
29 pos += s # pointer to start of possible range
30 try:
31 e = lst[pos:].index(0) # look for end of range
32 e += pos
33 except: # if no end, set to end of list
34 e = len(lst)
35 ret.append((pos, e-1)) # append range tuple to list
36 pos = e + 1 # ready to check for next range
37 return ret
38
39sources = "chvalid.def" # input filename
40
41# minTableSize gives the minimum number of ranges which must be present
42# before a 256-byte lookup table is produced. If there are less than this
43# number, a macro with inline comparisons is generated
44minTableSize = 6
45
William M. Brack68aca052003-10-11 15:22:13 +000046# dictionary of functions, key=name, element contains char-map and range-list
47Functs = {}
48
49state = 0
50
51try:
52 defines = open("chvalid.def", "r")
53except:
54 print "Missing chvalid.def, aborting ..."
55 sys.exit(1)
56
57#
58# The lines in the .def file have three types:-
59# name: Defines a new function block
60# ur: Defines individual or ranges of unicode values
61# end: Indicates the end of the function block
62#
63# These lines are processed below.
64#
65for line in defines.readlines():
66 # ignore blank lines, or lines beginning with '#'
67 if line[0] == '#':
68 continue
69 line = string.strip(line)
70 if line == '':
71 continue
72 # split line into space-separated fields, then split on type
73 try:
74 fields = string.split(line, ' ')
75 #
76 # name line:
77 # validate any previous function block already ended
78 # validate this function not already defined
79 # initialize an entry in the function dicitonary
80 # including a mask table with no values yet defined
81 #
82 if fields[0] == 'name':
83 name = fields[1]
84 if state != 0:
85 print "'name' %s found before previous name" \
86 "completed" % (fields[1])
87 continue
88 state = 1
89 if Functs.has_key(name):
90 print "name '%s' already present - may give" \
91 " wrong results" % (name)
92 else:
93 # dict entry with two list elements (chdata, rangedata)
94 Functs[name] = [ [], [] ]
95 for v in range(256):
96 Functs[name][0].append(0)
97 #
98 # end line:
99 # validate there was a preceding function name line
100 # set state to show no current function active
101 #
102 elif fields[0] == 'end':
103 if state == 0:
104 print "'end' found outside of function block"
105 continue
106 state = 0
107
108 #
109 # ur line:
110 # validate function has been defined
111 # process remaining fields on the line, which may be either
112 # individual unicode values or ranges of values
113 #
114 elif fields[0] == 'ur':
115 if state != 1:
116 raise ValidationError, "'ur' found outside of 'name' block"
117 for el in fields[1:]:
118 pos = string.find(el, '..')
119 # pos <=0 means not a range, so must be individual value
120 if pos <= 0:
121 # cheap handling of hex or decimal values
122 if el[0:2] == '0x':
123 value = int(el[2:],16)
124 elif el[0] == "'":
125 value = ord(el[1])
126 else:
127 value = int(el)
128 if ((value < 0) | (value > 0x1fffff)):
129 raise ValidationError, 'Illegal value (%s) in ch for'\
130 ' name %s' % (el,name)
131 # for ur we have only ranges (makes things simpler),
132 # so convert val to range
133 currange = (value, value)
134 # pos > 0 means this is a range, so isolate/validate
135 # the interval
136 else:
137 # split the range into it's first-val, last-val
138 (first, last) = string.split(el, "..")
139 # convert values from text into binary
140 if first[0:2] == '0x':
141 start = int(first[2:],16)
142 elif first[0] == "'":
143 start = ord(first[1])
144 else:
145 start = int(first)
146 if last[0:2] == '0x':
147 end = int(last[2:],16)
148 elif last[0] == "'":
149 end = ord(last[1])
150 else:
151 end = int(last)
152 if (start < 0) | (end > 0x1fffff) | (start > end):
153 raise ValidationError, "Invalid range '%s'" % el
154 currange = (start, end)
155 # common path - 'currange' has the range, now take care of it
156 # We split on single-byte values vs. multibyte
157 if currange[1] < 0x100: # single-byte
158 for ch in range(currange[0],currange[1]+1):
159 # validate that value not previously defined
160 if Functs[name][0][ch]:
161 msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
162 raise ValidationError, msg
163 Functs[name][0][ch] = 1
164 else: # multi-byte
William M. Brack68aca052003-10-11 15:22:13 +0000165 if currange in Functs[name][1]:
166 raise ValidationError, "range already defined in" \
167 " function"
168 else:
169 Functs[name][1].append(currange)
170
171 except:
172 print "Failed to process line: %s" % (line)
173 raise
174#
175# At this point, the entire definition file has been processed. Now we
176# enter the output phase, where we generate the two files chvalid.c and'
177# chvalid.h
178#
179# To do this, we first output the 'static' data (heading, fixed
180# definitions, etc.), then output the 'dynamic' data (the results
181# of the above processing), and finally output closing 'static' data
182# (e.g. the subroutine to process the ranges)
183#
184
185#
186# Generate the headings:
187#
188try:
Daniel Veillard1a993962003-10-11 20:58:06 +0000189 header = open("include/libxml/chvalid.h", "w")
William M. Brack68aca052003-10-11 15:22:13 +0000190except:
Daniel Veillard1a993962003-10-11 20:58:06 +0000191 print "Failed to open include/libxml/chvalid.h"
William M. Brack68aca052003-10-11 15:22:13 +0000192 sys.exit(1)
193
194try:
195 output = open("chvalid.c", "w")
196except:
197 print "Failed to open chvalid.c"
198 sys.exit(1)
199
200date = time.asctime(time.localtime(time.time()))
201
202header.write(
203"""/*
204 * chvalid.h: this header exports interfaces for the character
205 * range validation APIs
206 *
207 * This file is automatically generated from the cvs source
208 * definition files using the genChRanges.py Python script
209 *
210 * Generation date: %s
211 * Sources: %s
212 * William Brack <wbrack@mmm.com.hk>
213 */
214
215#ifndef __XML_CHVALID_H__
216#define __XML_CHVALID_H__
217
218#ifdef __cplusplus
219extern "C" {
220#endif
221
222/*
223 * Define our typedefs and structures
224 *
225 */
226typedef struct _xmlChSRange xmlChSRange;
227typedef xmlChSRange *xmlChSRangePtr;
228struct _xmlChSRange {
229 unsigned short low;
230 unsigned short high;
231};
232
233typedef struct _xmlChLRange xmlChLRange;
234typedef xmlChLRange *xmlChLRangePtr;
235struct _xmlChLRange {
236 unsigned low;
237 unsigned high;
238};
239
240typedef struct _xmlChRangeGroup xmlChRangeGroup;
241typedef xmlChRangeGroup *xmlChRangeGroupPtr;
242struct _xmlChRangeGroup {
243 int nbShortRange;
244 int nbLongRange;
245 xmlChSRangePtr shortRange; /* points to an array of ranges */
246 xmlChLRangePtr longRange;
247};
248
249/* Range checking routine */
250int xmlCharInRange(unsigned int val, const xmlChRangeGroupPtr group);
251
252""" % (date, sources));
253output.write(
254"""/*
255 * chvalid.c: this module implements the character range
256 * validation APIs
257 *
258 * This file is automatically generated from the cvs source
259 * definition files using the genChRanges.py Python script
260 *
261 * Generation date: %s
262 * Sources: %s
263 * William Brack <wbrack@mmm.com.hk>
264 */
265
William M. Brack6819a4e2003-10-11 15:59:36 +0000266#include <libxml/chvalid.h>
William M. Brack68aca052003-10-11 15:22:13 +0000267
268/*
269 * The initial tables ({func_name}_tab) are used to validate whether a
270 * single-byte character is within the specified group. Each table
271 * contains 256 bytes, with each byte representing one of the 256
272 * possible characters. If the table byte is set, the character is
273 * allowed.
274 *
275 */
276""" % (date, sources));
277
278#
279# Now output the generated data.
280# We try to produce the best execution times. Tests have shown that validation
281# with direct table lookup is, when there are a "small" number of valid items,
282# still not as fast as a sequence of inline compares. So, if the single-byte
283# portion of a range has a "small" number of ranges, we output a macro for inline
284# compares, otherwise we output a 256-byte table and a macro to use it.
285#
286
287fkeys = Functs.keys() # Dictionary of all defined functions
288fkeys.sort() # Put some order to our output
289
290for f in fkeys:
291
292# First we convert the specified single-byte values into a group of ranges.
293# If the total number of such ranges is less than minTableSize, we generate
294# an inline macro for direct comparisons; if greater, we generate a lookup
295# table.
296 if max(Functs[f][0]) > 0: # only check if at least one entry
297 rangeTable = makeRange(Functs[f][0])
298 numRanges = len(rangeTable)
299 if numRanges >= minTableSize: # table is worthwhile
300 header.write("extern unsigned char %s_tab[256];\n" % f)
301 header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
302
303 # write the constant data to the code file
304 output.write("unsigned char %s_tab[256] = {\n" % f)
305 pline = " "
306 for n in range(255):
307 pline += " 0x%02x," % Functs[f][0][n]
308 if len(pline) > 72:
309 output.write(pline + "\n")
310 pline = " "
311 output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])
312
313 else: # inline check is used
314 # first another little optimisation - if space is present,
315 # put it at the front of the list so it is checked first
316 try:
317 ix = rangeTable.remove((0x20, 0x20))
318 rangeTable.insert(0, (0x20, 0x20))
319 except:
320 pass
William M. Brack68aca052003-10-11 15:22:13 +0000321 firstFlag = 1
William M. Brackc4b81892003-10-12 10:42:46 +0000322 # okay, I'm tired of the messy lineup - let's automate it!
323 pline = "#define %s_ch(c)" % f
324 # 'ntab' is number of tabs needed to position to col. 33 from name end
325 ntab = 4 - (len(pline)) / 8
326 if ntab < 0:
327 ntab = 0
328 just = ""
329 for i in range(ntab):
330 just += "\t"
331 pline = pline + just + "("
William M. Brack68aca052003-10-11 15:22:13 +0000332 for rg in rangeTable:
333 if not firstFlag:
William M. Brackc4b81892003-10-12 10:42:46 +0000334 pline += " || \\\n\t\t\t\t "
William M. Brack68aca052003-10-11 15:22:13 +0000335 else:
336 firstFlag = 0
337 if rg[0] == rg[1]: # single value - check equal
William M. Brackc4b81892003-10-12 10:42:46 +0000338 pline += "((c) == 0x%x)" % rg[0]
339 else: # value range
340 pline += "((0x%x <= (c)) &&" % rg[0]
341 pline += " ((c) <= 0x%x))" % rg[1]
William M. Brack68aca052003-10-11 15:22:13 +0000342 pline += ")\n"
343 header.write(pline)
344
William M. Brackc4b81892003-10-12 10:42:46 +0000345 pline = "#define %s(c)" % f
346 ntab = 4 - (len(pline)) / 8
347 if ntab < 0:
348 ntab = 0
349 just = ""
350 for i in range(ntab):
351 just += "\t"
352 header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
William M. Brack68aca052003-10-11 15:22:13 +0000353 if max(Functs[f][0]) > 0:
354 header.write("%s_ch((c)) :" % f)
355 else:
356 header.write("0 :")
357
358 # if no ranges defined, value invalid if >= 0x100
William M. Brackc4b81892003-10-12 10:42:46 +0000359 numRanges = len(Functs[f][1])
360 if numRanges == 0:
William M. Brack68aca052003-10-11 15:22:13 +0000361 header.write(" 0)\n\n")
362 else:
William M. Brackc4b81892003-10-12 10:42:46 +0000363 if numRanges >= minTableSize:
364 header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n" % f)
365 else: # if < minTableSize, generate inline code
366 firstFlag = 1
367 for rg in Functs[f][1]:
368 if not firstFlag:
369 pline += " || \\\n\t\t\t\t "
370 else:
371 firstFlag = 0
372 pline = "\\\n\t\t\t\t("
373 if rg[0] == rg[1]: # single value - check equal
374 pline += "((c) == 0x%x)" % rg[0]
375 else: # value range
376 pline += "((0x%x <= (c)) &&" % rg[0]
377 pline += " ((c) <= 0x%x))" % rg[1]
378 pline += "))\n\n"
379 header.write(pline)
380
William M. Brack68aca052003-10-11 15:22:13 +0000381
382 if len(Functs[f][1]) > 0:
383 header.write("extern xmlChRangeGroup %sGroup;\n" % f)
384
385
386#
387# Next we do the unicode ranges
388#
389
390for f in fkeys:
391 if len(Functs[f][1]) > 0: # only generate if unicode ranges present
392 rangeTable = Functs[f][1]
393 rangeTable.sort() # ascending tuple sequence
394 numShort = 0
395 numLong = 0
396 for rg in rangeTable:
397 if rg[1] < 0x10000: # if short value
398 if numShort == 0: # first occurence
399 pline = "static xmlChSRange %s_srng[] = { " % f
400 else:
401 pline += ", "
402 numShort += 1
403 if len(pline) > 60:
404 output.write(pline + "\n")
405 pline = " "
406 pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
407 else: # if long value
408 if numLong == 0: # first occurence
409 if numShort > 0: # if there were shorts, finish them off
410 output.write(pline + "};\n")
411 pline = "static xmlChLRange %s_lrng[] = { " % f
412 else:
413 pline += ", "
414 numLong += 1
415 if len(pline) > 60:
416 output.write(pline + "\n")
417 pline = " "
418 pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
419 output.write(pline + "};\n") # finish off last group
420
William M. Brackc4b81892003-10-12 10:42:46 +0000421 pline = "xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
William M. Brack68aca052003-10-11 15:22:13 +0000422 if numShort > 0:
423 pline += "%s_srng" % f
William M. Brackc4b81892003-10-12 10:42:46 +0000424 else:
425 pline += "(xmlChSRangePtr)0"
William M. Brack68aca052003-10-11 15:22:13 +0000426 if numLong > 0:
427 pline += ", %s_lrng" % f
William M. Brackc4b81892003-10-12 10:42:46 +0000428 else:
429 pline += ", (xmlChLRangePtr)0"
William M. Brack68aca052003-10-11 15:22:13 +0000430
431 output.write(pline + "};\n\n")
432#
433# Run complete - write trailers and close the output files
434#
435
436header.write("""
437#ifdef __cplusplus
438}
439#endif
440#endif /* __XML_CHVALID_H__ */
441""");
442
443header.close()
444
445output.write(
446"""
447int
448xmlCharInRange (unsigned int val, xmlChRangeGroupPtr rptr) {
449 int low, high, mid;
450 xmlChSRangePtr sptr;
451 xmlChLRangePtr lptr;
452 if (val < 0x10000) { /* is val in 'short' or 'long' array? */
453 if (rptr->nbShortRange == 0)
454 return 0;
455 low = 0;
William M. Brackc4b81892003-10-12 10:42:46 +0000456 high = rptr->nbShortRange - 1;
William M. Brack68aca052003-10-11 15:22:13 +0000457 sptr = rptr->shortRange;
458 while (low <= high) {
459 mid = (low + high) / 2;
William M. Brackc4b81892003-10-12 10:42:46 +0000460 if ((unsigned short) val < sptr[mid].low) {
William M. Brack68aca052003-10-11 15:22:13 +0000461 high = mid - 1;
William M. Brackc4b81892003-10-12 10:42:46 +0000462 } else {
463 if ((unsigned short) val > sptr[mid].high) {
464 low = mid + 1;
465 } else {
466 return 1;
467 }
468 }
William M. Brack68aca052003-10-11 15:22:13 +0000469 }
470 } else {
William M. Brackc4b81892003-10-12 10:42:46 +0000471 if (rptr->nbLongRange == 0) {
William M. Brack68aca052003-10-11 15:22:13 +0000472 return 0;
William M. Brackc4b81892003-10-12 10:42:46 +0000473 }
William M. Brack68aca052003-10-11 15:22:13 +0000474 low = 0;
William M. Brackc4b81892003-10-12 10:42:46 +0000475 high = rptr->nbLongRange - 1;
William M. Brack68aca052003-10-11 15:22:13 +0000476 lptr = rptr->longRange;
477 while (low <= high) {
478 mid = (low + high) / 2;
William M. Brackc4b81892003-10-12 10:42:46 +0000479 if (val < lptr[mid].low) {
William M. Brack68aca052003-10-11 15:22:13 +0000480 high = mid - 1;
William M. Brackc4b81892003-10-12 10:42:46 +0000481 } else {
482 if (val > lptr[mid].high) {
483 low = mid + 1;
484 } else {
485 return 1;
486 }
487 }
William M. Brack68aca052003-10-11 15:22:13 +0000488 }
489 }
490 return 0;
491}
492
493""");
494
495output.close()