blob: c5668fdcf3f711c0e0cc109cb14eb83f3bb2a2c2 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001#!/usr/bin/python -u
2import sys
3import string
4import time
5
# Names of the input data files; this string is copied into the banner of
# both generated C files.
sources = "Blocks-4.txt UnicodeData-3.1.0.txt"

try:
    blocks = open("Blocks-4.txt", "r")
except IOError:
    print("Missing Blocks-4.txt, aborting ...")
    sys.exit(1)

# BlockNames maps a block name (with spaces removed) to a pair of strings
# ("0x<start>", "0x<end>"). The bounds stay textual because they are pasted
# verbatim into the generated C comparisons.
BlockNames = {}
try:
    for line in blocks:
        # Comment lines are recognised only when '#' is the very first
        # character; blank lines are dropped after stripping.
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            # Expected shape: "<start>..<end>; <Block Name>"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # IndexError: no ';' separated name field.
            # ValueError: the range is not exactly "<start>..<end>".
            print("Failed to process line: %s" % (line))
            continue
        BlockNames[name] = ("0x" + start, "0x" + end)
finally:
    # Release the handle even if an unexpected error escapes the loop.
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
33
try:
    data = open("UnicodeData-3.1.0.txt", "r")
except IOError:
    print("Missing UnicodeData-3.1.0.txt, aborting ...")
    sys.exit(1)

# nbchar counts the accepted records. Categories maps a category name to the
# list of code points (ints) carrying it, in file order. Each code point is
# filed twice: under the full category string and under its first letter.
nbchar = 0
Categories = {}
try:
    for line in data:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            # Field 0 is the code point in hexadecimal, field 2 the category.
            fields = line.split(';')
            value = int(fields[0].strip(), 16)
            name = fields[2]
        except (IndexError, ValueError):
            # IndexError: fewer than three fields.
            # ValueError: the code point is empty or not valid hexadecimal.
            print("Failed to process line: %s" % (line))
            continue
        if not name:
            # An empty category would later yield a C function named
            # xmlUCSIsCat, clashing with the generated dispatcher.
            print("Failed to process line: %s" % (line))
            continue

        # NOTE(review): field 1 is never inspected, so any record that stands
        # for a whole range (e.g. "<..., First>" / "<..., Last>" markers) is
        # counted as a single code point -- confirm this is intended.
        nbchar = nbchar + 1
        Categories.setdefault(name, []).append(value)
        Categories.setdefault(name[0], []).append(value)
finally:
    # Close the data file (the handle for Blocks-4.txt is already closed).
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
def _collapse_runs(values):
    """Fold a sequence of code points into inclusive (first, last) pairs.

    A value extends the current run only when it is exactly one more than
    the value seen just before it; anything else closes the run and opens a
    new one. An isolated value becomes the pair (v, v). The run still open
    when the input ends is always emitted, so an empty input produces the
    single pair (-1, -1).
    """
    spans = []
    first = -1
    last = -1
    for code in values:
        if first == -1:
            # Nothing open yet: this value starts the first run.
            first = code
        elif code != last + 1:
            # Gap detected: flush the open run and start another one.
            spans.append((first, last))
            first = code
        last = code
    spans.append((first, last))
    return spans


# Replace each category's flat list of code points by its list of ranges.
for cat in Categories.keys():
    Categories[cat] = _collapse_runs(Categories[cat])
114
#
# Generate the resulting files
#
# header receives the C prototypes (xmlunicode.h) and output receives the
# implementations (xmlunicode.c). Both are created, or truncated, in the
# current directory.
try:
    header = open("xmlunicode.h", "w")
except IOError:
    print("Failed to open xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    # Do not leave the first file open when the second cannot be created.
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Human-readable local timestamp, embedded in the banner of both files.
date = time.asctime(time.localtime(time.time()))
131
# Comment banner common to both generated files. Placeholders, in order:
# the one-line file description, the generation date, the source file names.
banner = """/*
 * %s
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

"""

# Include guard and C++ linkage opener that follow the banner in the header.
header_prologue = """#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#ifdef __cplusplus
extern "C" {
#endif

"""

# Includes and feature guard that follow the banner in the C source.
source_prologue = """#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>

"""

header_title = ("xmlunicode.h: this header exports interfaces "
                "for the Unicode character APIs")
source_title = "xmlunicode.c: this module implements the Unicode character APIs"

header.write(banner % (header_title, date, sources) + header_prologue)
output.write(banner % (source_title, date, sources) + source_prologue)
178
# For every block, in sorted name order, emit a prototype into the header
# and a documented range-check function into the C source.
for block in sorted(BlockNames.keys()):
    (first, last) = BlockNames[block]
    # The C function name is the block name with every '-' removed.
    ident = block.replace('-', '')
    header.write("int\txmlUCSIs%s\t(int code);\n" % ident)
    output.write(
        "/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (ident)
        + " *\n * Check whether the character is part of %s UCS Block\n" % (block)
        + " *\n * Returns 1 if true 0 otherwise\n */\n"
        + "int\nxmlUCSIs%s(int code) {\n" % ident
        + " return((code >= %s) && (code <= %s));\n" % (first, last)
        + "}\n\n")
192
# xmlUCSIsBlock(): looks a block up by name with a chain of strcmp() tests
# and forwards to the matching predicate; the generated code returns -1
# when no name matches.
header.write("\nint\txmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")

pieces = [
    "/**\n * xmlUCSIsBlock:\n * @code: UCS code point\n",
    " * @block: UCS block name\n",
    " *\n * Check whether the caracter is part of the UCS Block\n",
    " *\n * Returns 1 if true, 0 if false and -1 on unknown block\n */\n",
    "int\nxmlUCSIsBlock(int code, const char *block) {\n",
]
for block in sorted(BlockNames.keys()):
    # The string compared against keeps its '-', the function name does not.
    pieces.append(
        " if (!strcmp(block, \"%s\"))\n return(xmlUCSIs%s(code));\n" %
        (block, block.replace('-', '')))
pieces.append(" return(-1);\n}\n\n")
output.write("".join(pieces))
206
207
# For every category, in sorted name order, emit a prototype plus a function
# whose return expression ORs together one test per (begin, end) range.
for name in sorted(Categories.keys()):
    header.write("int\txmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)

    tests = []
    for (low, high) in Categories[name]:
        if low != high:
            tests.append("((code >= %s) && (code <= %s))" % (
                hex(low), hex(high)))
        else:
            # A single code point only needs an equality test.
            tests.append("(code == %s)" % (hex(low)))
    if tests:
        # The opening "return(" is only written when there is something to
        # test; the closing text below is written unconditionally.
        output.write(" return(" + " ||\n ".join(tests))
    output.write(");\n}\n\n")
232
# xmlUCSIsCat(): same name-based dispatch as for blocks, keyed on the
# category name; the generated code returns -1 when no name matches.
header.write("\nint\txmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")

output.write(
    "/**\n * xmlUCSIsCat:\n * @code: UCS code point\n"
    + " * @cat: UCS Category name\n"
    + " *\n * Check whether the caracter is part of the UCS Category\n"
    + " *\n * Returns 1 if true, 0 if false and -1 on unknown category\n */\n"
    + "int\nxmlUCSIsCat(int code, const char *cat) {\n")
# One strcmp() branch per category, in sorted order.
output.writelines([
    " if (!strcmp(cat, \"%s\"))\n return(xmlUCSIsCat%s(code));\n" % (cat, cat)
    for cat in sorted(Categories.keys())])
output.write(" return(-1);\n}\n\n")
245
# Trailer of the header: ends the C++ linkage block and the include guard.
header_epilogue = """
#ifdef __cplusplus
}
#endif
#endif /* __XML_UNICODE_H__ */
"""

# Trailer of the C source: ends the LIBXML_UNICODE_ENABLED conditional.
source_epilogue = """
#endif /* LIBXML_UNICODE_ENABLED */
"""

# Finish each generated file and release its handle.
for (stream, epilogue) in ((header, header_epilogue), (output, source_epilogue)):
    stream.write(epilogue)
    stream.close()