blob: 6e6f5eb477fcc7640b6ca54b2ea4d4afd5824978 [file] [log] [blame]
Tor Norbye3a2425a2013-11-04 10:16:08 -08001from bisect import bisect_left
2import operator
3import java.lang.Character
4
5# XXX - this is intended as a stopgap measure until 2.5.1, which will have a Java implementation
6# requires java 6 for `normalize` function
7# only has one version of the database
# does not normalize ideographs
9
10_codepoints = {}
11_eaw = {}
12_names = {}
13_segments = []
14_eaw_segments = []
15Nonesuch = object()
16
def get_int(col):
    """Parse a data-file column as a base-10 integer.

    Returns the integer, or None when the text is not a valid integer
    (for example an empty column).
    """
    try:
        value = int(col)
    except ValueError:
        value = None
    return value
22
def get_yn(col):
    """Map a flag column to an integer: 1 for exactly 'Y', otherwise 0."""
    return 1 if col == 'Y' else 0
26
def get_numeric(col):
    """Parse a numeric-value column into a float.

    Accepts either a plain number ("12", "0.5") or a simple fraction
    written as "numerator/denominator" ("1/4").

    Returns the float value, or None when the column is empty or cannot
    be read as a number or fraction (including a zero denominator).
    """
    try:
        return float(col)
    except ValueError:
        pass

    # Not a plain number: try the "a/b" form.
    try:
        numerator, denominator = col.split('/')
        return float(numerator) / float(denominator)
    except (ValueError, ZeroDivisionError):
        # ValueError covers both a wrong number of '/' separated parts and
        # a non-numeric part. Anything else (e.g. KeyboardInterrupt) is
        # deliberately left to propagate.
        return None
36
def init_unicodedata(data):
    """Fill the codepoint, name and range tables from UnicodeData.txt rows.

    `data` is an iterable of ';'-separated rows. A row whose name contains
    'First' opens a codepoint range and the following row whose name
    contains 'Last' closes it; the pair is recorded once in _segments.
    Every other row is stored individually in _codepoints and made
    searchable by name through _names.
    """
    for line in data:
        fields = line.split(';')
        codepoint = int(fields[0], 16)
        name = fields[1]

        # The CJK range is reported under its generic name; every other
        # name (including the other range markers) is kept as it appears.
        lookup_name = name
        if name == '<CJK Ideograph, Last>':
            lookup_name = 'CJK UNIFIED IDEOGRAPH'

        # Positions 0-7 line up with the module's *_getter accessors;
        # position 8 is the name used by name().
        record = (
            fields[2],
            get_int(fields[3]),
            fields[4],
            fields[5],
            get_int(fields[6]),
            get_int(fields[7]),
            get_numeric(fields[8]),
            get_yn(fields[9]),
            lookup_name,
        )

        if 'First' in name:
            # Remember where the range begins; the 'Last' row completes it.
            start = codepoint
        elif 'Last' in name:
            _segments.append((start, (start, codepoint), record))
        else:
            _names[name] = unichr(codepoint)
            _codepoints[codepoint] = record
65
def init_east_asian_width(data):
    """Fill the East Asian Width tables from EastAsianWidth.txt rows.

    `data` is an iterable of text rows. Rows starting with '#', trailing
    '#' comments and rows without a ';' separator are ignored. A row of the
    form "XXXX;W" stores width W for one codepoint in _eaw; a row of the
    form "XXXX..YYYY;W" appends the inclusive range to _eaw_segments.
    Whitespace around either field is ignored.
    """
    for row in data:
        if row.startswith('#'):
            continue
        fields = row.partition('#')[0].split(';')
        if len(fields) < 2:
            continue

        bounds = fields[0].split('..')
        # strip(), not rstrip(): a row padded as "XXXX ; W" would otherwise
        # store the width with a leading space. int() already tolerates
        # padding around the codepoint field.
        width = fields[1].strip()

        first = int(bounds[0], 16)
        if len(bounds) == 1:
            _eaw[first] = width
        else:
            last = int(bounds[1], 16)
            _eaw_segments.append((first, (first, last), width))
83
84# xxx - need to normalize the segments, so
85# <CJK Ideograph, Last> ==> CJK UNIFIED IDEOGRAPH;
86# may need to do some sort of analysis against CPython for the normalization!
87
def name(unichr, default=None):
    """Return the name recorded for the character `unichr`.

    An individually listed codepoint returns its stored name. A codepoint
    found in a range returns "<range name>-<codepoint in uppercase hex>".
    When nothing is known about the codepoint, `default` is returned;
    ValueError is raised only if the caller passes the Nonesuch sentinel
    as `default`.
    """
    codepoint = get_codepoint(unichr, "name")

    record = _codepoints.get(codepoint, None)
    if record is not None:
        return record[8]

    record = check_segments(codepoint, _segments)
    if record is not None:
        return "%s-%X" % (record[8], codepoint)

    if default is Nonesuch:
        raise ValueError()
    return default
101
102# xxx - also need to add logic here so that if it's CJK UNIFIED
103# IDEOGRAPH-8000, we go against the segment to verify the prefix
104
def lookup(name):
    """Return the character registered under `name`.

    Only names stored in _names are known; any other name raises KeyError.
    """
    character = _names[name]
    return character
107
def check_segments(codepoint, segments):
    """Return the data of the range in `segments` that contains `codepoint`.

    `segments` is a list of (start, (start, end), data) tuples with
    inclusive bounds. The binary search relies on the list being sorted by
    `start` (presumably guaranteed by the order of the source data files;
    verify if the loaders change).

    Returns the matching segment's data, or None when no range covers the
    codepoint.
    """
    # (codepoint + 1,) sorts after every segment starting at or below
    # `codepoint` and before any segment starting above it, so `i` is the
    # number of segments whose start is <= codepoint.
    i = bisect_left(segments, (codepoint + 1,))
    if i == 0:
        # Every range starts above the codepoint (or there are no ranges).
        # Without this guard, index -1 would wrongly select the last range.
        return None

    segment = segments[i - 1]
    if codepoint <= segment[1][1]:
        return segment[2]
    return None
115
116
def get_codepoint(unichr, fn=None):
    """Check that `unichr` is one unicode character and return its codepoint.

    `fn` is the name of the public function on whose behalf the check is
    made; it is used only in the error message.

    Raises TypeError if `unichr` is not a unicode object, or if it is not
    exactly one character long.
    """
    if not isinstance(unichr, unicode):
        # Use the type's name: concatenating the type object itself onto a
        # string fails before the intended message can be produced.
        raise TypeError("%s() argument 1 must be unicode, not %s"
                        % (fn, type(unichr).__name__))
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")
    return ord(unichr)
123
def get_eaw(unichr, default, fn):
    """Return the East Asian Width recorded for the character `unichr`.

    The single-codepoint table is consulted first, then the range table.
    `fn` names the calling function for get_codepoint's error message.
    When no width is recorded, `default` is returned, unless it is the
    Nonesuch sentinel, in which case ValueError is raised.
    """
    codepoint = get_codepoint(unichr, fn)

    width = _eaw.get(codepoint, None)
    if width is None:
        width = check_segments(codepoint, _eaw_segments)

    if width is not None:
        return width
    if default is Nonesuch:
        raise ValueError()
    return default
135
def get(unichr, default, fn, getter):
    """Return one property of the character `unichr`.

    The record is taken from _codepoints, falling back to the ranges in
    _segments, and `getter` extracts the wanted field from it. `fn` names
    the calling function for get_codepoint's error message.

    When there is no record, or the extracted field is None, `default` is
    returned, unless it is the Nonesuch sentinel, in which case ValueError
    is raised.
    """
    codepoint = get_codepoint(unichr, fn)

    record = _codepoints.get(codepoint, None)
    if record is None:
        record = check_segments(codepoint, _segments)

    value = None
    if record is not None:
        value = getter(record)

    if value is not None:
        return value
    if default is Nonesuch:
        raise ValueError()
    return default
152
# Field accessors for the record tuples built by init_unicodedata; each
# accessor's index is the position of that property within the record.
(category_getter,
 combining_getter,
 bidirectional_getter,
 decomposition_getter,
 decimal_getter,
 digit_getter,
 numeric_getter,
 mirrored_getter) = map(operator.itemgetter, range(8))
161
def decimal(unichr, default=Nonesuch):
    """Return the decimal value recorded for the character `unichr`.

    When no value is recorded, `default` is returned if one was given;
    otherwise ValueError is raised.
    """
    return get(unichr, default, fn='decimal', getter=decimal_getter)
164
def decomposition(unichr, default=''):
    """Return the decomposition field recorded for the character `unichr`.

    `default` (an empty string unless overridden) is returned when the
    character has no record.
    """
    return get(unichr, default, fn='decomposition', getter=decomposition_getter)
167
def digit(unichr, default=Nonesuch):
    """Return the digit value recorded for the character `unichr`.

    When no value is recorded, `default` is returned if one was given;
    otherwise ValueError is raised.
    """
    return get(unichr, default, fn='digit', getter=digit_getter)
170
def numeric(unichr, default=Nonesuch):
    """Return the numeric value (a float) recorded for the character `unichr`.

    When no value is recorded, `default` is returned if one was given;
    otherwise ValueError is raised.
    """
    return get(unichr, default, fn='numeric', getter=numeric_getter)
173
def category(unichr):
    """Return the general category recorded for the character `unichr`.

    Characters without a record are reported as 'Cn'.
    """
    # The third argument is the function name shown in get_codepoint's
    # TypeError message, so it must be spelled like this function.
    return get(unichr, 'Cn', 'category', category_getter)
176
def bidirectional(unichr):
    """Return the bidirectional class recorded for the character `unichr`.

    Characters without a record yield an empty string.
    """
    return get(unichr, '', fn='bidirectional', getter=bidirectional_getter)
179
def combining(unichr):
    """Return the combining class recorded for the character `unichr`.

    Characters without a record (or without a parsable value) yield 0.
    """
    return get(unichr, 0, fn='combining', getter=combining_getter)
182
def mirrored(unichr):
    """Return the mirrored flag (1 or 0) recorded for the character `unichr`.

    Characters without a record yield 0.
    """
    return get(unichr, 0, fn='mirrored', getter=mirrored_getter)
185
def east_asian_width(unichr):
    """Return the East Asian Width recorded for the character `unichr`.

    Characters without a recorded width yield 'N'.
    """
    return get_eaw(unichr, default='N', fn='east_asian_width')
188
def jymirrored(unichr):
    """Return java.lang.Character.isMirrored() for the codepoint of `unichr`."""
    codepoint = get_codepoint(unichr, 'mirrored')
    return java.lang.Character.isMirrored(codepoint)
191
try:
    from java.text import Normalizer

    # Normalization form name -> java.text.Normalizer.Form constant.
    _forms = dict(
        NFC=Normalizer.Form.NFC,
        NFKC=Normalizer.Form.NFKC,
        NFD=Normalizer.Form.NFD,
        NFKD=Normalizer.Form.NFKD,
    )

    def normalize(form, unistr):
        """
        Normalize the Unicode string unistr to the requested form.

        form must be one of 'NFC', 'NFKC', 'NFD' or 'NFKD'; any other
        value raises ValueError. The work is delegated to
        java.text.Normalizer.
        """
        if form not in _forms:
            raise ValueError('invalid normalization form')
        return Normalizer.normalize(unistr, _forms[form])

except ImportError:
    # Without java.text.Normalizer, normalize() is simply left undefined.
    pass
216
217
def init():
    """Load the bundled Unicode data files into the module's lookup tables.

    UnicodeData.txt and EastAsianWidth.txt are read from the directory
    containing this module, through the loader that pkgutil reports for
    'unicodedata', and handed to init_unicodedata and
    init_east_asian_width respectively.
    """
    import pkgutil
    import os.path
    import StringIO

    my_path = os.path.dirname(__file__)
    loader = pkgutil.get_loader('unicodedata')

    def open_data(filename):
        # Wrap the raw file contents so the parsers can iterate row by row.
        return StringIO.StringIO(loader.get_data(os.path.join(my_path, filename)))

    init_unicodedata(open_data('UnicodeData.txt'))
    init_east_asian_width(open_data('EastAsianWidth.txt'))


# Populate the tables once, when the module is imported.
init()
229init()