blob: 3bc19b2d447f3470e4811c2d1fa3c3638c0956ff [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Property bits stored in the `flags` member of a _PyUnicode_TypeRecord.
   Every mask owns a distinct bit, so one record can carry any combination
   of them. */
#define ALPHA_MASK      0x01    /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK    0x02    /* record's `decimal` member is valid */
#define DIGIT_MASK      0x04    /* record's `digit` member is valid */
#define LOWER_MASK      0x08    /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK  0x10    /* tested by _PyUnicode_IsLinebreak */
#define SPACE_MASK      0x20    /* tested by _PyUnicode_IsWhitespace */
#define TITLE_MASK      0x40    /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK      0x80    /* tested by _PyUnicode_IsUppercase */
Jack Jansen56cdce32000-07-06 13:57:38 +000022
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000023typedef struct {
24 const unsigned short flags;
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30} _PyUnicode_TypeRecord;
31
32#include "unicodetype_db.h"
33
34static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000035gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000036{
37 int index;
38
Fredrik Lundhee13dba2001-06-26 20:36:12 +000039 if (code >= 65536)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000040 index = 0;
41 else {
42 index = index1[(code>>SHIFT)];
43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
44 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000045
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046 return &_PyUnicode_TypeRecords[index];
47}
Jack Jansen56cdce32000-07-06 13:57:38 +000048
Guido van Rossum603484d2000-03-10 22:52:46 +000049/* Returns 1 for Unicode characters having the category 'Zl' or type
50 'B', 0 otherwise. */
51
52int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
53{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000054 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
55
56 return (ctype->flags & LINEBREAK_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000057}
58
59/* Returns the titlecase Unicode characters corresponding to ch or just
60 ch if no titlecase mapping is known. */
61
62Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
63{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000064 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
65
66 if (ctype->title)
67 return ch + ctype->title;
68
69 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +000070}
71
72/* Returns 1 for Unicode characters having the category 'Lt', 0
73 otherwise. */
74
75int _PyUnicode_IsTitlecase(register const Py_UNICODE ch)
76{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000077 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
78
79 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000080}
81
82/* Returns the integer decimal (0-9) for Unicode characters having
83 this property, -1 otherwise. */
84
85int _PyUnicode_ToDecimalDigit(register const Py_UNICODE ch)
86{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000087 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
88
89 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +000090}
91
92int _PyUnicode_IsDecimalDigit(register const Py_UNICODE ch)
93{
94 if (_PyUnicode_ToDecimalDigit(ch) < 0)
95 return 0;
96 return 1;
97}
98
99/* Returns the integer digit (0-9) for Unicode characters having
100 this property, -1 otherwise. */
101
102int _PyUnicode_ToDigit(register const Py_UNICODE ch)
103{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000104 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
105
106 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000107}
108
109int _PyUnicode_IsDigit(register const Py_UNICODE ch)
110{
111 if (_PyUnicode_ToDigit(ch) < 0)
112 return 0;
113 return 1;
114}
115
116/* Returns the numeric value as double for Unicode characters having
117 this property, -1.0 otherwise. */
118
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000119/* TODO: replace with unicodetype_db.h table */
120
Guido van Rossum603484d2000-03-10 22:52:46 +0000121double _PyUnicode_ToNumeric(register const Py_UNICODE ch)
122{
123 switch (ch) {
124 case 0x3007:
125 return (double) 0;
126 case 0x09F4:
127 case 0x215F:
128 case 0x2160:
129 case 0x2170:
130 case 0x3021:
131 case 0x3280:
132 return (double) 1;
133 case 0x00BD:
134 return (double) 1 / 2;
135 case 0x2153:
136 return (double) 1 / 3;
137 case 0x00BC:
138 return (double) 1 / 4;
139 case 0x2155:
140 return (double) 1 / 5;
141 case 0x2159:
142 return (double) 1 / 6;
143 case 0x215B:
144 return (double) 1 / 8;
145 case 0x0BF0:
146 case 0x1372:
147 case 0x2169:
148 case 0x2179:
149 case 0x2469:
150 case 0x247D:
151 case 0x2491:
152 case 0x277F:
153 case 0x2789:
154 case 0x2793:
155 case 0x3038:
156 case 0x3289:
157 return (double) 10;
158 case 0x0BF1:
159 case 0x137B:
160 case 0x216D:
161 case 0x217D:
162 return (double) 100;
163 case 0x0BF2:
164 case 0x216F:
165 case 0x217F:
166 case 0x2180:
167 return (double) 1000;
168 case 0x137C:
169 case 0x2182:
170 return (double) 10000;
171 case 0x216A:
172 case 0x217A:
173 case 0x246A:
174 case 0x247E:
175 case 0x2492:
176 return (double) 11;
177 case 0x216B:
178 case 0x217B:
179 case 0x246B:
180 case 0x247F:
181 case 0x2493:
182 return (double) 12;
183 case 0x246C:
184 case 0x2480:
185 case 0x2494:
186 return (double) 13;
187 case 0x246D:
188 case 0x2481:
189 case 0x2495:
190 return (double) 14;
191 case 0x246E:
192 case 0x2482:
193 case 0x2496:
194 return (double) 15;
195 case 0x09F9:
196 case 0x246F:
197 case 0x2483:
198 case 0x2497:
199 return (double) 16;
200 case 0x16EE:
201 case 0x2470:
202 case 0x2484:
203 case 0x2498:
204 return (double) 17;
205 case 0x16EF:
206 case 0x2471:
207 case 0x2485:
208 case 0x2499:
209 return (double) 18;
210 case 0x16F0:
211 case 0x2472:
212 case 0x2486:
213 case 0x249A:
214 return (double) 19;
215 case 0x09F5:
216 case 0x2161:
217 case 0x2171:
218 case 0x3022:
219 case 0x3281:
220 return (double) 2;
221 case 0x2154:
222 return (double) 2 / 3;
223 case 0x2156:
224 return (double) 2 / 5;
225 case 0x1373:
226 case 0x2473:
227 case 0x2487:
228 case 0x249B:
229 case 0x3039:
230 return (double) 20;
231 case 0x09F6:
232 case 0x2162:
233 case 0x2172:
234 case 0x3023:
235 case 0x3282:
236 return (double) 3;
237 case 0x00BE:
238 return (double) 3 / 4;
239 case 0x2157:
240 return (double) 3 / 5;
241 case 0x215C:
242 return (double) 3 / 8;
243 case 0x1374:
244 case 0x303A:
245 return (double) 30;
246 case 0x09F7:
247 case 0x2163:
248 case 0x2173:
249 case 0x3024:
250 case 0x3283:
251 return (double) 4;
252 case 0x2158:
253 return (double) 4 / 5;
254 case 0x1375:
255 return (double) 40;
256 case 0x2164:
257 case 0x2174:
258 case 0x3025:
259 case 0x3284:
260 return (double) 5;
261 case 0x215A:
262 return (double) 5 / 6;
263 case 0x215D:
264 return (double) 5 / 8;
265 case 0x1376:
266 case 0x216C:
267 case 0x217C:
268 return (double) 50;
269 case 0x216E:
270 case 0x217E:
271 return (double) 500;
272 case 0x2181:
273 return (double) 5000;
274 case 0x2165:
275 case 0x2175:
276 case 0x3026:
277 case 0x3285:
278 return (double) 6;
279 case 0x1377:
280 return (double) 60;
281 case 0x2166:
282 case 0x2176:
283 case 0x3027:
284 case 0x3286:
285 return (double) 7;
286 case 0x215E:
287 return (double) 7 / 8;
288 case 0x1378:
289 return (double) 70;
290 case 0x2167:
291 case 0x2177:
292 case 0x3028:
293 case 0x3287:
294 return (double) 8;
295 case 0x1379:
296 return (double) 80;
297 case 0x2168:
298 case 0x2178:
299 case 0x3029:
300 case 0x3288:
301 return (double) 9;
302 case 0x137A:
303 return (double) 90;
304 default:
305 return (double) _PyUnicode_ToDigit(ch);
306 }
307}
308
309int _PyUnicode_IsNumeric(register const Py_UNICODE ch)
310{
311 if (_PyUnicode_ToNumeric(ch) < 0.0)
312 return 0;
313 return 1;
314}
315
316#ifndef WANT_WCTYPE_FUNCTIONS
317
Guido van Rossumdc742b32000-04-11 15:39:02 +0000318/* Returns 1 for Unicode characters having the bidirectional type
319 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000320
321int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
322{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000323 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
324
325 return (ctype->flags & SPACE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000326}
327
328/* Returns 1 for Unicode characters having the category 'Ll', 0
329 otherwise. */
330
331int _PyUnicode_IsLowercase(register const Py_UNICODE ch)
332{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000333 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
334
335 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000336}
337
338/* Returns 1 for Unicode characters having the category 'Lu', 0
339 otherwise. */
340
341int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
342{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000343 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
344
345 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000346}
347
348/* Returns the uppercase Unicode characters corresponding to ch or just
349 ch if no uppercase mapping is known. */
350
351Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
352{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000353 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
354
355 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +0000356}
357
358/* Returns the lowercase Unicode characters corresponding to ch or just
359 ch if no lowercase mapping is known. */
360
361Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
362{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000363 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
364
365 return ch + ctype->lower;
Guido van Rossum603484d2000-03-10 22:52:46 +0000366}
367
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000368/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
369 'Lo' or 'Lm', 0 otherwise. */
370
371int _PyUnicode_IsAlpha(register const Py_UNICODE ch)
372{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000373 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000374
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000375 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000376}
377
Guido van Rossum603484d2000-03-10 22:52:46 +0000378#else
379
/* WANT_WCTYPE_FUNCTIONS is defined: export the same interfaces, but
   implement them with the C library's wchar_t classification and
   conversion functions for portability reasons: */
382
383int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
384{
385 return iswspace(ch);
386}
387
388int _PyUnicode_IsLowercase(register const Py_UNICODE ch)
389{
390 return iswlower(ch);
391}
392
393int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
394{
395 return iswupper(ch);
396}
397
398Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
399{
400 return towlower(ch);
401}
402
403Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
404{
405 return towupper(ch);
406}
407
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000408int _PyUnicode_IsAlpha(register const Py_UNICODE ch)
409{
410 return iswalpha(ch);
411}
412
Guido van Rossum603484d2000-03-10 22:52:46 +0000413#endif