blob: 18c9a89a8ab4862db44367e95d70ffbac4ea9afb [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Property bits tested against the `flags` member of a type record.
   Each predicate in this file checks exactly one of these bits. */
#define ALPHA_MASK      (1 << 0)
#define DECIMAL_MASK    (1 << 1)
#define DIGIT_MASK      (1 << 2)
#define LOWER_MASK      (1 << 3)
#define LINEBREAK_MASK  (1 << 4)
#define SPACE_MASK      (1 << 5)
#define TITLE_MASK      (1 << 6)
#define UPPER_MASK      (1 << 7)
Jack Jansen56cdce32000-07-06 13:57:38 +000022
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000023typedef struct {
24 const unsigned short flags;
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30} _PyUnicode_TypeRecord;
31
32#include "unicodetype_db.h"
33
34static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000035gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000036{
37 int index;
38
Fredrik Lundhee13dba2001-06-26 20:36:12 +000039 if (code >= 65536)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000040 index = 0;
41 else {
42 index = index1[(code>>SHIFT)];
43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
44 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000045
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046 return &_PyUnicode_TypeRecords[index];
47}
Jack Jansen56cdce32000-07-06 13:57:38 +000048
Guido van Rossum603484d2000-03-10 22:52:46 +000049/* Returns 1 for Unicode characters having the category 'Zl' or type
50 'B', 0 otherwise. */
51
Fredrik Lundh72b06852001-06-27 22:08:26 +000052int _PyUnicode_IsLinebreak(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000053{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000054 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
55
56 return (ctype->flags & LINEBREAK_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000057}
58
59/* Returns the titlecase Unicode characters corresponding to ch or just
60 ch if no titlecase mapping is known. */
61
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000062Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000063{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000064 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
65
66 if (ctype->title)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000067 ch += ctype->title;
68 else
69 ch += ctype->upper;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000070
Fredrik Lundh8f455852001-06-27 18:59:43 +000071#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000072 /* The database assumes that the values wrap around at 0x10000. */
73 if (ch > 0x10000)
74 ch -= 0x10000;
75#endif
76 return ch;
Guido van Rossum603484d2000-03-10 22:52:46 +000077}
78
79/* Returns 1 for Unicode characters having the category 'Lt', 0
80 otherwise. */
81
Fredrik Lundh72b06852001-06-27 22:08:26 +000082int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000083{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000084 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
85
86 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000087}
88
89/* Returns the integer decimal (0-9) for Unicode characters having
90 this property, -1 otherwise. */
91
Fredrik Lundh72b06852001-06-27 22:08:26 +000092int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000093{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000094 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
95
96 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +000097}
98
Fredrik Lundh72b06852001-06-27 22:08:26 +000099int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000100{
101 if (_PyUnicode_ToDecimalDigit(ch) < 0)
102 return 0;
103 return 1;
104}
105
106/* Returns the integer digit (0-9) for Unicode characters having
107 this property, -1 otherwise. */
108
Fredrik Lundh72b06852001-06-27 22:08:26 +0000109int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000110{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000111 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
112
113 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000114}
115
Fredrik Lundh72b06852001-06-27 22:08:26 +0000116int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000117{
118 if (_PyUnicode_ToDigit(ch) < 0)
119 return 0;
120 return 1;
121}
122
123/* Returns the numeric value as double for Unicode characters having
124 this property, -1.0 otherwise. */
125
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000126/* TODO: replace with unicodetype_db.h table */
127
Fredrik Lundh72b06852001-06-27 22:08:26 +0000128double _PyUnicode_ToNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000129{
130 switch (ch) {
131 case 0x3007:
132 return (double) 0;
133 case 0x09F4:
134 case 0x215F:
135 case 0x2160:
136 case 0x2170:
137 case 0x3021:
138 case 0x3280:
139 return (double) 1;
140 case 0x00BD:
141 return (double) 1 / 2;
142 case 0x2153:
143 return (double) 1 / 3;
144 case 0x00BC:
145 return (double) 1 / 4;
146 case 0x2155:
147 return (double) 1 / 5;
148 case 0x2159:
149 return (double) 1 / 6;
150 case 0x215B:
151 return (double) 1 / 8;
152 case 0x0BF0:
153 case 0x1372:
154 case 0x2169:
155 case 0x2179:
156 case 0x2469:
157 case 0x247D:
158 case 0x2491:
159 case 0x277F:
160 case 0x2789:
161 case 0x2793:
162 case 0x3038:
163 case 0x3289:
164 return (double) 10;
165 case 0x0BF1:
166 case 0x137B:
167 case 0x216D:
168 case 0x217D:
169 return (double) 100;
170 case 0x0BF2:
171 case 0x216F:
172 case 0x217F:
173 case 0x2180:
174 return (double) 1000;
175 case 0x137C:
176 case 0x2182:
177 return (double) 10000;
178 case 0x216A:
179 case 0x217A:
180 case 0x246A:
181 case 0x247E:
182 case 0x2492:
183 return (double) 11;
184 case 0x216B:
185 case 0x217B:
186 case 0x246B:
187 case 0x247F:
188 case 0x2493:
189 return (double) 12;
190 case 0x246C:
191 case 0x2480:
192 case 0x2494:
193 return (double) 13;
194 case 0x246D:
195 case 0x2481:
196 case 0x2495:
197 return (double) 14;
198 case 0x246E:
199 case 0x2482:
200 case 0x2496:
201 return (double) 15;
202 case 0x09F9:
203 case 0x246F:
204 case 0x2483:
205 case 0x2497:
206 return (double) 16;
207 case 0x16EE:
208 case 0x2470:
209 case 0x2484:
210 case 0x2498:
211 return (double) 17;
212 case 0x16EF:
213 case 0x2471:
214 case 0x2485:
215 case 0x2499:
216 return (double) 18;
217 case 0x16F0:
218 case 0x2472:
219 case 0x2486:
220 case 0x249A:
221 return (double) 19;
222 case 0x09F5:
223 case 0x2161:
224 case 0x2171:
225 case 0x3022:
226 case 0x3281:
227 return (double) 2;
228 case 0x2154:
229 return (double) 2 / 3;
230 case 0x2156:
231 return (double) 2 / 5;
232 case 0x1373:
233 case 0x2473:
234 case 0x2487:
235 case 0x249B:
236 case 0x3039:
237 return (double) 20;
238 case 0x09F6:
239 case 0x2162:
240 case 0x2172:
241 case 0x3023:
242 case 0x3282:
243 return (double) 3;
244 case 0x00BE:
245 return (double) 3 / 4;
246 case 0x2157:
247 return (double) 3 / 5;
248 case 0x215C:
249 return (double) 3 / 8;
250 case 0x1374:
251 case 0x303A:
252 return (double) 30;
253 case 0x09F7:
254 case 0x2163:
255 case 0x2173:
256 case 0x3024:
257 case 0x3283:
258 return (double) 4;
259 case 0x2158:
260 return (double) 4 / 5;
261 case 0x1375:
262 return (double) 40;
263 case 0x2164:
264 case 0x2174:
265 case 0x3025:
266 case 0x3284:
267 return (double) 5;
268 case 0x215A:
269 return (double) 5 / 6;
270 case 0x215D:
271 return (double) 5 / 8;
272 case 0x1376:
273 case 0x216C:
274 case 0x217C:
275 return (double) 50;
276 case 0x216E:
277 case 0x217E:
278 return (double) 500;
279 case 0x2181:
280 return (double) 5000;
281 case 0x2165:
282 case 0x2175:
283 case 0x3026:
284 case 0x3285:
285 return (double) 6;
286 case 0x1377:
287 return (double) 60;
288 case 0x2166:
289 case 0x2176:
290 case 0x3027:
291 case 0x3286:
292 return (double) 7;
293 case 0x215E:
294 return (double) 7 / 8;
295 case 0x1378:
296 return (double) 70;
297 case 0x2167:
298 case 0x2177:
299 case 0x3028:
300 case 0x3287:
301 return (double) 8;
302 case 0x1379:
303 return (double) 80;
304 case 0x2168:
305 case 0x2178:
306 case 0x3029:
307 case 0x3288:
308 return (double) 9;
309 case 0x137A:
310 return (double) 90;
311 default:
312 return (double) _PyUnicode_ToDigit(ch);
313 }
314}
315
Fredrik Lundh72b06852001-06-27 22:08:26 +0000316int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000317{
318 if (_PyUnicode_ToNumeric(ch) < 0.0)
319 return 0;
320 return 1;
321}
322
323#ifndef WANT_WCTYPE_FUNCTIONS
324
Guido van Rossumdc742b32000-04-11 15:39:02 +0000325/* Returns 1 for Unicode characters having the bidirectional type
326 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000327
Fredrik Lundh72b06852001-06-27 22:08:26 +0000328int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000329{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000330 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
331
332 return (ctype->flags & SPACE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000333}
334
335/* Returns 1 for Unicode characters having the category 'Ll', 0
336 otherwise. */
337
Fredrik Lundh72b06852001-06-27 22:08:26 +0000338int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000339{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000340 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
341
342 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000343}
344
345/* Returns 1 for Unicode characters having the category 'Lu', 0
346 otherwise. */
347
Fredrik Lundh72b06852001-06-27 22:08:26 +0000348int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000349{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000350 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
351
352 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000353}
354
355/* Returns the uppercase Unicode characters corresponding to ch or just
356 ch if no uppercase mapping is known. */
357
Fredrik Lundh72b06852001-06-27 22:08:26 +0000358Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000359{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000360 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
361
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000362 ch += ctype->upper;
Fredrik Lundh8f455852001-06-27 18:59:43 +0000363#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000364 /* The database assumes that the values wrap around at 0x10000. */
365 if (ch > 0x10000)
366 ch -= 0x10000;
367#endif
368 return ch;
Guido van Rossum603484d2000-03-10 22:52:46 +0000369}
370
371/* Returns the lowercase Unicode characters corresponding to ch or just
372 ch if no lowercase mapping is known. */
373
Fredrik Lundh72b06852001-06-27 22:08:26 +0000374Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000375{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000376 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
377
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000378 ch += ctype->lower;
Fredrik Lundh8f455852001-06-27 18:59:43 +0000379#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000380 /* The database assumes that the values wrap around at 0x10000. */
381 if (ch > 0x10000)
382 ch -= 0x10000;
383#endif
384 return ch;
Guido van Rossum603484d2000-03-10 22:52:46 +0000385}
386
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000387/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
388 'Lo' or 'Lm', 0 otherwise. */
389
Fredrik Lundh72b06852001-06-27 22:08:26 +0000390int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000391{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000392 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000393
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000394 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000395}
396
Guido van Rossum603484d2000-03-10 22:52:46 +0000397#else
398
399/* Export the interfaces using the wchar_t type for portability
400 reasons: */
401
Fredrik Lundh72b06852001-06-27 22:08:26 +0000402int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000403{
404 return iswspace(ch);
405}
406
Fredrik Lundh72b06852001-06-27 22:08:26 +0000407int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000408{
409 return iswlower(ch);
410}
411
Fredrik Lundh72b06852001-06-27 22:08:26 +0000412int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000413{
414 return iswupper(ch);
415}
416
Fredrik Lundh72b06852001-06-27 22:08:26 +0000417Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000418{
419 return towlower(ch);
420}
421
Fredrik Lundh72b06852001-06-27 22:08:26 +0000422Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000423{
424 return towupper(ch);
425}
426
Fredrik Lundh72b06852001-06-27 22:08:26 +0000427int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000428{
429 return iswalpha(ch);
430}
431
Guido van Rossum603484d2000-03-10 22:52:46 +0000432#endif