blob: 7a330d9052fe313b40a7a8b8350ebf03a910576e [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags stored in the `flags` member of a _PyUnicode_TypeRecord.
   Each one is tested by the helper named alongside it. */
#define ALPHA_MASK      0x01    /* _PyUnicode_IsAlpha */
#define DECIMAL_MASK    0x02    /* _PyUnicode_ToDecimalDigit: `decimal` is valid */
#define DIGIT_MASK      0x04    /* _PyUnicode_ToDigit: `digit` is valid */
#define LOWER_MASK      0x08    /* _PyUnicode_IsLowercase */
#define LINEBREAK_MASK  0x10    /* _PyUnicode_IsLinebreak */
#define SPACE_MASK      0x20    /* _PyUnicode_IsWhitespace */
#define TITLE_MASK      0x40    /* _PyUnicode_IsTitlecase */
#define UPPER_MASK      0x80    /* _PyUnicode_IsUppercase */
Jack Jansen56cdce32000-07-06 13:57:38 +000022
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000023typedef struct {
24 const unsigned short flags;
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30} _PyUnicode_TypeRecord;
31
32#include "unicodetype_db.h"
33
34static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000035gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000036{
37 int index;
38
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000039#ifdef Py_UNICODE_WIDE
Martin v. Löwis9def6a32002-10-18 16:11:54 +000040 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000041 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000042 else
43#endif
44 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000045 index = index1[(code>>SHIFT)];
46 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
47 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000048
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000049 return &_PyUnicode_TypeRecords[index];
50}
Jack Jansen56cdce32000-07-06 13:57:38 +000051
Guido van Rossum603484d2000-03-10 22:52:46 +000052/* Returns 1 for Unicode characters having the category 'Zl' or type
53 'B', 0 otherwise. */
54
Fredrik Lundh72b06852001-06-27 22:08:26 +000055int _PyUnicode_IsLinebreak(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000056{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000057 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
58
59 return (ctype->flags & LINEBREAK_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000060}
61
62/* Returns the titlecase Unicode characters corresponding to ch or just
63 ch if no titlecase mapping is known. */
64
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000065Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000066{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000067 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +000068 int delta;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000069
70 if (ctype->title)
Martin v. Löwisedf368c2002-10-18 16:40:36 +000071 delta = ctype->title;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000072 else
Martin v. Löwisedf368c2002-10-18 16:40:36 +000073 delta = ctype->upper;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000074
Martin v. Löwisedf368c2002-10-18 16:40:36 +000075 if (delta >= 32768)
76 delta -= 65536;
77
78 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000079}
80
81/* Returns 1 for Unicode characters having the category 'Lt', 0
82 otherwise. */
83
Fredrik Lundh72b06852001-06-27 22:08:26 +000084int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000085{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000086 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000089}
90
91/* Returns the integer decimal (0-9) for Unicode characters having
92 this property, -1 otherwise. */
93
Fredrik Lundh72b06852001-06-27 22:08:26 +000094int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000095{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000096 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97
98 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +000099}
100
Fredrik Lundh72b06852001-06-27 22:08:26 +0000101int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000102{
103 if (_PyUnicode_ToDecimalDigit(ch) < 0)
104 return 0;
105 return 1;
106}
107
108/* Returns the integer digit (0-9) for Unicode characters having
109 this property, -1 otherwise. */
110
Fredrik Lundh72b06852001-06-27 22:08:26 +0000111int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000112{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000113 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
114
115 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000116}
117
Fredrik Lundh72b06852001-06-27 22:08:26 +0000118int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000119{
120 if (_PyUnicode_ToDigit(ch) < 0)
121 return 0;
122 return 1;
123}
124
125/* Returns the numeric value as double for Unicode characters having
126 this property, -1.0 otherwise. */
127
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000128/* TODO: replace with unicodetype_db.h table */
129
Fredrik Lundh72b06852001-06-27 22:08:26 +0000130double _PyUnicode_ToNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000131{
132 switch (ch) {
133 case 0x3007:
134 return (double) 0;
135 case 0x09F4:
136 case 0x215F:
137 case 0x2160:
138 case 0x2170:
139 case 0x3021:
140 case 0x3280:
141 return (double) 1;
142 case 0x00BD:
143 return (double) 1 / 2;
144 case 0x2153:
145 return (double) 1 / 3;
146 case 0x00BC:
147 return (double) 1 / 4;
148 case 0x2155:
149 return (double) 1 / 5;
150 case 0x2159:
151 return (double) 1 / 6;
152 case 0x215B:
153 return (double) 1 / 8;
154 case 0x0BF0:
155 case 0x1372:
156 case 0x2169:
157 case 0x2179:
158 case 0x2469:
159 case 0x247D:
160 case 0x2491:
161 case 0x277F:
162 case 0x2789:
163 case 0x2793:
164 case 0x3038:
165 case 0x3289:
166 return (double) 10;
167 case 0x0BF1:
168 case 0x137B:
169 case 0x216D:
170 case 0x217D:
171 return (double) 100;
172 case 0x0BF2:
173 case 0x216F:
174 case 0x217F:
175 case 0x2180:
176 return (double) 1000;
177 case 0x137C:
178 case 0x2182:
179 return (double) 10000;
180 case 0x216A:
181 case 0x217A:
182 case 0x246A:
183 case 0x247E:
184 case 0x2492:
185 return (double) 11;
186 case 0x216B:
187 case 0x217B:
188 case 0x246B:
189 case 0x247F:
190 case 0x2493:
191 return (double) 12;
192 case 0x246C:
193 case 0x2480:
194 case 0x2494:
195 return (double) 13;
196 case 0x246D:
197 case 0x2481:
198 case 0x2495:
199 return (double) 14;
200 case 0x246E:
201 case 0x2482:
202 case 0x2496:
203 return (double) 15;
204 case 0x09F9:
205 case 0x246F:
206 case 0x2483:
207 case 0x2497:
208 return (double) 16;
209 case 0x16EE:
210 case 0x2470:
211 case 0x2484:
212 case 0x2498:
213 return (double) 17;
214 case 0x16EF:
215 case 0x2471:
216 case 0x2485:
217 case 0x2499:
218 return (double) 18;
219 case 0x16F0:
220 case 0x2472:
221 case 0x2486:
222 case 0x249A:
223 return (double) 19;
224 case 0x09F5:
225 case 0x2161:
226 case 0x2171:
227 case 0x3022:
228 case 0x3281:
229 return (double) 2;
230 case 0x2154:
231 return (double) 2 / 3;
232 case 0x2156:
233 return (double) 2 / 5;
234 case 0x1373:
235 case 0x2473:
236 case 0x2487:
237 case 0x249B:
238 case 0x3039:
239 return (double) 20;
240 case 0x09F6:
241 case 0x2162:
242 case 0x2172:
243 case 0x3023:
244 case 0x3282:
245 return (double) 3;
246 case 0x00BE:
247 return (double) 3 / 4;
248 case 0x2157:
249 return (double) 3 / 5;
250 case 0x215C:
251 return (double) 3 / 8;
252 case 0x1374:
253 case 0x303A:
254 return (double) 30;
255 case 0x09F7:
256 case 0x2163:
257 case 0x2173:
258 case 0x3024:
259 case 0x3283:
260 return (double) 4;
261 case 0x2158:
262 return (double) 4 / 5;
263 case 0x1375:
264 return (double) 40;
265 case 0x2164:
266 case 0x2174:
267 case 0x3025:
268 case 0x3284:
269 return (double) 5;
270 case 0x215A:
271 return (double) 5 / 6;
272 case 0x215D:
273 return (double) 5 / 8;
274 case 0x1376:
275 case 0x216C:
276 case 0x217C:
277 return (double) 50;
278 case 0x216E:
279 case 0x217E:
280 return (double) 500;
281 case 0x2181:
282 return (double) 5000;
283 case 0x2165:
284 case 0x2175:
285 case 0x3026:
286 case 0x3285:
287 return (double) 6;
288 case 0x1377:
289 return (double) 60;
290 case 0x2166:
291 case 0x2176:
292 case 0x3027:
293 case 0x3286:
294 return (double) 7;
295 case 0x215E:
296 return (double) 7 / 8;
297 case 0x1378:
298 return (double) 70;
299 case 0x2167:
300 case 0x2177:
301 case 0x3028:
302 case 0x3287:
303 return (double) 8;
304 case 0x1379:
305 return (double) 80;
306 case 0x2168:
307 case 0x2178:
308 case 0x3029:
309 case 0x3288:
310 return (double) 9;
311 case 0x137A:
312 return (double) 90;
313 default:
314 return (double) _PyUnicode_ToDigit(ch);
315 }
316}
317
Fredrik Lundh72b06852001-06-27 22:08:26 +0000318int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000319{
320 if (_PyUnicode_ToNumeric(ch) < 0.0)
321 return 0;
322 return 1;
323}
324
325#ifndef WANT_WCTYPE_FUNCTIONS
326
Guido van Rossumdc742b32000-04-11 15:39:02 +0000327/* Returns 1 for Unicode characters having the bidirectional type
328 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000329
Fredrik Lundh72b06852001-06-27 22:08:26 +0000330int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000331{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000332 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
333
334 return (ctype->flags & SPACE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000335}
336
337/* Returns 1 for Unicode characters having the category 'Ll', 0
338 otherwise. */
339
Fredrik Lundh72b06852001-06-27 22:08:26 +0000340int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000341{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000342 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
343
344 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000345}
346
347/* Returns 1 for Unicode characters having the category 'Lu', 0
348 otherwise. */
349
Fredrik Lundh72b06852001-06-27 22:08:26 +0000350int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000351{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000352 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
353
354 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000355}
356
357/* Returns the uppercase Unicode characters corresponding to ch or just
358 ch if no uppercase mapping is known. */
359
Fredrik Lundh72b06852001-06-27 22:08:26 +0000360Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000361{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000362 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000363 int delta = ctype->upper;
364 if (delta >= 32768)
365 delta -= 65536;
366 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000367}
368
369/* Returns the lowercase Unicode characters corresponding to ch or just
370 ch if no lowercase mapping is known. */
371
Fredrik Lundh72b06852001-06-27 22:08:26 +0000372Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000373{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000374 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000375 int delta = ctype->lower;
376 if (delta >= 32768)
377 delta -= 65536;
378 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000379}
380
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000381/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
382 'Lo' or 'Lm', 0 otherwise. */
383
Fredrik Lundh72b06852001-06-27 22:08:26 +0000384int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000385{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000386 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000387
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000388 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000389}
390
Guido van Rossum603484d2000-03-10 22:52:46 +0000391#else
392
393/* Export the interfaces using the wchar_t type for portability
394 reasons: */
395
Fredrik Lundh72b06852001-06-27 22:08:26 +0000396int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000397{
398 return iswspace(ch);
399}
400
Fredrik Lundh72b06852001-06-27 22:08:26 +0000401int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000402{
403 return iswlower(ch);
404}
405
Fredrik Lundh72b06852001-06-27 22:08:26 +0000406int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000407{
408 return iswupper(ch);
409}
410
Fredrik Lundh72b06852001-06-27 22:08:26 +0000411Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000412{
413 return towlower(ch);
414}
415
Fredrik Lundh72b06852001-06-27 22:08:26 +0000416Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000417{
418 return towupper(ch);
419}
420
Fredrik Lundh72b06852001-06-27 22:08:26 +0000421int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000422{
423 return iswalpha(ch);
424}
425
Guido van Rossum603484d2000-03-10 22:52:46 +0000426#endif