blob: 1a2bb69ea1ed7d805059cd6db6f6e654ea3dcccc [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags tested against the `flags` member of _PyUnicode_TypeRecord.
   Each mask occupies one bit; the hexadecimal value is given alongside. */
#define ALPHA_MASK        (1 << 0)      /* 0x001 */
#define DECIMAL_MASK      (1 << 1)      /* 0x002 */
#define DIGIT_MASK        (1 << 2)      /* 0x004 */
#define LOWER_MASK        (1 << 3)      /* 0x008 */
#define LINEBREAK_MASK    (1 << 4)      /* 0x010 */
#define SPACE_MASK        (1 << 5)      /* 0x020 */
#define TITLE_MASK        (1 << 6)      /* 0x040 */
#define UPPER_MASK        (1 << 7)      /* 0x080 */
#define XID_START_MASK    (1 << 8)      /* 0x100 */
#define XID_CONTINUE_MASK (1 << 9)      /* 0x200 */
#define NONPRINTABLE_MASK (1 << 10)     /* 0x400 */
Jack Jansen56cdce32000-07-06 13:57:38 +000025
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000026typedef struct {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000027 const Py_UNICODE upper;
28 const Py_UNICODE lower;
29 const Py_UNICODE title;
30 const unsigned char decimal;
31 const unsigned char digit;
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000032 const unsigned short flags;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000033} _PyUnicode_TypeRecord;
34
35#include "unicodetype_db.h"
36
37static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000038gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000039{
40 int index;
41
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000042#ifdef Py_UNICODE_WIDE
Martin v. Löwis9def6a32002-10-18 16:11:54 +000043 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000044 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000045 else
46#endif
47 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000048 index = index1[(code>>SHIFT)];
49 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
50 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000051
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000052 return &_PyUnicode_TypeRecords[index];
53}
Jack Jansen56cdce32000-07-06 13:57:38 +000054
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000055/* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
56 type 'B', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +000057
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000058int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000059{
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000060 switch (ch) {
61 case 0x000A: /* LINE FEED */
62 case 0x000D: /* CARRIAGE RETURN */
63 case 0x001C: /* FILE SEPARATOR */
64 case 0x001D: /* GROUP SEPARATOR */
65 case 0x001E: /* RECORD SEPARATOR */
66 case 0x0085: /* NEXT LINE */
67 case 0x2028: /* LINE SEPARATOR */
68 case 0x2029: /* PARAGRAPH SEPARATOR */
69 return 1;
70 default:
71 return 0;
72 }
Guido van Rossum603484d2000-03-10 22:52:46 +000073}
74
75/* Returns the titlecase Unicode characters corresponding to ch or just
76 ch if no titlecase mapping is known. */
77
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000078Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000079{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000080 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +000081 int delta;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000082
83 if (ctype->title)
Martin v. Löwisedf368c2002-10-18 16:40:36 +000084 delta = ctype->title;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000085 else
Martin v. Löwisedf368c2002-10-18 16:40:36 +000086 delta = ctype->upper;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000087
Martin v. Löwisedf368c2002-10-18 16:40:36 +000088 if (delta >= 32768)
89 delta -= 65536;
90
91 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000092}
93
94/* Returns 1 for Unicode characters having the category 'Lt', 0
95 otherwise. */
96
Fredrik Lundh72b06852001-06-27 22:08:26 +000097int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000098{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000099 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
100
101 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000102}
103
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000104/* Returns 1 for Unicode characters having the XID_Start property, 0
105 otherwise. */
106
107int _PyUnicode_IsXidStart(Py_UNICODE ch)
108{
109 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
110
111 return (ctype->flags & XID_START_MASK) != 0;
112}
113
114/* Returns 1 for Unicode characters having the XID_Continue property,
115 0 otherwise. */
116
117int _PyUnicode_IsXidContinue(Py_UNICODE ch)
118{
119 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
120
121 return (ctype->flags & XID_CONTINUE_MASK) != 0;
122}
123
Guido van Rossum603484d2000-03-10 22:52:46 +0000124/* Returns the integer decimal (0-9) for Unicode characters having
125 this property, -1 otherwise. */
126
Fredrik Lundh72b06852001-06-27 22:08:26 +0000127int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000128{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000129 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
130
131 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000132}
133
Fredrik Lundh72b06852001-06-27 22:08:26 +0000134int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000135{
136 if (_PyUnicode_ToDecimalDigit(ch) < 0)
137 return 0;
138 return 1;
139}
140
141/* Returns the integer digit (0-9) for Unicode characters having
142 this property, -1 otherwise. */
143
Fredrik Lundh72b06852001-06-27 22:08:26 +0000144int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000145{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000146 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
147
148 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000149}
150
Fredrik Lundh72b06852001-06-27 22:08:26 +0000151int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000152{
153 if (_PyUnicode_ToDigit(ch) < 0)
154 return 0;
155 return 1;
156}
157
158/* Returns the numeric value as double for Unicode characters having
159 this property, -1.0 otherwise. */
160
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000161/* TODO: replace with unicodetype_db.h table */
162
Fredrik Lundh72b06852001-06-27 22:08:26 +0000163double _PyUnicode_ToNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000164{
165 switch (ch) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000166 case 0x0F33:
167 return (double) -1 / 2;
168 case 0x17F0:
Guido van Rossum603484d2000-03-10 22:52:46 +0000169 case 0x3007:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000170#ifdef Py_UNICODE_WIDE
171 case 0x1018A:
172#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000173 return (double) 0;
174 case 0x09F4:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000175 case 0x17F1:
Guido van Rossum603484d2000-03-10 22:52:46 +0000176 case 0x215F:
177 case 0x2160:
178 case 0x2170:
179 case 0x3021:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000180 case 0x3192:
181 case 0x3220:
Guido van Rossum603484d2000-03-10 22:52:46 +0000182 case 0x3280:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183#ifdef Py_UNICODE_WIDE
184 case 0x10107:
185 case 0x10142:
186 case 0x10158:
187 case 0x10159:
188 case 0x1015A:
189 case 0x10320:
190 case 0x103D1:
191#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000192 return (double) 1;
193 case 0x00BD:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000194 case 0x0F2A:
195 case 0x2CFD:
196#ifdef Py_UNICODE_WIDE
197 case 0x10141:
198 case 0x10175:
199 case 0x10176:
200#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000201 return (double) 1 / 2;
202 case 0x2153:
203 return (double) 1 / 3;
204 case 0x00BC:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205#ifdef Py_UNICODE_WIDE
206 case 0x10140:
207#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000208 return (double) 1 / 4;
209 case 0x2155:
210 return (double) 1 / 5;
211 case 0x2159:
212 return (double) 1 / 6;
213 case 0x215B:
214 return (double) 1 / 8;
215 case 0x0BF0:
216 case 0x1372:
217 case 0x2169:
218 case 0x2179:
219 case 0x2469:
220 case 0x247D:
221 case 0x2491:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000222 case 0x24FE:
Guido van Rossum603484d2000-03-10 22:52:46 +0000223 case 0x277F:
224 case 0x2789:
225 case 0x2793:
226 case 0x3038:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227 case 0x3229:
Guido van Rossum603484d2000-03-10 22:52:46 +0000228 case 0x3289:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000229#ifdef Py_UNICODE_WIDE
230 case 0x10110:
231 case 0x10149:
232 case 0x10150:
233 case 0x10157:
234 case 0x10160:
235 case 0x10161:
236 case 0x10162:
237 case 0x10163:
238 case 0x10164:
239 case 0x10322:
240 case 0x103D3:
241 case 0x10A44:
242#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000243 return (double) 10;
244 case 0x0BF1:
245 case 0x137B:
246 case 0x216D:
247 case 0x217D:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248#ifdef Py_UNICODE_WIDE
249 case 0x10119:
250 case 0x1014B:
251 case 0x10152:
252 case 0x1016A:
253 case 0x103D5:
254 case 0x10A46:
255#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000256 return (double) 100;
257 case 0x0BF2:
258 case 0x216F:
259 case 0x217F:
260 case 0x2180:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000261#ifdef Py_UNICODE_WIDE
262 case 0x10122:
263 case 0x1014D:
264 case 0x10154:
265 case 0x10171:
266 case 0x10A47:
267#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000268 return (double) 1000;
269 case 0x137C:
270 case 0x2182:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271#ifdef Py_UNICODE_WIDE
272 case 0x1012B:
273 case 0x10155:
274#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000275 return (double) 10000;
276 case 0x216A:
277 case 0x217A:
278 case 0x246A:
279 case 0x247E:
280 case 0x2492:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281 case 0x24EB:
Guido van Rossum603484d2000-03-10 22:52:46 +0000282 return (double) 11;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 case 0x0F2F:
284 return (double) 11 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000285 case 0x216B:
286 case 0x217B:
287 case 0x246B:
288 case 0x247F:
289 case 0x2493:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000290 case 0x24EC:
Guido van Rossum603484d2000-03-10 22:52:46 +0000291 return (double) 12;
292 case 0x246C:
293 case 0x2480:
294 case 0x2494:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000295 case 0x24ED:
Guido van Rossum603484d2000-03-10 22:52:46 +0000296 return (double) 13;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000297 case 0x0F30:
298 return (double) 13 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000299 case 0x246D:
300 case 0x2481:
301 case 0x2495:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000302 case 0x24EE:
Guido van Rossum603484d2000-03-10 22:52:46 +0000303 return (double) 14;
304 case 0x246E:
305 case 0x2482:
306 case 0x2496:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000307 case 0x24EF:
Guido van Rossum603484d2000-03-10 22:52:46 +0000308 return (double) 15;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000309 case 0x0F31:
310 return (double) 15 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000311 case 0x09F9:
312 case 0x246F:
313 case 0x2483:
314 case 0x2497:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000315 case 0x24F0:
Guido van Rossum603484d2000-03-10 22:52:46 +0000316 return (double) 16;
317 case 0x16EE:
318 case 0x2470:
319 case 0x2484:
320 case 0x2498:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000321 case 0x24F1:
Guido van Rossum603484d2000-03-10 22:52:46 +0000322 return (double) 17;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 case 0x0F32:
324 return (double) 17 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000325 case 0x16EF:
326 case 0x2471:
327 case 0x2485:
328 case 0x2499:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 case 0x24F2:
Guido van Rossum603484d2000-03-10 22:52:46 +0000330 return (double) 18;
331 case 0x16F0:
332 case 0x2472:
333 case 0x2486:
334 case 0x249A:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335 case 0x24F3:
Guido van Rossum603484d2000-03-10 22:52:46 +0000336 return (double) 19;
337 case 0x09F5:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000338 case 0x17F2:
Guido van Rossum603484d2000-03-10 22:52:46 +0000339 case 0x2161:
340 case 0x2171:
341 case 0x3022:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000342 case 0x3193:
343 case 0x3221:
Guido van Rossum603484d2000-03-10 22:52:46 +0000344 case 0x3281:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345#ifdef Py_UNICODE_WIDE
346 case 0x10108:
347 case 0x1015B:
348 case 0x1015C:
349 case 0x1015D:
350 case 0x1015E:
351 case 0x103D2:
352#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000353 return (double) 2;
354 case 0x2154:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355#ifdef Py_UNICODE_WIDE
356 case 0x10177:
357#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000358 return (double) 2 / 3;
359 case 0x2156:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000360 return (double) 2 / 5;
Guido van Rossum603484d2000-03-10 22:52:46 +0000361 case 0x1373:
362 case 0x2473:
363 case 0x2487:
364 case 0x249B:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000365 case 0x24F4:
Guido van Rossum603484d2000-03-10 22:52:46 +0000366 case 0x3039:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000367#ifdef Py_UNICODE_WIDE
368 case 0x10111:
369 case 0x103D4:
370 case 0x10A45:
371#endif
372 return (double) 20;
373#ifdef Py_UNICODE_WIDE
374 case 0x1011A:
375 return (double) 200;
376 case 0x10123:
377 return (double) 2000;
378 case 0x1012C:
379 return (double) 20000;
380#endif
381 case 0x3251:
382 return (double) 21;
383 case 0x3252:
384 return (double) 22;
385 case 0x3253:
386 return (double) 23;
387 case 0x3254:
388 return (double) 24;
389 case 0x3255:
390 return (double) 25;
391 case 0x3256:
392 return (double) 26;
393 case 0x3257:
394 return (double) 27;
395 case 0x3258:
396 return (double) 28;
397 case 0x3259:
398 return (double) 29;
Guido van Rossum603484d2000-03-10 22:52:46 +0000399 case 0x09F6:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000400 case 0x17F3:
Guido van Rossum603484d2000-03-10 22:52:46 +0000401 case 0x2162:
402 case 0x2172:
403 case 0x3023:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000404 case 0x3194:
405 case 0x3222:
Guido van Rossum603484d2000-03-10 22:52:46 +0000406 case 0x3282:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000407#ifdef Py_UNICODE_WIDE
408 case 0x10109:
409#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000410 return (double) 3;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411 case 0x0F2B:
412 return (double) 3 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000413 case 0x00BE:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000414#ifdef Py_UNICODE_WIDE
415 case 0x10178:
416#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000417 return (double) 3 / 4;
418 case 0x2157:
419 return (double) 3 / 5;
420 case 0x215C:
421 return (double) 3 / 8;
422 case 0x1374:
423 case 0x303A:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000424 case 0x325A:
425#ifdef Py_UNICODE_WIDE
426 case 0x10112:
427 case 0x10165:
428#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000429 return (double) 30;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000430#ifdef Py_UNICODE_WIDE
431 case 0x1011B:
432 case 0x1016B:
433 return (double) 300;
434 case 0x10124:
435 return (double) 3000;
436 case 0x1012D:
437 return (double) 30000;
438#endif
439 case 0x325B:
440 return (double) 31;
441 case 0x325C:
442 return (double) 32;
443 case 0x325D:
444 return (double) 33;
445 case 0x325E:
446 return (double) 34;
447 case 0x325F:
448 return (double) 35;
449 case 0x32B1:
450 return (double) 36;
451 case 0x32B2:
452 return (double) 37;
453 case 0x32B3:
454 return (double) 38;
455 case 0x32B4:
456 return (double) 39;
Guido van Rossum603484d2000-03-10 22:52:46 +0000457 case 0x09F7:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000458 case 0x17F4:
Guido van Rossum603484d2000-03-10 22:52:46 +0000459 case 0x2163:
460 case 0x2173:
461 case 0x3024:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000462 case 0x3195:
463 case 0x3223:
Guido van Rossum603484d2000-03-10 22:52:46 +0000464 case 0x3283:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000465#ifdef Py_UNICODE_WIDE
466 case 0x1010A:
467#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000468 return (double) 4;
469 case 0x2158:
470 return (double) 4 / 5;
471 case 0x1375:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472 case 0x32B5:
473#ifdef Py_UNICODE_WIDE
474 case 0x10113:
475#endif
476 return (double) 40;
477#ifdef Py_UNICODE_WIDE
478 case 0x1011C:
479 return (double) 400;
480 case 0x10125:
481 return (double) 4000;
482 case 0x1012E:
483 return (double) 40000;
484#endif
485 case 0x32B6:
486 return (double) 41;
487 case 0x32B7:
488 return (double) 42;
489 case 0x32B8:
490 return (double) 43;
491 case 0x32B9:
492 return (double) 44;
493 case 0x32BA:
494 return (double) 45;
495 case 0x32BB:
496 return (double) 46;
497 case 0x32BC:
498 return (double) 47;
499 case 0x32BD:
500 return (double) 48;
501 case 0x32BE:
502 return (double) 49;
503 case 0x17F5:
Guido van Rossum603484d2000-03-10 22:52:46 +0000504 case 0x2164:
505 case 0x2174:
506 case 0x3025:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507 case 0x3224:
Guido van Rossum603484d2000-03-10 22:52:46 +0000508 case 0x3284:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000509#ifdef Py_UNICODE_WIDE
510 case 0x1010B:
511 case 0x10143:
512 case 0x10148:
513 case 0x1014F:
514 case 0x1015F:
515 case 0x10173:
516 case 0x10321:
517#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000518 return (double) 5;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000519 case 0x0F2C:
520 return (double) 5 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000521 case 0x215A:
522 return (double) 5 / 6;
523 case 0x215D:
524 return (double) 5 / 8;
525 case 0x1376:
526 case 0x216C:
527 case 0x217C:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528 case 0x32BF:
529#ifdef Py_UNICODE_WIDE
530 case 0x10114:
531 case 0x10144:
532 case 0x1014A:
533 case 0x10151:
534 case 0x10166:
535 case 0x10167:
536 case 0x10168:
537 case 0x10169:
538 case 0x10174:
539 case 0x10323:
540#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000541 return (double) 50;
542 case 0x216E:
543 case 0x217E:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#ifdef Py_UNICODE_WIDE
545 case 0x1011D:
546 case 0x10145:
547 case 0x1014C:
548 case 0x10153:
549 case 0x1016C:
550 case 0x1016D:
551 case 0x1016E:
552 case 0x1016F:
553 case 0x10170:
554#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000555 return (double) 500;
556 case 0x2181:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557#ifdef Py_UNICODE_WIDE
558 case 0x10126:
559 case 0x10146:
560 case 0x1014E:
561 case 0x10172:
562#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000563 return (double) 5000;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564#ifdef Py_UNICODE_WIDE
565 case 0x1012F:
566 case 0x10147:
567 case 0x10156:
568 return (double) 50000;
569#endif
570 case 0x17F6:
Guido van Rossum603484d2000-03-10 22:52:46 +0000571 case 0x2165:
572 case 0x2175:
573 case 0x3026:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000574 case 0x3225:
Guido van Rossum603484d2000-03-10 22:52:46 +0000575 case 0x3285:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000576#ifdef Py_UNICODE_WIDE
577 case 0x1010C:
578#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000579 return (double) 6;
580 case 0x1377:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000581#ifdef Py_UNICODE_WIDE
582 case 0x10115:
583#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000584 return (double) 60;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585#ifdef Py_UNICODE_WIDE
586 case 0x1011E:
587 return (double) 600;
588 case 0x10127:
589 return (double) 6000;
590 case 0x10130:
591 return (double) 60000;
592#endif
593 case 0x17F7:
Guido van Rossum603484d2000-03-10 22:52:46 +0000594 case 0x2166:
595 case 0x2176:
596 case 0x3027:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000597 case 0x3226:
Guido van Rossum603484d2000-03-10 22:52:46 +0000598 case 0x3286:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000599#ifdef Py_UNICODE_WIDE
600 case 0x1010D:
601#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000602 return (double) 7;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000603 case 0x0F2D:
604 return (double) 7 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000605 case 0x215E:
606 return (double) 7 / 8;
607 case 0x1378:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000608#ifdef Py_UNICODE_WIDE
609 case 0x10116:
610#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000611 return (double) 70;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000612#ifdef Py_UNICODE_WIDE
613 case 0x1011F:
614 return (double) 700;
615 case 0x10128:
616 return (double) 7000;
617 case 0x10131:
618 return (double) 70000;
619#endif
620 case 0x17F8:
Guido van Rossum603484d2000-03-10 22:52:46 +0000621 case 0x2167:
622 case 0x2177:
623 case 0x3028:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000624 case 0x3227:
Guido van Rossum603484d2000-03-10 22:52:46 +0000625 case 0x3287:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000626#ifdef Py_UNICODE_WIDE
627 case 0x1010E:
628#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000629 return (double) 8;
630 case 0x1379:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000631#ifdef Py_UNICODE_WIDE
632 case 0x10117:
633#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000634 return (double) 80;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000635#ifdef Py_UNICODE_WIDE
636 case 0x10120:
637 return (double) 800;
638 case 0x10129:
639 return (double) 8000;
640 case 0x10132:
641 return (double) 80000;
642#endif
643 case 0x17F9:
Guido van Rossum603484d2000-03-10 22:52:46 +0000644 case 0x2168:
645 case 0x2178:
646 case 0x3029:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000647 case 0x3228:
Guido van Rossum603484d2000-03-10 22:52:46 +0000648 case 0x3288:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000649#ifdef Py_UNICODE_WIDE
650 case 0x1010F:
651#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000652 return (double) 9;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000653 case 0x0F2E:
654 return (double) 9 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000655 case 0x137A:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000656#ifdef Py_UNICODE_WIDE
657 case 0x10118:
658#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000659 return (double) 90;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000660#ifdef Py_UNICODE_WIDE
661 case 0x10121:
662 case 0x1034A:
663 return (double) 900;
664 case 0x1012A:
665 return (double) 9000;
666 case 0x10133:
667 return (double) 90000;
668#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000669 default:
670 return (double) _PyUnicode_ToDigit(ch);
671 }
672}
673
Fredrik Lundh72b06852001-06-27 22:08:26 +0000674int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000675{
Thomas Wouters477c8d52006-05-27 19:21:47 +0000676 return _PyUnicode_ToNumeric(ch) != -1.0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000677}
678
Georg Brandl559e5d72008-06-11 18:37:52 +0000679/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
680 0 otherwise.
681 All characters except those characters defined in the Unicode character
682 database as following categories are considered printable.
683 * Cc (Other, Control)
684 * Cf (Other, Format)
685 * Cs (Other, Surrogate)
686 * Co (Other, Private Use)
687 * Cn (Other, Not Assigned)
688 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
689 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
690 * Zs (Separator, Space) other than ASCII space('\x20').
691*/
692int _PyUnicode_IsPrintable(Py_UNICODE ch)
693{
694 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
695
696 return (ctype->flags & NONPRINTABLE_MASK) == 0;
697}
698
Guido van Rossum603484d2000-03-10 22:52:46 +0000699#ifndef WANT_WCTYPE_FUNCTIONS
700
Guido van Rossumdc742b32000-04-11 15:39:02 +0000701/* Returns 1 for Unicode characters having the bidirectional type
702 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000703
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +0000704int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000705{
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +0000706 switch (ch) {
707 case 0x0009: /* HORIZONTAL TABULATION */
708 case 0x000A: /* LINE FEED */
709 case 0x000B: /* VERTICAL TABULATION */
710 case 0x000C: /* FORM FEED */
711 case 0x000D: /* CARRIAGE RETURN */
712 case 0x001C: /* FILE SEPARATOR */
713 case 0x001D: /* GROUP SEPARATOR */
714 case 0x001E: /* RECORD SEPARATOR */
715 case 0x001F: /* UNIT SEPARATOR */
716 case 0x0020: /* SPACE */
717 case 0x0085: /* NEXT LINE */
718 case 0x00A0: /* NO-BREAK SPACE */
719 case 0x1680: /* OGHAM SPACE MARK */
720 case 0x2000: /* EN QUAD */
721 case 0x2001: /* EM QUAD */
722 case 0x2002: /* EN SPACE */
723 case 0x2003: /* EM SPACE */
724 case 0x2004: /* THREE-PER-EM SPACE */
725 case 0x2005: /* FOUR-PER-EM SPACE */
726 case 0x2006: /* SIX-PER-EM SPACE */
727 case 0x2007: /* FIGURE SPACE */
728 case 0x2008: /* PUNCTUATION SPACE */
729 case 0x2009: /* THIN SPACE */
730 case 0x200A: /* HAIR SPACE */
731 case 0x200B: /* ZERO WIDTH SPACE */
732 case 0x2028: /* LINE SEPARATOR */
733 case 0x2029: /* PARAGRAPH SEPARATOR */
734 case 0x202F: /* NARROW NO-BREAK SPACE */
735 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
736 case 0x3000: /* IDEOGRAPHIC SPACE */
737 return 1;
738 default:
739 return 0;
740 }
Guido van Rossum603484d2000-03-10 22:52:46 +0000741}
742
743/* Returns 1 for Unicode characters having the category 'Ll', 0
744 otherwise. */
745
Fredrik Lundh72b06852001-06-27 22:08:26 +0000746int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000747{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000748 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
749
750 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000751}
752
753/* Returns 1 for Unicode characters having the category 'Lu', 0
754 otherwise. */
755
Fredrik Lundh72b06852001-06-27 22:08:26 +0000756int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000757{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000758 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
759
760 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000761}
762
763/* Returns the uppercase Unicode characters corresponding to ch or just
764 ch if no uppercase mapping is known. */
765
Fredrik Lundh72b06852001-06-27 22:08:26 +0000766Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000767{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000768 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000769 int delta = ctype->upper;
770 if (delta >= 32768)
771 delta -= 65536;
772 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000773}
774
775/* Returns the lowercase Unicode characters corresponding to ch or just
776 ch if no lowercase mapping is known. */
777
Fredrik Lundh72b06852001-06-27 22:08:26 +0000778Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000779{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000780 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000781 int delta = ctype->lower;
782 if (delta >= 32768)
783 delta -= 65536;
784 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000785}
786
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000787/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
788 'Lo' or 'Lm', 0 otherwise. */
789
Fredrik Lundh72b06852001-06-27 22:08:26 +0000790int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000791{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000792 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000793
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000794 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000795}
796
Guido van Rossum603484d2000-03-10 22:52:46 +0000797#else
798
799/* Export the interfaces using the wchar_t type for portability
800 reasons: */
801
Fredrik Lundh72b06852001-06-27 22:08:26 +0000802int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000803{
804 return iswspace(ch);
805}
806
Fredrik Lundh72b06852001-06-27 22:08:26 +0000807int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000808{
809 return iswlower(ch);
810}
811
Fredrik Lundh72b06852001-06-27 22:08:26 +0000812int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000813{
814 return iswupper(ch);
815}
816
Fredrik Lundh72b06852001-06-27 22:08:26 +0000817Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000818{
819 return towlower(ch);
820}
821
Fredrik Lundh72b06852001-06-27 22:08:26 +0000822Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000823{
824 return towupper(ch);
825}
826
Fredrik Lundh72b06852001-06-27 22:08:26 +0000827int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000828{
829 return iswalpha(ch);
830}
831
Guido van Rossum603484d2000-03-10 22:52:46 +0000832#endif