blob: 1597bbda04920904e1782f92a063dff7513c7f35 [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags tested against the `flags` member of a type record. */
#define ALPHA_MASK          0x001   /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK        0x002   /* record's `decimal` value is valid */
#define DIGIT_MASK          0x004   /* record's `digit` value is valid */
#define LOWER_MASK          0x008   /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK      0x010   /* not referenced by the code in this file */
#define SPACE_MASK          0x020   /* not referenced by the code in this file */
#define TITLE_MASK          0x040   /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK          0x080   /* tested by _PyUnicode_IsUppercase */
#define XID_START_MASK      0x100   /* tested by _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK   0x200   /* tested by _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK      0x400   /* tested by _PyUnicode_IsPrintable */
#define NODELTA_MASK        0x800   /* upper/lower/title hold the mapped
                                       character itself, not a delta */
Jack Jansen56cdce32000-07-06 13:57:38 +000026
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000027typedef struct {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000028 const Py_UNICODE upper;
29 const Py_UNICODE lower;
30 const Py_UNICODE title;
31 const unsigned char decimal;
32 const unsigned char digit;
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000033 const unsigned short flags;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000034} _PyUnicode_TypeRecord;
35
36#include "unicodetype_db.h"
37
38static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000039gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000040{
41 int index;
42
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000043#ifdef Py_UNICODE_WIDE
Martin v. Löwis9def6a32002-10-18 16:11:54 +000044 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000045 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000046 else
47#endif
48 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000049 index = index1[(code>>SHIFT)];
50 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
51 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000052
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000053 return &_PyUnicode_TypeRecords[index];
54}
Jack Jansen56cdce32000-07-06 13:57:38 +000055
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000056/* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
57 type 'B', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +000058
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000059int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000060{
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000061 switch (ch) {
62 case 0x000A: /* LINE FEED */
63 case 0x000D: /* CARRIAGE RETURN */
64 case 0x001C: /* FILE SEPARATOR */
65 case 0x001D: /* GROUP SEPARATOR */
66 case 0x001E: /* RECORD SEPARATOR */
67 case 0x0085: /* NEXT LINE */
68 case 0x2028: /* LINE SEPARATOR */
69 case 0x2029: /* PARAGRAPH SEPARATOR */
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +000070 return 1;
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000071 default:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +000072 return 0;
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +000073 }
Guido van Rossum603484d2000-03-10 22:52:46 +000074}
75
76/* Returns the titlecase Unicode characters corresponding to ch or just
77 ch if no titlecase mapping is known. */
78
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000079Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000080{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000081 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwis71efeb72009-04-26 01:02:07 +000082 int delta = ctype->title;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000083
Martin v. Löwis93cbca32008-09-10 14:08:48 +000084 if (ctype->flags & NODELTA_MASK)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +000085 return delta;
Martin v. Löwis93cbca32008-09-10 14:08:48 +000086
Martin v. Löwisedf368c2002-10-18 16:40:36 +000087 if (delta >= 32768)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +000088 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +000089
90 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000091}
92
93/* Returns 1 for Unicode characters having the category 'Lt', 0
94 otherwise. */
95
Fredrik Lundh72b06852001-06-27 22:08:26 +000096int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000097{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000098 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
99
100 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000101}
102
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000103/* Returns 1 for Unicode characters having the XID_Start property, 0
104 otherwise. */
105
106int _PyUnicode_IsXidStart(Py_UNICODE ch)
107{
108 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
109
110 return (ctype->flags & XID_START_MASK) != 0;
111}
112
113/* Returns 1 for Unicode characters having the XID_Continue property,
114 0 otherwise. */
115
116int _PyUnicode_IsXidContinue(Py_UNICODE ch)
117{
118 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
119
120 return (ctype->flags & XID_CONTINUE_MASK) != 0;
121}
122
Guido van Rossum603484d2000-03-10 22:52:46 +0000123/* Returns the integer decimal (0-9) for Unicode characters having
124 this property, -1 otherwise. */
125
Fredrik Lundh72b06852001-06-27 22:08:26 +0000126int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000127{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000128 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
129
130 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000131}
132
Fredrik Lundh72b06852001-06-27 22:08:26 +0000133int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000134{
135 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000136 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000137 return 1;
138}
139
140/* Returns the integer digit (0-9) for Unicode characters having
141 this property, -1 otherwise. */
142
Fredrik Lundh72b06852001-06-27 22:08:26 +0000143int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000144{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000145 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
146
147 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000148}
149
Fredrik Lundh72b06852001-06-27 22:08:26 +0000150int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000151{
152 if (_PyUnicode_ToDigit(ch) < 0)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000153 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000154 return 1;
155}
156
157/* Returns the numeric value as double for Unicode characters having
158 this property, -1.0 otherwise. */
159
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000160/* TODO: replace with unicodetype_db.h table */
161
Fredrik Lundh72b06852001-06-27 22:08:26 +0000162double _PyUnicode_ToNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000163{
164 switch (ch) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000165 case 0x0F33:
166 return (double) -1 / 2;
167 case 0x17F0:
Guido van Rossum603484d2000-03-10 22:52:46 +0000168 case 0x3007:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000169#ifdef Py_UNICODE_WIDE
170 case 0x1018A:
171#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000172 return (double) 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000173 case 0x09F4:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000174 case 0x17F1:
Guido van Rossum603484d2000-03-10 22:52:46 +0000175 case 0x215F:
176 case 0x2160:
177 case 0x2170:
178 case 0x3021:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000179 case 0x3192:
180 case 0x3220:
Guido van Rossum603484d2000-03-10 22:52:46 +0000181 case 0x3280:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000182#ifdef Py_UNICODE_WIDE
183 case 0x10107:
184 case 0x10142:
185 case 0x10158:
186 case 0x10159:
187 case 0x1015A:
188 case 0x10320:
189 case 0x103D1:
190#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000191 return (double) 1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000192 case 0x00BD:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 case 0x0F2A:
194 case 0x2CFD:
195#ifdef Py_UNICODE_WIDE
196 case 0x10141:
197 case 0x10175:
198 case 0x10176:
199#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000200 return (double) 1 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000201 case 0x2153:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000202 return (double) 1 / 3;
Guido van Rossum603484d2000-03-10 22:52:46 +0000203 case 0x00BC:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000204#ifdef Py_UNICODE_WIDE
205 case 0x10140:
206#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000207 return (double) 1 / 4;
Guido van Rossum603484d2000-03-10 22:52:46 +0000208 case 0x2155:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000209 return (double) 1 / 5;
Guido van Rossum603484d2000-03-10 22:52:46 +0000210 case 0x2159:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000211 return (double) 1 / 6;
Guido van Rossum603484d2000-03-10 22:52:46 +0000212 case 0x215B:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000213 return (double) 1 / 8;
Guido van Rossum603484d2000-03-10 22:52:46 +0000214 case 0x0BF0:
215 case 0x1372:
216 case 0x2169:
217 case 0x2179:
218 case 0x2469:
219 case 0x247D:
220 case 0x2491:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 case 0x24FE:
Guido van Rossum603484d2000-03-10 22:52:46 +0000222 case 0x277F:
223 case 0x2789:
224 case 0x2793:
225 case 0x3038:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000226 case 0x3229:
Guido van Rossum603484d2000-03-10 22:52:46 +0000227 case 0x3289:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228#ifdef Py_UNICODE_WIDE
229 case 0x10110:
230 case 0x10149:
231 case 0x10150:
232 case 0x10157:
233 case 0x10160:
234 case 0x10161:
235 case 0x10162:
236 case 0x10163:
237 case 0x10164:
238 case 0x10322:
239 case 0x103D3:
240 case 0x10A44:
241#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000242 return (double) 10;
Guido van Rossum603484d2000-03-10 22:52:46 +0000243 case 0x0BF1:
244 case 0x137B:
245 case 0x216D:
246 case 0x217D:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247#ifdef Py_UNICODE_WIDE
248 case 0x10119:
249 case 0x1014B:
250 case 0x10152:
251 case 0x1016A:
252 case 0x103D5:
253 case 0x10A46:
254#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000255 return (double) 100;
Guido van Rossum603484d2000-03-10 22:52:46 +0000256 case 0x0BF2:
257 case 0x216F:
258 case 0x217F:
259 case 0x2180:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000260#ifdef Py_UNICODE_WIDE
261 case 0x10122:
262 case 0x1014D:
263 case 0x10154:
264 case 0x10171:
265 case 0x10A47:
266#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000267 return (double) 1000;
Guido van Rossum603484d2000-03-10 22:52:46 +0000268 case 0x137C:
269 case 0x2182:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270#ifdef Py_UNICODE_WIDE
271 case 0x1012B:
272 case 0x10155:
273#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000274 return (double) 10000;
Guido van Rossum603484d2000-03-10 22:52:46 +0000275 case 0x216A:
276 case 0x217A:
277 case 0x246A:
278 case 0x247E:
279 case 0x2492:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000280 case 0x24EB:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000281 return (double) 11;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 case 0x0F2F:
283 return (double) 11 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000284 case 0x216B:
285 case 0x217B:
286 case 0x246B:
287 case 0x247F:
288 case 0x2493:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289 case 0x24EC:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000290 return (double) 12;
Guido van Rossum603484d2000-03-10 22:52:46 +0000291 case 0x246C:
292 case 0x2480:
293 case 0x2494:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000294 case 0x24ED:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000295 return (double) 13;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000296 case 0x0F30:
297 return (double) 13 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000298 case 0x246D:
299 case 0x2481:
300 case 0x2495:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000301 case 0x24EE:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000302 return (double) 14;
Guido van Rossum603484d2000-03-10 22:52:46 +0000303 case 0x246E:
304 case 0x2482:
305 case 0x2496:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000306 case 0x24EF:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000307 return (double) 15;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 case 0x0F31:
309 return (double) 15 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000310 case 0x09F9:
311 case 0x246F:
312 case 0x2483:
313 case 0x2497:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000314 case 0x24F0:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000315 return (double) 16;
Guido van Rossum603484d2000-03-10 22:52:46 +0000316 case 0x16EE:
317 case 0x2470:
318 case 0x2484:
319 case 0x2498:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000320 case 0x24F1:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000321 return (double) 17;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 case 0x0F32:
323 return (double) 17 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000324 case 0x16EF:
325 case 0x2471:
326 case 0x2485:
327 case 0x2499:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000328 case 0x24F2:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000329 return (double) 18;
Guido van Rossum603484d2000-03-10 22:52:46 +0000330 case 0x16F0:
331 case 0x2472:
332 case 0x2486:
333 case 0x249A:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000334 case 0x24F3:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000335 return (double) 19;
Guido van Rossum603484d2000-03-10 22:52:46 +0000336 case 0x09F5:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000337 case 0x17F2:
Guido van Rossum603484d2000-03-10 22:52:46 +0000338 case 0x2161:
339 case 0x2171:
340 case 0x3022:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 case 0x3193:
342 case 0x3221:
Guido van Rossum603484d2000-03-10 22:52:46 +0000343 case 0x3281:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000344#ifdef Py_UNICODE_WIDE
345 case 0x10108:
346 case 0x1015B:
347 case 0x1015C:
348 case 0x1015D:
349 case 0x1015E:
350 case 0x103D2:
351#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000352 return (double) 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000353 case 0x2154:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000354#ifdef Py_UNICODE_WIDE
355 case 0x10177:
356#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000357 return (double) 2 / 3;
Guido van Rossum603484d2000-03-10 22:52:46 +0000358 case 0x2156:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000359 return (double) 2 / 5;
Guido van Rossum603484d2000-03-10 22:52:46 +0000360 case 0x1373:
361 case 0x2473:
362 case 0x2487:
363 case 0x249B:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000364 case 0x24F4:
Guido van Rossum603484d2000-03-10 22:52:46 +0000365 case 0x3039:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000366#ifdef Py_UNICODE_WIDE
367 case 0x10111:
368 case 0x103D4:
369 case 0x10A45:
370#endif
371 return (double) 20;
372#ifdef Py_UNICODE_WIDE
373 case 0x1011A:
374 return (double) 200;
375 case 0x10123:
376 return (double) 2000;
377 case 0x1012C:
378 return (double) 20000;
379#endif
380 case 0x3251:
381 return (double) 21;
382 case 0x3252:
383 return (double) 22;
384 case 0x3253:
385 return (double) 23;
386 case 0x3254:
387 return (double) 24;
388 case 0x3255:
389 return (double) 25;
390 case 0x3256:
391 return (double) 26;
392 case 0x3257:
393 return (double) 27;
394 case 0x3258:
395 return (double) 28;
396 case 0x3259:
397 return (double) 29;
Guido van Rossum603484d2000-03-10 22:52:46 +0000398 case 0x09F6:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000399 case 0x17F3:
Guido van Rossum603484d2000-03-10 22:52:46 +0000400 case 0x2162:
401 case 0x2172:
402 case 0x3023:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000403 case 0x3194:
404 case 0x3222:
Guido van Rossum603484d2000-03-10 22:52:46 +0000405 case 0x3282:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000406#ifdef Py_UNICODE_WIDE
407 case 0x10109:
408#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000409 return (double) 3;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000410 case 0x0F2B:
411 return (double) 3 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000412 case 0x00BE:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000413#ifdef Py_UNICODE_WIDE
414 case 0x10178:
415#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000416 return (double) 3 / 4;
Guido van Rossum603484d2000-03-10 22:52:46 +0000417 case 0x2157:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000418 return (double) 3 / 5;
Guido van Rossum603484d2000-03-10 22:52:46 +0000419 case 0x215C:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000420 return (double) 3 / 8;
Guido van Rossum603484d2000-03-10 22:52:46 +0000421 case 0x1374:
422 case 0x303A:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000423 case 0x325A:
424#ifdef Py_UNICODE_WIDE
425 case 0x10112:
426 case 0x10165:
427#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000428 return (double) 30;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000429#ifdef Py_UNICODE_WIDE
430 case 0x1011B:
431 case 0x1016B:
432 return (double) 300;
433 case 0x10124:
434 return (double) 3000;
435 case 0x1012D:
436 return (double) 30000;
437#endif
438 case 0x325B:
439 return (double) 31;
440 case 0x325C:
441 return (double) 32;
442 case 0x325D:
443 return (double) 33;
444 case 0x325E:
445 return (double) 34;
446 case 0x325F:
447 return (double) 35;
448 case 0x32B1:
449 return (double) 36;
450 case 0x32B2:
451 return (double) 37;
452 case 0x32B3:
453 return (double) 38;
454 case 0x32B4:
455 return (double) 39;
Guido van Rossum603484d2000-03-10 22:52:46 +0000456 case 0x09F7:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000457 case 0x17F4:
Guido van Rossum603484d2000-03-10 22:52:46 +0000458 case 0x2163:
459 case 0x2173:
460 case 0x3024:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000461 case 0x3195:
462 case 0x3223:
Guido van Rossum603484d2000-03-10 22:52:46 +0000463 case 0x3283:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000464#ifdef Py_UNICODE_WIDE
465 case 0x1010A:
466#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000467 return (double) 4;
Guido van Rossum603484d2000-03-10 22:52:46 +0000468 case 0x2158:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000469 return (double) 4 / 5;
Guido van Rossum603484d2000-03-10 22:52:46 +0000470 case 0x1375:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000471 case 0x32B5:
472#ifdef Py_UNICODE_WIDE
473 case 0x10113:
474#endif
475 return (double) 40;
476#ifdef Py_UNICODE_WIDE
477 case 0x1011C:
478 return (double) 400;
479 case 0x10125:
480 return (double) 4000;
481 case 0x1012E:
482 return (double) 40000;
483#endif
484 case 0x32B6:
485 return (double) 41;
486 case 0x32B7:
487 return (double) 42;
488 case 0x32B8:
489 return (double) 43;
490 case 0x32B9:
491 return (double) 44;
492 case 0x32BA:
493 return (double) 45;
494 case 0x32BB:
495 return (double) 46;
496 case 0x32BC:
497 return (double) 47;
498 case 0x32BD:
499 return (double) 48;
500 case 0x32BE:
501 return (double) 49;
502 case 0x17F5:
Guido van Rossum603484d2000-03-10 22:52:46 +0000503 case 0x2164:
504 case 0x2174:
505 case 0x3025:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000506 case 0x3224:
Guido van Rossum603484d2000-03-10 22:52:46 +0000507 case 0x3284:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508#ifdef Py_UNICODE_WIDE
509 case 0x1010B:
510 case 0x10143:
511 case 0x10148:
512 case 0x1014F:
513 case 0x1015F:
514 case 0x10173:
515 case 0x10321:
516#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000517 return (double) 5;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000518 case 0x0F2C:
519 return (double) 5 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000520 case 0x215A:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000521 return (double) 5 / 6;
Guido van Rossum603484d2000-03-10 22:52:46 +0000522 case 0x215D:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000523 return (double) 5 / 8;
Guido van Rossum603484d2000-03-10 22:52:46 +0000524 case 0x1376:
525 case 0x216C:
526 case 0x217C:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000527 case 0x32BF:
528#ifdef Py_UNICODE_WIDE
529 case 0x10114:
530 case 0x10144:
531 case 0x1014A:
532 case 0x10151:
533 case 0x10166:
534 case 0x10167:
535 case 0x10168:
536 case 0x10169:
537 case 0x10174:
538 case 0x10323:
539#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000540 return (double) 50;
Guido van Rossum603484d2000-03-10 22:52:46 +0000541 case 0x216E:
542 case 0x217E:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543#ifdef Py_UNICODE_WIDE
544 case 0x1011D:
545 case 0x10145:
546 case 0x1014C:
547 case 0x10153:
548 case 0x1016C:
549 case 0x1016D:
550 case 0x1016E:
551 case 0x1016F:
552 case 0x10170:
553#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000554 return (double) 500;
Guido van Rossum603484d2000-03-10 22:52:46 +0000555 case 0x2181:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556#ifdef Py_UNICODE_WIDE
557 case 0x10126:
558 case 0x10146:
559 case 0x1014E:
560 case 0x10172:
561#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000562 return (double) 5000;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000563#ifdef Py_UNICODE_WIDE
564 case 0x1012F:
565 case 0x10147:
566 case 0x10156:
567 return (double) 50000;
568#endif
569 case 0x17F6:
Guido van Rossum603484d2000-03-10 22:52:46 +0000570 case 0x2165:
571 case 0x2175:
572 case 0x3026:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000573 case 0x3225:
Guido van Rossum603484d2000-03-10 22:52:46 +0000574 case 0x3285:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000575#ifdef Py_UNICODE_WIDE
576 case 0x1010C:
577#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000578 return (double) 6;
Guido van Rossum603484d2000-03-10 22:52:46 +0000579 case 0x1377:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000580#ifdef Py_UNICODE_WIDE
581 case 0x10115:
582#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000583 return (double) 60;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000584#ifdef Py_UNICODE_WIDE
585 case 0x1011E:
586 return (double) 600;
587 case 0x10127:
588 return (double) 6000;
589 case 0x10130:
590 return (double) 60000;
591#endif
592 case 0x17F7:
Guido van Rossum603484d2000-03-10 22:52:46 +0000593 case 0x2166:
594 case 0x2176:
595 case 0x3027:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000596 case 0x3226:
Guido van Rossum603484d2000-03-10 22:52:46 +0000597 case 0x3286:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000598#ifdef Py_UNICODE_WIDE
599 case 0x1010D:
600#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000601 return (double) 7;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000602 case 0x0F2D:
603 return (double) 7 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000604 case 0x215E:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000605 return (double) 7 / 8;
Guido van Rossum603484d2000-03-10 22:52:46 +0000606 case 0x1378:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000607#ifdef Py_UNICODE_WIDE
608 case 0x10116:
609#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000610 return (double) 70;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000611#ifdef Py_UNICODE_WIDE
612 case 0x1011F:
613 return (double) 700;
614 case 0x10128:
615 return (double) 7000;
616 case 0x10131:
617 return (double) 70000;
618#endif
619 case 0x17F8:
Guido van Rossum603484d2000-03-10 22:52:46 +0000620 case 0x2167:
621 case 0x2177:
622 case 0x3028:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000623 case 0x3227:
Guido van Rossum603484d2000-03-10 22:52:46 +0000624 case 0x3287:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625#ifdef Py_UNICODE_WIDE
626 case 0x1010E:
627#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000628 return (double) 8;
Guido van Rossum603484d2000-03-10 22:52:46 +0000629 case 0x1379:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000630#ifdef Py_UNICODE_WIDE
631 case 0x10117:
632#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000633 return (double) 80;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000634#ifdef Py_UNICODE_WIDE
635 case 0x10120:
636 return (double) 800;
637 case 0x10129:
638 return (double) 8000;
639 case 0x10132:
640 return (double) 80000;
641#endif
642 case 0x17F9:
Guido van Rossum603484d2000-03-10 22:52:46 +0000643 case 0x2168:
644 case 0x2178:
645 case 0x3029:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000646 case 0x3228:
Guido van Rossum603484d2000-03-10 22:52:46 +0000647 case 0x3288:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000648#ifdef Py_UNICODE_WIDE
649 case 0x1010F:
650#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000651 return (double) 9;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000652 case 0x0F2E:
653 return (double) 9 / 2;
Guido van Rossum603484d2000-03-10 22:52:46 +0000654 case 0x137A:
Thomas Wouters477c8d52006-05-27 19:21:47 +0000655#ifdef Py_UNICODE_WIDE
656 case 0x10118:
657#endif
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000658 return (double) 90;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000659#ifdef Py_UNICODE_WIDE
660 case 0x10121:
661 case 0x1034A:
662 return (double) 900;
663 case 0x1012A:
664 return (double) 9000;
665 case 0x10133:
666 return (double) 90000;
667#endif
Guido van Rossum603484d2000-03-10 22:52:46 +0000668 default:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000669 return (double) _PyUnicode_ToDigit(ch);
Guido van Rossum603484d2000-03-10 22:52:46 +0000670 }
671}
672
Fredrik Lundh72b06852001-06-27 22:08:26 +0000673int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000674{
Thomas Wouters477c8d52006-05-27 19:21:47 +0000675 return _PyUnicode_ToNumeric(ch) != -1.0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000676}
677
Georg Brandl559e5d72008-06-11 18:37:52 +0000678/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
679 0 otherwise.
680 All characters except those characters defined in the Unicode character
681 database as following categories are considered printable.
682 * Cc (Other, Control)
683 * Cf (Other, Format)
684 * Cs (Other, Surrogate)
685 * Co (Other, Private Use)
686 * Cn (Other, Not Assigned)
687 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
688 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
689 * Zs (Separator, Space) other than ASCII space('\x20').
690*/
691int _PyUnicode_IsPrintable(Py_UNICODE ch)
692{
693 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
694
Georg Brandld52429f2008-07-04 15:55:02 +0000695 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000696}
697
Guido van Rossum603484d2000-03-10 22:52:46 +0000698#ifndef WANT_WCTYPE_FUNCTIONS
699
Guido van Rossumdc742b32000-04-11 15:39:02 +0000700/* Returns 1 for Unicode characters having the bidirectional type
701 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000702
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +0000703int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000704{
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +0000705 switch (ch) {
706 case 0x0009: /* HORIZONTAL TABULATION */
707 case 0x000A: /* LINE FEED */
708 case 0x000B: /* VERTICAL TABULATION */
709 case 0x000C: /* FORM FEED */
710 case 0x000D: /* CARRIAGE RETURN */
711 case 0x001C: /* FILE SEPARATOR */
712 case 0x001D: /* GROUP SEPARATOR */
713 case 0x001E: /* RECORD SEPARATOR */
714 case 0x001F: /* UNIT SEPARATOR */
715 case 0x0020: /* SPACE */
716 case 0x0085: /* NEXT LINE */
717 case 0x00A0: /* NO-BREAK SPACE */
718 case 0x1680: /* OGHAM SPACE MARK */
719 case 0x2000: /* EN QUAD */
720 case 0x2001: /* EM QUAD */
721 case 0x2002: /* EN SPACE */
722 case 0x2003: /* EM SPACE */
723 case 0x2004: /* THREE-PER-EM SPACE */
724 case 0x2005: /* FOUR-PER-EM SPACE */
725 case 0x2006: /* SIX-PER-EM SPACE */
726 case 0x2007: /* FIGURE SPACE */
727 case 0x2008: /* PUNCTUATION SPACE */
728 case 0x2009: /* THIN SPACE */
729 case 0x200A: /* HAIR SPACE */
730 case 0x200B: /* ZERO WIDTH SPACE */
731 case 0x2028: /* LINE SEPARATOR */
732 case 0x2029: /* PARAGRAPH SEPARATOR */
733 case 0x202F: /* NARROW NO-BREAK SPACE */
734 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
735 case 0x3000: /* IDEOGRAPHIC SPACE */
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000736 return 1;
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +0000737 default:
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000738 return 0;
Marc-André Lemburg2cb94ab2005-10-20 19:06:35 +0000739 }
Guido van Rossum603484d2000-03-10 22:52:46 +0000740}
741
742/* Returns 1 for Unicode characters having the category 'Ll', 0
743 otherwise. */
744
Fredrik Lundh72b06852001-06-27 22:08:26 +0000745int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000746{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000747 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
748
749 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000750}
751
752/* Returns 1 for Unicode characters having the category 'Lu', 0
753 otherwise. */
754
Fredrik Lundh72b06852001-06-27 22:08:26 +0000755int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000756{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000757 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
758
759 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000760}
761
762/* Returns the uppercase Unicode characters corresponding to ch or just
763 ch if no uppercase mapping is known. */
764
Fredrik Lundh72b06852001-06-27 22:08:26 +0000765Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000766{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000767 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000768 int delta = ctype->upper;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000769 if (ctype->flags & NODELTA_MASK)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000770 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000771 if (delta >= 32768)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000772 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000773 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000774}
775
776/* Returns the lowercase Unicode characters corresponding to ch or just
777 ch if no lowercase mapping is known. */
778
Fredrik Lundh72b06852001-06-27 22:08:26 +0000779Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000780{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000781 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000782 int delta = ctype->lower;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000783 if (ctype->flags & NODELTA_MASK)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000784 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000785 if (delta >= 32768)
Antoine Pitrou7f14f0d2010-05-09 16:14:21 +0000786 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000787 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000788}
789
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000790/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
791 'Lo' or 'Lm', 0 otherwise. */
792
Fredrik Lundh72b06852001-06-27 22:08:26 +0000793int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000794{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000795 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000796
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000797 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000798}
799
Guido van Rossum603484d2000-03-10 22:52:46 +0000800#else
801
802/* Export the interfaces using the wchar_t type for portability
803 reasons: */
804
Fredrik Lundh72b06852001-06-27 22:08:26 +0000805int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000806{
807 return iswspace(ch);
808}
809
Fredrik Lundh72b06852001-06-27 22:08:26 +0000810int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000811{
812 return iswlower(ch);
813}
814
Fredrik Lundh72b06852001-06-27 22:08:26 +0000815int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000816{
817 return iswupper(ch);
818}
819
Fredrik Lundh72b06852001-06-27 22:08:26 +0000820Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000821{
822 return towlower(ch);
823}
824
Fredrik Lundh72b06852001-06-27 22:08:26 +0000825Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000826{
827 return towupper(ch);
828}
829
Fredrik Lundh72b06852001-06-27 22:08:26 +0000830int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000831{
832 return iswalpha(ch);
833}
834
Guido van Rossum603484d2000-03-10 22:52:46 +0000835#endif