blob: d770bdbf6582fbda4cc6cac00d8b39aaf656c11d [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags tested against the `flags` member of a type record.
   Each mask occupies one distinct bit. */
#define ALPHA_MASK      (1 << 0)
#define DECIMAL_MASK    (1 << 1)
#define DIGIT_MASK      (1 << 2)
#define LOWER_MASK      (1 << 3)
#define LINEBREAK_MASK  (1 << 4)
#define SPACE_MASK      (1 << 5)
#define TITLE_MASK      (1 << 6)
#define UPPER_MASK      (1 << 7)
#define WIDE_MASK       (1 << 8)
Jack Jansen56cdce32000-07-06 13:57:38 +000023
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000024typedef struct {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000025 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000030 const unsigned short flags;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000031} _PyUnicode_TypeRecord;
32
33#include "unicodetype_db.h"
34
35static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000036gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000037{
38 int index;
39
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000040#ifdef Py_UNICODE_WIDE
Martin v. Löwis9def6a32002-10-18 16:11:54 +000041 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000042 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000043 else
44#endif
45 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046 index = index1[(code>>SHIFT)];
47 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
48 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000049
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000050 return &_PyUnicode_TypeRecords[index];
51}
Jack Jansen56cdce32000-07-06 13:57:38 +000052
Guido van Rossum603484d2000-03-10 22:52:46 +000053/* Returns 1 for Unicode characters having the category 'Zl' or type
54 'B', 0 otherwise. */
55
Fredrik Lundh72b06852001-06-27 22:08:26 +000056int _PyUnicode_IsLinebreak(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000057{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000058 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
59
60 return (ctype->flags & LINEBREAK_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000061}
62
63/* Returns the titlecase Unicode characters corresponding to ch or just
64 ch if no titlecase mapping is known. */
65
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000066Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000067{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000068 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +000069 int delta;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000070
71 if (ctype->title)
Martin v. Löwisedf368c2002-10-18 16:40:36 +000072 delta = ctype->title;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000073 else
Martin v. Löwisedf368c2002-10-18 16:40:36 +000074 delta = ctype->upper;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000075
Martin v. Löwisedf368c2002-10-18 16:40:36 +000076 if (delta >= 32768)
77 delta -= 65536;
78
79 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000080}
81
82/* Returns 1 for Unicode characters having the category 'Lt', 0
83 otherwise. */
84
Fredrik Lundh72b06852001-06-27 22:08:26 +000085int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000086{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000087 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
88
89 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000090}
91
92/* Returns the integer decimal (0-9) for Unicode characters having
93 this property, -1 otherwise. */
94
Fredrik Lundh72b06852001-06-27 22:08:26 +000095int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000096{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000097 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
98
99 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000100}
101
Fredrik Lundh72b06852001-06-27 22:08:26 +0000102int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000103{
104 if (_PyUnicode_ToDecimalDigit(ch) < 0)
105 return 0;
106 return 1;
107}
108
109/* Returns the integer digit (0-9) for Unicode characters having
110 this property, -1 otherwise. */
111
Fredrik Lundh72b06852001-06-27 22:08:26 +0000112int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000113{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000114 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
115
116 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000117}
118
Fredrik Lundh72b06852001-06-27 22:08:26 +0000119int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000120{
121 if (_PyUnicode_ToDigit(ch) < 0)
122 return 0;
123 return 1;
124}
125
126/* Returns the numeric value as double for Unicode characters having
127 this property, -1.0 otherwise. */
128
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000129/* TODO: replace with unicodetype_db.h table */
130
Fredrik Lundh72b06852001-06-27 22:08:26 +0000131double _PyUnicode_ToNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000132{
133 switch (ch) {
134 case 0x3007:
135 return (double) 0;
136 case 0x09F4:
137 case 0x215F:
138 case 0x2160:
139 case 0x2170:
140 case 0x3021:
141 case 0x3280:
142 return (double) 1;
143 case 0x00BD:
144 return (double) 1 / 2;
145 case 0x2153:
146 return (double) 1 / 3;
147 case 0x00BC:
148 return (double) 1 / 4;
149 case 0x2155:
150 return (double) 1 / 5;
151 case 0x2159:
152 return (double) 1 / 6;
153 case 0x215B:
154 return (double) 1 / 8;
155 case 0x0BF0:
156 case 0x1372:
157 case 0x2169:
158 case 0x2179:
159 case 0x2469:
160 case 0x247D:
161 case 0x2491:
162 case 0x277F:
163 case 0x2789:
164 case 0x2793:
165 case 0x3038:
166 case 0x3289:
167 return (double) 10;
168 case 0x0BF1:
169 case 0x137B:
170 case 0x216D:
171 case 0x217D:
172 return (double) 100;
173 case 0x0BF2:
174 case 0x216F:
175 case 0x217F:
176 case 0x2180:
177 return (double) 1000;
178 case 0x137C:
179 case 0x2182:
180 return (double) 10000;
181 case 0x216A:
182 case 0x217A:
183 case 0x246A:
184 case 0x247E:
185 case 0x2492:
186 return (double) 11;
187 case 0x216B:
188 case 0x217B:
189 case 0x246B:
190 case 0x247F:
191 case 0x2493:
192 return (double) 12;
193 case 0x246C:
194 case 0x2480:
195 case 0x2494:
196 return (double) 13;
197 case 0x246D:
198 case 0x2481:
199 case 0x2495:
200 return (double) 14;
201 case 0x246E:
202 case 0x2482:
203 case 0x2496:
204 return (double) 15;
205 case 0x09F9:
206 case 0x246F:
207 case 0x2483:
208 case 0x2497:
209 return (double) 16;
210 case 0x16EE:
211 case 0x2470:
212 case 0x2484:
213 case 0x2498:
214 return (double) 17;
215 case 0x16EF:
216 case 0x2471:
217 case 0x2485:
218 case 0x2499:
219 return (double) 18;
220 case 0x16F0:
221 case 0x2472:
222 case 0x2486:
223 case 0x249A:
224 return (double) 19;
225 case 0x09F5:
226 case 0x2161:
227 case 0x2171:
228 case 0x3022:
229 case 0x3281:
230 return (double) 2;
231 case 0x2154:
232 return (double) 2 / 3;
233 case 0x2156:
234 return (double) 2 / 5;
235 case 0x1373:
236 case 0x2473:
237 case 0x2487:
238 case 0x249B:
239 case 0x3039:
240 return (double) 20;
241 case 0x09F6:
242 case 0x2162:
243 case 0x2172:
244 case 0x3023:
245 case 0x3282:
246 return (double) 3;
247 case 0x00BE:
248 return (double) 3 / 4;
249 case 0x2157:
250 return (double) 3 / 5;
251 case 0x215C:
252 return (double) 3 / 8;
253 case 0x1374:
254 case 0x303A:
255 return (double) 30;
256 case 0x09F7:
257 case 0x2163:
258 case 0x2173:
259 case 0x3024:
260 case 0x3283:
261 return (double) 4;
262 case 0x2158:
263 return (double) 4 / 5;
264 case 0x1375:
265 return (double) 40;
266 case 0x2164:
267 case 0x2174:
268 case 0x3025:
269 case 0x3284:
270 return (double) 5;
271 case 0x215A:
272 return (double) 5 / 6;
273 case 0x215D:
274 return (double) 5 / 8;
275 case 0x1376:
276 case 0x216C:
277 case 0x217C:
278 return (double) 50;
279 case 0x216E:
280 case 0x217E:
281 return (double) 500;
282 case 0x2181:
283 return (double) 5000;
284 case 0x2165:
285 case 0x2175:
286 case 0x3026:
287 case 0x3285:
288 return (double) 6;
289 case 0x1377:
290 return (double) 60;
291 case 0x2166:
292 case 0x2176:
293 case 0x3027:
294 case 0x3286:
295 return (double) 7;
296 case 0x215E:
297 return (double) 7 / 8;
298 case 0x1378:
299 return (double) 70;
300 case 0x2167:
301 case 0x2177:
302 case 0x3028:
303 case 0x3287:
304 return (double) 8;
305 case 0x1379:
306 return (double) 80;
307 case 0x2168:
308 case 0x2178:
309 case 0x3029:
310 case 0x3288:
311 return (double) 9;
312 case 0x137A:
313 return (double) 90;
314 default:
315 return (double) _PyUnicode_ToDigit(ch);
316 }
317}
318
Fredrik Lundh72b06852001-06-27 22:08:26 +0000319int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000320{
321 if (_PyUnicode_ToNumeric(ch) < 0.0)
322 return 0;
323 return 1;
324}
325
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000326/* Returns 1 for Unicode characters having Full or Wide width, 0 otherwise */
327
328int _PyUnicode_IsWide(Py_UNICODE ch)
329{
330 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
331
332 return (ctype->flags & WIDE_MASK) != 0;
333}
334
Guido van Rossum603484d2000-03-10 22:52:46 +0000335#ifndef WANT_WCTYPE_FUNCTIONS
336
Guido van Rossumdc742b32000-04-11 15:39:02 +0000337/* Returns 1 for Unicode characters having the bidirectional type
338 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000339
Fredrik Lundh72b06852001-06-27 22:08:26 +0000340int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000341{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000342 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
343
344 return (ctype->flags & SPACE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000345}
346
347/* Returns 1 for Unicode characters having the category 'Ll', 0
348 otherwise. */
349
Fredrik Lundh72b06852001-06-27 22:08:26 +0000350int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000351{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000352 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
353
354 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000355}
356
357/* Returns 1 for Unicode characters having the category 'Lu', 0
358 otherwise. */
359
Fredrik Lundh72b06852001-06-27 22:08:26 +0000360int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000361{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000362 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
363
364 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000365}
366
367/* Returns the uppercase Unicode characters corresponding to ch or just
368 ch if no uppercase mapping is known. */
369
Fredrik Lundh72b06852001-06-27 22:08:26 +0000370Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000371{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000372 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000373 int delta = ctype->upper;
374 if (delta >= 32768)
375 delta -= 65536;
376 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000377}
378
379/* Returns the lowercase Unicode characters corresponding to ch or just
380 ch if no lowercase mapping is known. */
381
Fredrik Lundh72b06852001-06-27 22:08:26 +0000382Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000383{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000384 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000385 int delta = ctype->lower;
386 if (delta >= 32768)
387 delta -= 65536;
388 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000389}
390
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000391/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
392 'Lo' or 'Lm', 0 otherwise. */
393
Fredrik Lundh72b06852001-06-27 22:08:26 +0000394int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000395{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000396 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000397
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000398 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000399}
400
Guido van Rossum603484d2000-03-10 22:52:46 +0000401#else
402
/* Export the interfaces using the wchar_t type for portability
   reasons: */
405
Fredrik Lundh72b06852001-06-27 22:08:26 +0000406int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000407{
408 return iswspace(ch);
409}
410
Fredrik Lundh72b06852001-06-27 22:08:26 +0000411int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000412{
413 return iswlower(ch);
414}
415
Fredrik Lundh72b06852001-06-27 22:08:26 +0000416int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000417{
418 return iswupper(ch);
419}
420
Fredrik Lundh72b06852001-06-27 22:08:26 +0000421Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000422{
423 return towlower(ch);
424}
425
Fredrik Lundh72b06852001-06-27 22:08:26 +0000426Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000427{
428 return towupper(ch);
429}
430
Fredrik Lundh72b06852001-06-27 22:08:26 +0000431int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000432{
433 return iswalpha(ch);
434}
435
Guido van Rossum603484d2000-03-10 22:52:46 +0000436#endif