blob: 106726d7fceefd26221446c6aa4e54942ad10437 [file] [log] [blame]
Guido van Rossum603484d2000-03-10 22:52:46 +00001/*
2 Unicode character type helpers.
3
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00004 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Guido van Rossum603484d2000-03-10 22:52:46 +00006
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00007 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum603484d2000-03-10 22:52:46 +00008
9*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags kept in the `flags` member of a _PyUnicode_TypeRecord.
   Each flag is tested by one of the table-driven helpers in this file. */
#define ALPHA_MASK      0x01    /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK    0x02    /* tested by _PyUnicode_ToDecimalDigit */
#define DIGIT_MASK      0x04    /* tested by _PyUnicode_ToDigit */
#define LOWER_MASK      0x08    /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK  0x10    /* tested by _PyUnicode_IsLinebreak */
#define SPACE_MASK      0x20    /* tested by _PyUnicode_IsWhitespace */
#define TITLE_MASK      0x40    /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK      0x80    /* tested by _PyUnicode_IsUppercase */
Jack Jansen56cdce32000-07-06 13:57:38 +000022
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000023typedef struct {
24 const unsigned short flags;
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30} _PyUnicode_TypeRecord;
31
32#include "unicodetype_db.h"
33
34static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000035gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000036{
37 int index;
38
Martin v. Löwis9def6a32002-10-18 16:11:54 +000039 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000040 index = 0;
41 else {
42 index = index1[(code>>SHIFT)];
43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
44 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000045
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046 return &_PyUnicode_TypeRecords[index];
47}
Jack Jansen56cdce32000-07-06 13:57:38 +000048
Guido van Rossum603484d2000-03-10 22:52:46 +000049/* Returns 1 for Unicode characters having the category 'Zl' or type
50 'B', 0 otherwise. */
51
Fredrik Lundh72b06852001-06-27 22:08:26 +000052int _PyUnicode_IsLinebreak(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000053{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000054 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
55
56 return (ctype->flags & LINEBREAK_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000057}
58
59/* Returns the titlecase Unicode characters corresponding to ch or just
60 ch if no titlecase mapping is known. */
61
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000062Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000063{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000064 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +000065 int delta;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000066
67 if (ctype->title)
Martin v. Löwisedf368c2002-10-18 16:40:36 +000068 delta = ctype->title;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000069 else
Martin v. Löwisedf368c2002-10-18 16:40:36 +000070 delta = ctype->upper;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000071
Martin v. Löwisedf368c2002-10-18 16:40:36 +000072 if (delta >= 32768)
73 delta -= 65536;
74
75 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000076}
77
78/* Returns 1 for Unicode characters having the category 'Lt', 0
79 otherwise. */
80
Fredrik Lundh72b06852001-06-27 22:08:26 +000081int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000082{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000083 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
84
85 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000086}
87
88/* Returns the integer decimal (0-9) for Unicode characters having
89 this property, -1 otherwise. */
90
Fredrik Lundh72b06852001-06-27 22:08:26 +000091int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000092{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000093 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
94
95 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +000096}
97
Fredrik Lundh72b06852001-06-27 22:08:26 +000098int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000099{
100 if (_PyUnicode_ToDecimalDigit(ch) < 0)
101 return 0;
102 return 1;
103}
104
105/* Returns the integer digit (0-9) for Unicode characters having
106 this property, -1 otherwise. */
107
Fredrik Lundh72b06852001-06-27 22:08:26 +0000108int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000109{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000110 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
111
112 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000113}
114
Fredrik Lundh72b06852001-06-27 22:08:26 +0000115int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000116{
117 if (_PyUnicode_ToDigit(ch) < 0)
118 return 0;
119 return 1;
120}
121
122/* Returns the numeric value as double for Unicode characters having
123 this property, -1.0 otherwise. */
124
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000125/* TODO: replace with unicodetype_db.h table */
126
Fredrik Lundh72b06852001-06-27 22:08:26 +0000127double _PyUnicode_ToNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000128{
129 switch (ch) {
130 case 0x3007:
131 return (double) 0;
132 case 0x09F4:
133 case 0x215F:
134 case 0x2160:
135 case 0x2170:
136 case 0x3021:
137 case 0x3280:
138 return (double) 1;
139 case 0x00BD:
140 return (double) 1 / 2;
141 case 0x2153:
142 return (double) 1 / 3;
143 case 0x00BC:
144 return (double) 1 / 4;
145 case 0x2155:
146 return (double) 1 / 5;
147 case 0x2159:
148 return (double) 1 / 6;
149 case 0x215B:
150 return (double) 1 / 8;
151 case 0x0BF0:
152 case 0x1372:
153 case 0x2169:
154 case 0x2179:
155 case 0x2469:
156 case 0x247D:
157 case 0x2491:
158 case 0x277F:
159 case 0x2789:
160 case 0x2793:
161 case 0x3038:
162 case 0x3289:
163 return (double) 10;
164 case 0x0BF1:
165 case 0x137B:
166 case 0x216D:
167 case 0x217D:
168 return (double) 100;
169 case 0x0BF2:
170 case 0x216F:
171 case 0x217F:
172 case 0x2180:
173 return (double) 1000;
174 case 0x137C:
175 case 0x2182:
176 return (double) 10000;
177 case 0x216A:
178 case 0x217A:
179 case 0x246A:
180 case 0x247E:
181 case 0x2492:
182 return (double) 11;
183 case 0x216B:
184 case 0x217B:
185 case 0x246B:
186 case 0x247F:
187 case 0x2493:
188 return (double) 12;
189 case 0x246C:
190 case 0x2480:
191 case 0x2494:
192 return (double) 13;
193 case 0x246D:
194 case 0x2481:
195 case 0x2495:
196 return (double) 14;
197 case 0x246E:
198 case 0x2482:
199 case 0x2496:
200 return (double) 15;
201 case 0x09F9:
202 case 0x246F:
203 case 0x2483:
204 case 0x2497:
205 return (double) 16;
206 case 0x16EE:
207 case 0x2470:
208 case 0x2484:
209 case 0x2498:
210 return (double) 17;
211 case 0x16EF:
212 case 0x2471:
213 case 0x2485:
214 case 0x2499:
215 return (double) 18;
216 case 0x16F0:
217 case 0x2472:
218 case 0x2486:
219 case 0x249A:
220 return (double) 19;
221 case 0x09F5:
222 case 0x2161:
223 case 0x2171:
224 case 0x3022:
225 case 0x3281:
226 return (double) 2;
227 case 0x2154:
228 return (double) 2 / 3;
229 case 0x2156:
230 return (double) 2 / 5;
231 case 0x1373:
232 case 0x2473:
233 case 0x2487:
234 case 0x249B:
235 case 0x3039:
236 return (double) 20;
237 case 0x09F6:
238 case 0x2162:
239 case 0x2172:
240 case 0x3023:
241 case 0x3282:
242 return (double) 3;
243 case 0x00BE:
244 return (double) 3 / 4;
245 case 0x2157:
246 return (double) 3 / 5;
247 case 0x215C:
248 return (double) 3 / 8;
249 case 0x1374:
250 case 0x303A:
251 return (double) 30;
252 case 0x09F7:
253 case 0x2163:
254 case 0x2173:
255 case 0x3024:
256 case 0x3283:
257 return (double) 4;
258 case 0x2158:
259 return (double) 4 / 5;
260 case 0x1375:
261 return (double) 40;
262 case 0x2164:
263 case 0x2174:
264 case 0x3025:
265 case 0x3284:
266 return (double) 5;
267 case 0x215A:
268 return (double) 5 / 6;
269 case 0x215D:
270 return (double) 5 / 8;
271 case 0x1376:
272 case 0x216C:
273 case 0x217C:
274 return (double) 50;
275 case 0x216E:
276 case 0x217E:
277 return (double) 500;
278 case 0x2181:
279 return (double) 5000;
280 case 0x2165:
281 case 0x2175:
282 case 0x3026:
283 case 0x3285:
284 return (double) 6;
285 case 0x1377:
286 return (double) 60;
287 case 0x2166:
288 case 0x2176:
289 case 0x3027:
290 case 0x3286:
291 return (double) 7;
292 case 0x215E:
293 return (double) 7 / 8;
294 case 0x1378:
295 return (double) 70;
296 case 0x2167:
297 case 0x2177:
298 case 0x3028:
299 case 0x3287:
300 return (double) 8;
301 case 0x1379:
302 return (double) 80;
303 case 0x2168:
304 case 0x2178:
305 case 0x3029:
306 case 0x3288:
307 return (double) 9;
308 case 0x137A:
309 return (double) 90;
310 default:
311 return (double) _PyUnicode_ToDigit(ch);
312 }
313}
314
Fredrik Lundh72b06852001-06-27 22:08:26 +0000315int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000316{
317 if (_PyUnicode_ToNumeric(ch) < 0.0)
318 return 0;
319 return 1;
320}
321
322#ifndef WANT_WCTYPE_FUNCTIONS
323
Guido van Rossumdc742b32000-04-11 15:39:02 +0000324/* Returns 1 for Unicode characters having the bidirectional type
325 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000326
Fredrik Lundh72b06852001-06-27 22:08:26 +0000327int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000328{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000329 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
330
331 return (ctype->flags & SPACE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000332}
333
334/* Returns 1 for Unicode characters having the category 'Ll', 0
335 otherwise. */
336
Fredrik Lundh72b06852001-06-27 22:08:26 +0000337int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000338{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000339 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
340
341 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000342}
343
344/* Returns 1 for Unicode characters having the category 'Lu', 0
345 otherwise. */
346
Fredrik Lundh72b06852001-06-27 22:08:26 +0000347int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000348{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000349 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
350
351 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000352}
353
354/* Returns the uppercase Unicode characters corresponding to ch or just
355 ch if no uppercase mapping is known. */
356
Fredrik Lundh72b06852001-06-27 22:08:26 +0000357Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000358{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000359 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000360 int delta = ctype->upper;
361 if (delta >= 32768)
362 delta -= 65536;
363 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000364}
365
366/* Returns the lowercase Unicode characters corresponding to ch or just
367 ch if no lowercase mapping is known. */
368
Fredrik Lundh72b06852001-06-27 22:08:26 +0000369Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000370{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000371 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000372 int delta = ctype->lower;
373 if (delta >= 32768)
374 delta -= 65536;
375 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000376}
377
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000378/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
379 'Lo' or 'Lm', 0 otherwise. */
380
Fredrik Lundh72b06852001-06-27 22:08:26 +0000381int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000382{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000383 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000384
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000385 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000386}
387
Guido van Rossum603484d2000-03-10 22:52:46 +0000388#else
389
390/* Export the interfaces using the wchar_t type for portability
391 reasons: */
392
Fredrik Lundh72b06852001-06-27 22:08:26 +0000393int _PyUnicode_IsWhitespace(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000394{
395 return iswspace(ch);
396}
397
Fredrik Lundh72b06852001-06-27 22:08:26 +0000398int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000399{
400 return iswlower(ch);
401}
402
Fredrik Lundh72b06852001-06-27 22:08:26 +0000403int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000404{
405 return iswupper(ch);
406}
407
Fredrik Lundh72b06852001-06-27 22:08:26 +0000408Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000409{
410 return towlower(ch);
411}
412
Fredrik Lundh72b06852001-06-27 22:08:26 +0000413Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000414{
415 return towupper(ch);
416}
417
Fredrik Lundh72b06852001-06-27 22:08:26 +0000418int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000419{
420 return iswalpha(ch);
421}
422
Guido van Rossum603484d2000-03-10 22:52:46 +0000423#endif