blob: 7ee6a6c01538aba3f860bdbc3e06742d817f7647 [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags stored in _PyUnicode_TypeRecord.flags.  Each one is tested
   by the helper named next to it. */
#define ALPHA_MASK      0x01    /* _PyUnicode_IsAlpha */
#define DECIMAL_MASK    0x02    /* _PyUnicode_ToDecimalDigit */
#define DIGIT_MASK      0x04    /* _PyUnicode_ToDigit */
#define LOWER_MASK      0x08    /* _PyUnicode_IsLowercase */
#define LINEBREAK_MASK  0x10    /* _PyUnicode_IsLinebreak */
#define SPACE_MASK      0x20    /* _PyUnicode_IsWhitespace */
#define TITLE_MASK      0x40    /* _PyUnicode_IsTitlecase */
#define UPPER_MASK      0x80    /* _PyUnicode_IsUppercase */
Jack Jansen56cdce32000-07-06 13:57:38 +000022
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000023typedef struct {
24 const unsigned short flags;
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30} _PyUnicode_TypeRecord;
31
32#include "unicodetype_db.h"
33
34static const _PyUnicode_TypeRecord *
35gettyperecord(int code)
36{
37 int index;
38
39 if (code < 0 || code >= 65536)
40 index = 0;
41 else {
42 index = index1[(code>>SHIFT)];
43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
44 }
45 return &_PyUnicode_TypeRecords[index];
46}
Jack Jansen56cdce32000-07-06 13:57:38 +000047
Guido van Rossum603484d2000-03-10 22:52:46 +000048/* Returns 1 for Unicode characters having the category 'Zl' or type
49 'B', 0 otherwise. */
50
51int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
52{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000053 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
54
55 return (ctype->flags & LINEBREAK_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000056}
57
58/* Returns the titlecase Unicode characters corresponding to ch or just
59 ch if no titlecase mapping is known. */
60
61Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
62{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000063 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
64
65 if (ctype->title)
66 return ch + ctype->title;
67
68 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +000069}
70
71/* Returns 1 for Unicode characters having the category 'Lt', 0
72 otherwise. */
73
74int _PyUnicode_IsTitlecase(register const Py_UNICODE ch)
75{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000076 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
77
78 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000079}
80
81/* Returns the integer decimal (0-9) for Unicode characters having
82 this property, -1 otherwise. */
83
84int _PyUnicode_ToDecimalDigit(register const Py_UNICODE ch)
85{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000086 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +000089}
90
91int _PyUnicode_IsDecimalDigit(register const Py_UNICODE ch)
92{
93 if (_PyUnicode_ToDecimalDigit(ch) < 0)
94 return 0;
95 return 1;
96}
97
98/* Returns the integer digit (0-9) for Unicode characters having
99 this property, -1 otherwise. */
100
101int _PyUnicode_ToDigit(register const Py_UNICODE ch)
102{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000103 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
104
105 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000106}
107
108int _PyUnicode_IsDigit(register const Py_UNICODE ch)
109{
110 if (_PyUnicode_ToDigit(ch) < 0)
111 return 0;
112 return 1;
113}
114
115/* Returns the numeric value as double for Unicode characters having
116 this property, -1.0 otherwise. */
117
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000118/* TODO: replace with unicodetype_db.h table */
119
Guido van Rossum603484d2000-03-10 22:52:46 +0000120double _PyUnicode_ToNumeric(register const Py_UNICODE ch)
121{
122 switch (ch) {
123 case 0x3007:
124 return (double) 0;
125 case 0x09F4:
126 case 0x215F:
127 case 0x2160:
128 case 0x2170:
129 case 0x3021:
130 case 0x3280:
131 return (double) 1;
132 case 0x00BD:
133 return (double) 1 / 2;
134 case 0x2153:
135 return (double) 1 / 3;
136 case 0x00BC:
137 return (double) 1 / 4;
138 case 0x2155:
139 return (double) 1 / 5;
140 case 0x2159:
141 return (double) 1 / 6;
142 case 0x215B:
143 return (double) 1 / 8;
144 case 0x0BF0:
145 case 0x1372:
146 case 0x2169:
147 case 0x2179:
148 case 0x2469:
149 case 0x247D:
150 case 0x2491:
151 case 0x277F:
152 case 0x2789:
153 case 0x2793:
154 case 0x3038:
155 case 0x3289:
156 return (double) 10;
157 case 0x0BF1:
158 case 0x137B:
159 case 0x216D:
160 case 0x217D:
161 return (double) 100;
162 case 0x0BF2:
163 case 0x216F:
164 case 0x217F:
165 case 0x2180:
166 return (double) 1000;
167 case 0x137C:
168 case 0x2182:
169 return (double) 10000;
170 case 0x216A:
171 case 0x217A:
172 case 0x246A:
173 case 0x247E:
174 case 0x2492:
175 return (double) 11;
176 case 0x216B:
177 case 0x217B:
178 case 0x246B:
179 case 0x247F:
180 case 0x2493:
181 return (double) 12;
182 case 0x246C:
183 case 0x2480:
184 case 0x2494:
185 return (double) 13;
186 case 0x246D:
187 case 0x2481:
188 case 0x2495:
189 return (double) 14;
190 case 0x246E:
191 case 0x2482:
192 case 0x2496:
193 return (double) 15;
194 case 0x09F9:
195 case 0x246F:
196 case 0x2483:
197 case 0x2497:
198 return (double) 16;
199 case 0x16EE:
200 case 0x2470:
201 case 0x2484:
202 case 0x2498:
203 return (double) 17;
204 case 0x16EF:
205 case 0x2471:
206 case 0x2485:
207 case 0x2499:
208 return (double) 18;
209 case 0x16F0:
210 case 0x2472:
211 case 0x2486:
212 case 0x249A:
213 return (double) 19;
214 case 0x09F5:
215 case 0x2161:
216 case 0x2171:
217 case 0x3022:
218 case 0x3281:
219 return (double) 2;
220 case 0x2154:
221 return (double) 2 / 3;
222 case 0x2156:
223 return (double) 2 / 5;
224 case 0x1373:
225 case 0x2473:
226 case 0x2487:
227 case 0x249B:
228 case 0x3039:
229 return (double) 20;
230 case 0x09F6:
231 case 0x2162:
232 case 0x2172:
233 case 0x3023:
234 case 0x3282:
235 return (double) 3;
236 case 0x00BE:
237 return (double) 3 / 4;
238 case 0x2157:
239 return (double) 3 / 5;
240 case 0x215C:
241 return (double) 3 / 8;
242 case 0x1374:
243 case 0x303A:
244 return (double) 30;
245 case 0x09F7:
246 case 0x2163:
247 case 0x2173:
248 case 0x3024:
249 case 0x3283:
250 return (double) 4;
251 case 0x2158:
252 return (double) 4 / 5;
253 case 0x1375:
254 return (double) 40;
255 case 0x2164:
256 case 0x2174:
257 case 0x3025:
258 case 0x3284:
259 return (double) 5;
260 case 0x215A:
261 return (double) 5 / 6;
262 case 0x215D:
263 return (double) 5 / 8;
264 case 0x1376:
265 case 0x216C:
266 case 0x217C:
267 return (double) 50;
268 case 0x216E:
269 case 0x217E:
270 return (double) 500;
271 case 0x2181:
272 return (double) 5000;
273 case 0x2165:
274 case 0x2175:
275 case 0x3026:
276 case 0x3285:
277 return (double) 6;
278 case 0x1377:
279 return (double) 60;
280 case 0x2166:
281 case 0x2176:
282 case 0x3027:
283 case 0x3286:
284 return (double) 7;
285 case 0x215E:
286 return (double) 7 / 8;
287 case 0x1378:
288 return (double) 70;
289 case 0x2167:
290 case 0x2177:
291 case 0x3028:
292 case 0x3287:
293 return (double) 8;
294 case 0x1379:
295 return (double) 80;
296 case 0x2168:
297 case 0x2178:
298 case 0x3029:
299 case 0x3288:
300 return (double) 9;
301 case 0x137A:
302 return (double) 90;
303 default:
304 return (double) _PyUnicode_ToDigit(ch);
305 }
306}
307
308int _PyUnicode_IsNumeric(register const Py_UNICODE ch)
309{
310 if (_PyUnicode_ToNumeric(ch) < 0.0)
311 return 0;
312 return 1;
313}
314
315#ifndef WANT_WCTYPE_FUNCTIONS
316
Guido van Rossumdc742b32000-04-11 15:39:02 +0000317/* Returns 1 for Unicode characters having the bidirectional type
318 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
Guido van Rossum603484d2000-03-10 22:52:46 +0000319
320int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
321{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000322 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
323
324 return (ctype->flags & SPACE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000325}
326
327/* Returns 1 for Unicode characters having the category 'Ll', 0
328 otherwise. */
329
330int _PyUnicode_IsLowercase(register const Py_UNICODE ch)
331{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000332 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
333
334 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000335}
336
337/* Returns 1 for Unicode characters having the category 'Lu', 0
338 otherwise. */
339
340int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
341{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000342 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
343
344 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000345}
346
347/* Returns the uppercase Unicode characters corresponding to ch or just
348 ch if no uppercase mapping is known. */
349
350Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
351{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000352 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
353
354 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +0000355}
356
357/* Returns the lowercase Unicode characters corresponding to ch or just
358 ch if no lowercase mapping is known. */
359
360Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
361{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000362 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
363
364 return ch + ctype->lower;
Guido van Rossum603484d2000-03-10 22:52:46 +0000365}
366
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000367/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
368 'Lo' or 'Lm', 0 otherwise. */
369
370int _PyUnicode_IsAlpha(register const Py_UNICODE ch)
371{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000372 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000373
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000374 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000375}
376
Guido van Rossum603484d2000-03-10 22:52:46 +0000377#else
378
/* Export the interfaces using the wchar_t type for portability
   reasons: */
381
382int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
383{
384 return iswspace(ch);
385}
386
387int _PyUnicode_IsLowercase(register const Py_UNICODE ch)
388{
389 return iswlower(ch);
390}
391
392int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
393{
394 return iswupper(ch);
395}
396
397Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
398{
399 return towlower(ch);
400}
401
402Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
403{
404 return towupper(ch);
405}
406
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000407int _PyUnicode_IsAlpha(register const Py_UNICODE ch)
408{
409 return iswalpha(ch);
410}
411
Guido van Rossum603484d2000-03-10 22:52:46 +0000412#endif