/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012
/* Bit flags stored in the `flags` member of _PyUnicode_TypeRecord.
   Each flag occupies one distinct bit; the highest one (0x4000) still
   fits in 16 bits. */
#define ALPHA_MASK          0x0001
#define DECIMAL_MASK        0x0002
#define DIGIT_MASK          0x0004
#define LOWER_MASK          0x0008
#define LINEBREAK_MASK      0x0010
#define SPACE_MASK          0x0020
#define TITLE_MASK          0x0040
#define UPPER_MASK          0x0080
#define XID_START_MASK      0x0100
#define XID_CONTINUE_MASK   0x0200
#define PRINTABLE_MASK      0x0400
#define NUMERIC_MASK        0x0800
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK          0x2000
/* Set when upper/lower/title hold packed offsets into
   _PyUnicode_ExtendedCase instead of plain deltas. */
#define EXTENDED_CASE_MASK  0x4000
Jack Jansen56cdce32000-07-06 13:57:38 +000028
/* One entry of the character type database. */
typedef struct {
    /* Case mapping fields.  Each one is either a delta to be added to
       the character, or (when the record's flags contain
       EXTENDED_CASE_MASK) a packed offset into _PyUnicode_ExtendedCase. */
    const int upper;
    const int lower;
    const int title;
    /* Value returned by _PyUnicode_ToDecimalDigit when DECIMAL_MASK is set. */
    const unsigned char decimal;
    /* Value returned by _PyUnicode_ToDigit when DIGIT_MASK is set. */
    const unsigned char digit;
    /* Bitwise OR of the *_MASK flags. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
41
42#include "unicodetype_db.h"
43
44static const _PyUnicode_TypeRecord *
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000045gettyperecord(Py_UCS4 code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046{
47 int index;
48
Martin v. Löwis9def6a32002-10-18 16:11:54 +000049 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000050 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000051 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000052 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000053 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000056
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000057 return &_PyUnicode_TypeRecords[index];
58}
Jack Jansen56cdce32000-07-06 13:57:38 +000059
Guido van Rossum603484d2000-03-10 22:52:46 +000060/* Returns the titlecase Unicode characters corresponding to ch or just
61 ch if no titlecase mapping is known. */
62
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000063Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000064{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000065 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000066
Benjamin Petersonad9c5692012-01-15 21:19:20 -050067 return ch + ctype->title;
Guido van Rossum603484d2000-03-10 22:52:46 +000068}
69
70/* Returns 1 for Unicode characters having the category 'Lt', 0
71 otherwise. */
72
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000073int _PyUnicode_IsTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000074{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000075 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
76
77 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000078}
79
Martin v. Löwis13c3e382007-08-14 22:37:03 +000080/* Returns 1 for Unicode characters having the XID_Start property, 0
81 otherwise. */
82
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000083int _PyUnicode_IsXidStart(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000084{
85 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
86
87 return (ctype->flags & XID_START_MASK) != 0;
88}
89
90/* Returns 1 for Unicode characters having the XID_Continue property,
91 0 otherwise. */
92
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000093int _PyUnicode_IsXidContinue(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000094{
95 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
96
97 return (ctype->flags & XID_CONTINUE_MASK) != 0;
98}
99
Guido van Rossum603484d2000-03-10 22:52:46 +0000100/* Returns the integer decimal (0-9) for Unicode characters having
101 this property, -1 otherwise. */
102
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000103int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000104{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000105 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
106
107 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000108}
109
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000110int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000111{
112 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000113 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000114 return 1;
115}
116
117/* Returns the integer digit (0-9) for Unicode characters having
118 this property, -1 otherwise. */
119
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000120int _PyUnicode_ToDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000121{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000122 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
123
124 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000125}
126
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000127int _PyUnicode_IsDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000128{
129 if (_PyUnicode_ToDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000130 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000131 return 1;
132}
133
134/* Returns the numeric value as double for Unicode characters having
135 this property, -1.0 otherwise. */
136
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000137int _PyUnicode_IsNumeric(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000138{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000139 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
140
141 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000142}
143
Georg Brandl559e5d72008-06-11 18:37:52 +0000144/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
145 0 otherwise.
146 All characters except those characters defined in the Unicode character
147 database as following categories are considered printable.
148 * Cc (Other, Control)
149 * Cf (Other, Format)
150 * Cs (Other, Surrogate)
151 * Co (Other, Private Use)
152 * Cn (Other, Not Assigned)
153 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
154 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
155 * Zs (Separator, Space) other than ASCII space('\x20').
156*/
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000157int _PyUnicode_IsPrintable(Py_UCS4 ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000158{
159 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
160
Georg Brandld52429f2008-07-04 15:55:02 +0000161 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000162}
163
Guido van Rossum603484d2000-03-10 22:52:46 +0000164/* Returns 1 for Unicode characters having the category 'Ll', 0
165 otherwise. */
166
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000167int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000168{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000169 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
170
171 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000172}
173
174/* Returns 1 for Unicode characters having the category 'Lu', 0
175 otherwise. */
176
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000177int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000178{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000179 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
180
181 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000182}
183
184/* Returns the uppercase Unicode characters corresponding to ch or just
185 ch if no uppercase mapping is known. */
186
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000187Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000188{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000189 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500190
191 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500192 return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500193 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +0000194}
195
196/* Returns the lowercase Unicode characters corresponding to ch or just
197 ch if no lowercase mapping is known. */
198
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000199Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000200{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000201 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500202
203 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500204 return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500205 return ch + ctype->lower;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500206}
207
208int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
209{
210 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
211
212 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500213 int index = ctype->lower & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500214 int n = ctype->lower >> 24;
215 int i;
216 for (i = 0; i < n; i++)
217 res[i] = _PyUnicode_ExtendedCase[index + i];
218 return n;
219 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500220 res[0] = ch + ctype->lower;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500221 return 1;
222}
223
224int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
225{
226 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
227
228 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500229 int index = ctype->title & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500230 int n = ctype->title >> 24;
231 int i;
232 for (i = 0; i < n; i++)
233 res[i] = _PyUnicode_ExtendedCase[index + i];
234 return n;
235 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500236 res[0] = ch + ctype->title;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500237 return 1;
238}
239
240int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
241{
242 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
243
244 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500245 int index = ctype->upper & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500246 int n = ctype->upper >> 24;
247 int i;
248 for (i = 0; i < n; i++)
249 res[i] = _PyUnicode_ExtendedCase[index + i];
250 return n;
251 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500252 res[0] = ch + ctype->upper;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500253 return 1;
254}
255
Benjamin Petersond5890c82012-01-14 13:23:30 -0500256int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
257{
258 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
259
260 if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
261 int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
262 int n = (ctype->lower >> 20) & 7;
263 int i;
264 for (i = 0; i < n; i++)
265 res[i] = _PyUnicode_ExtendedCase[index + i];
266 return n;
267 }
268 return _PyUnicode_ToLowerFull(ch, res);
269}
270
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500271int _PyUnicode_IsCased(Py_UCS4 ch)
272{
273 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
274
275 return (ctype->flags & CASED_MASK) != 0;
276}
277
278int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
279{
280 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
281
282 return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000283}
284
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000285/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
286 'Lo' or 'Lm', 0 otherwise. */
287
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000288int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000289{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000290 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000291
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000292 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000293}
294