/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012
/* Bit flags tested against the `flags` member of _PyUnicode_TypeRecord.
   Each mask occupies a distinct bit; the largest (0x4000) still fits in
   the 16-bit `unsigned short` used for that member. */
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
#define LOWER_MASK 0x08
#define LINEBREAK_MASK 0x10
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
#define NUMERIC_MASK 0x800
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK 0x2000
#define EXTENDED_CASE_MASK 0x4000
Jack Jansen56cdce32000-07-06 13:57:38 +000028
/* Per-character-class properties.  gettyperecord() maps a code point to
   one of these records.  The member order must not change: the records
   are presumably filled positionally by the generated database header
   (NOTE(review): confirm against unicodetype_db.h). */
typedef struct {
    /*
       These are either deltas to the character or offsets in
       _PyUnicode_ExtendedCase.  Which interpretation applies is selected
       by EXTENDED_CASE_MASK in `flags`.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    /* Value returned by _PyUnicode_ToDecimalDigit() when DECIMAL_MASK is set. */
    const unsigned char decimal;
    /* Value returned by _PyUnicode_ToDigit() when DIGIT_MASK is set. */
    const unsigned char digit;
    /* Bitwise OR of the *_MASK flags. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
42
43#include "unicodetype_db.h"
44
45static const _PyUnicode_TypeRecord *
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000046gettyperecord(Py_UCS4 code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000047{
48 int index;
49
Martin v. Löwis9def6a32002-10-18 16:11:54 +000050 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000051 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000052 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000053 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000054 index = index1[(code>>SHIFT)];
55 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
56 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000057
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000058 return &_PyUnicode_TypeRecords[index];
59}
Jack Jansen56cdce32000-07-06 13:57:38 +000060
Guido van Rossum603484d2000-03-10 22:52:46 +000061/* Returns the titlecase Unicode characters corresponding to ch or just
62 ch if no titlecase mapping is known. */
63
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020064Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000065{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000066 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000067
Benjamin Petersonc6630b92012-01-15 21:33:32 -050068 if (ctype->flags & EXTENDED_CASE_MASK)
69 return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -050070 return ch + ctype->title;
Guido van Rossum603484d2000-03-10 22:52:46 +000071}
72
73/* Returns 1 for Unicode characters having the category 'Lt', 0
74 otherwise. */
75
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000076int _PyUnicode_IsTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000077{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000078 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
79
80 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000081}
82
Martin v. Löwis13c3e382007-08-14 22:37:03 +000083/* Returns 1 for Unicode characters having the XID_Start property, 0
84 otherwise. */
85
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000086int _PyUnicode_IsXidStart(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000087{
88 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
89
90 return (ctype->flags & XID_START_MASK) != 0;
91}
92
93/* Returns 1 for Unicode characters having the XID_Continue property,
94 0 otherwise. */
95
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000096int _PyUnicode_IsXidContinue(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000097{
98 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
99
100 return (ctype->flags & XID_CONTINUE_MASK) != 0;
101}
102
Guido van Rossum603484d2000-03-10 22:52:46 +0000103/* Returns the integer decimal (0-9) for Unicode characters having
104 this property, -1 otherwise. */
105
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000106int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000107{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000108 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
109
110 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000111}
112
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000113int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000114{
115 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000116 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000117 return 1;
118}
119
120/* Returns the integer digit (0-9) for Unicode characters having
121 this property, -1 otherwise. */
122
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000123int _PyUnicode_ToDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000124{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000125 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
126
127 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000128}
129
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000130int _PyUnicode_IsDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000131{
132 if (_PyUnicode_ToDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000133 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000134 return 1;
135}
136
137/* Returns the numeric value as double for Unicode characters having
138 this property, -1.0 otherwise. */
139
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000140int _PyUnicode_IsNumeric(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000141{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000142 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
143
144 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000145}
146
Georg Brandl559e5d72008-06-11 18:37:52 +0000147/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
148 0 otherwise.
149 All characters except those characters defined in the Unicode character
150 database as following categories are considered printable.
151 * Cc (Other, Control)
152 * Cf (Other, Format)
153 * Cs (Other, Surrogate)
154 * Co (Other, Private Use)
155 * Cn (Other, Not Assigned)
156 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
157 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
158 * Zs (Separator, Space) other than ASCII space('\x20').
159*/
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000160int _PyUnicode_IsPrintable(Py_UCS4 ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000161{
162 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
163
Georg Brandld52429f2008-07-04 15:55:02 +0000164 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000165}
166
Guido van Rossum603484d2000-03-10 22:52:46 +0000167/* Returns 1 for Unicode characters having the category 'Ll', 0
168 otherwise. */
169
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000170int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000171{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000172 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
173
174 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000175}
176
177/* Returns 1 for Unicode characters having the category 'Lu', 0
178 otherwise. */
179
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000180int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000181{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000182 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
183
184 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000185}
186
187/* Returns the uppercase Unicode characters corresponding to ch or just
188 ch if no uppercase mapping is known. */
189
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000190Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000191{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000192 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500193
194 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500195 return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500196 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +0000197}
198
199/* Returns the lowercase Unicode characters corresponding to ch or just
200 ch if no lowercase mapping is known. */
201
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000202Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000203{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000204 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500205
206 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500207 return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500208 return ch + ctype->lower;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500209}
210
211int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
212{
213 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
214
215 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500216 int index = ctype->lower & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500217 int n = ctype->lower >> 24;
218 int i;
219 for (i = 0; i < n; i++)
220 res[i] = _PyUnicode_ExtendedCase[index + i];
221 return n;
222 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500223 res[0] = ch + ctype->lower;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500224 return 1;
225}
226
227int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
228{
229 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
230
231 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500232 int index = ctype->title & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500233 int n = ctype->title >> 24;
234 int i;
235 for (i = 0; i < n; i++)
236 res[i] = _PyUnicode_ExtendedCase[index + i];
237 return n;
238 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500239 res[0] = ch + ctype->title;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500240 return 1;
241}
242
243int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
244{
245 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
246
247 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500248 int index = ctype->upper & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500249 int n = ctype->upper >> 24;
250 int i;
251 for (i = 0; i < n; i++)
252 res[i] = _PyUnicode_ExtendedCase[index + i];
253 return n;
254 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500255 res[0] = ch + ctype->upper;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500256 return 1;
257}
258
Benjamin Petersond5890c82012-01-14 13:23:30 -0500259int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
260{
261 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
262
263 if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
264 int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
265 int n = (ctype->lower >> 20) & 7;
266 int i;
267 for (i = 0; i < n; i++)
268 res[i] = _PyUnicode_ExtendedCase[index + i];
269 return n;
270 }
271 return _PyUnicode_ToLowerFull(ch, res);
272}
273
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500274int _PyUnicode_IsCased(Py_UCS4 ch)
275{
276 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
277
278 return (ctype->flags & CASED_MASK) != 0;
279}
280
281int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
282{
283 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
284
285 return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000286}
287
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000288/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
289 'Lo' or 'Lm', 0 otherwise. */
290
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000291int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000292{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000293 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000294
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000295 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000296}
297