blob: b21fd35a4a44818dcd6bfbe0c3056ab37fd6601a [file] [log] [blame]
Guido van Rossum603484d2000-03-10 22:52:46 +00001/*
2 Unicode character type helpers.
3
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00004 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Guido van Rossum603484d2000-03-10 22:52:46 +00006
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00007 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum603484d2000-03-10 22:52:46 +00008
9*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012
/* Bit flags stored in _PyUnicode_TypeRecord.flags.  Each predicate in this
   file tests exactly one of them. */
#define ALPHA_MASK          0x01    /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK        0x02    /* record's decimal field is valid */
#define DIGIT_MASK          0x04    /* record's digit field is valid */
#define LOWER_MASK          0x08    /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK      0x10    /* not referenced in this part of the file */
#define SPACE_MASK          0x20    /* not referenced in this part of the file */
#define TITLE_MASK          0x40    /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK          0x80    /* tested by _PyUnicode_IsUppercase */
#define XID_START_MASK      0x100   /* tested by _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK   0x200   /* tested by _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK      0x400   /* tested by _PyUnicode_IsPrintable */
#define NUMERIC_MASK        0x800   /* tested by _PyUnicode_IsNumeric */
#define CASE_IGNORABLE_MASK 0x1000  /* tested by _PyUnicode_IsCaseIgnorable */
#define CASED_MASK          0x2000  /* tested by _PyUnicode_IsCased */
/* upper/lower/title refer to _PyUnicode_ExtendedCase instead of being deltas */
#define EXTENDED_CASE_MASK  0x4000
Jack Jansen56cdce32000-07-06 13:57:38 +000028
/* One entry of the character type database. */
typedef struct {
    /*
       Case mappings.  Each value is either a delta that is added to the
       character, or (for records flagged EXTENDED_CASE_MASK) a packed
       offset into _PyUnicode_ExtendedCase.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note: should more flag space be needed, decimal and digit could be
       unified. */
    const unsigned char decimal;
    const unsigned char digit;
    /* Bitwise combination of the *_MASK flags. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
42
43#include "unicodetype_db.h"
44
45static const _PyUnicode_TypeRecord *
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000046gettyperecord(Py_UCS4 code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000047{
48 int index;
49
Martin v. Löwis9def6a32002-10-18 16:11:54 +000050 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000051 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000052 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000053 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000054 index = index1[(code>>SHIFT)];
55 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
56 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000057
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000058 return &_PyUnicode_TypeRecords[index];
59}
Jack Jansen56cdce32000-07-06 13:57:38 +000060
Guido van Rossum603484d2000-03-10 22:52:46 +000061/* Returns the titlecase Unicode characters corresponding to ch or just
62 ch if no titlecase mapping is known. */
63
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000064Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000065{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000066 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000067
Benjamin Petersonad9c5692012-01-15 21:19:20 -050068 return ch + ctype->title;
Guido van Rossum603484d2000-03-10 22:52:46 +000069}
70
71/* Returns 1 for Unicode characters having the category 'Lt', 0
72 otherwise. */
73
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000074int _PyUnicode_IsTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000075{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000076 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
77
78 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000079}
80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000081/* Returns 1 for Unicode characters having the XID_Start property, 0
82 otherwise. */
83
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000084int _PyUnicode_IsXidStart(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000085{
86 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88 return (ctype->flags & XID_START_MASK) != 0;
89}
90
91/* Returns 1 for Unicode characters having the XID_Continue property,
92 0 otherwise. */
93
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000094int _PyUnicode_IsXidContinue(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000095{
96 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97
98 return (ctype->flags & XID_CONTINUE_MASK) != 0;
99}
100
Guido van Rossum603484d2000-03-10 22:52:46 +0000101/* Returns the integer decimal (0-9) for Unicode characters having
102 this property, -1 otherwise. */
103
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000104int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000105{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000106 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
107
108 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000109}
110
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000111int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000112{
113 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000114 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000115 return 1;
116}
117
118/* Returns the integer digit (0-9) for Unicode characters having
119 this property, -1 otherwise. */
120
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000121int _PyUnicode_ToDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000122{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000123 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
124
125 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000126}
127
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000128int _PyUnicode_IsDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000129{
130 if (_PyUnicode_ToDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000131 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000132 return 1;
133}
134
/* Returns 1 for Unicode characters flagged as numeric (NUMERIC_MASK set
   in the type record), 0 otherwise. */
137
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000138int _PyUnicode_IsNumeric(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000139{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000140 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
141
142 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000143}
144
/* Returns 1 for Unicode characters that are printable, i.e. that do not
   need to be hex-escaped when repr()ed, 0 otherwise.
   All characters except those characters defined in the Unicode character
   database as following categories are considered printable.
      * Cc (Other, Control)
      * Cf (Other, Format)
      * Cs (Other, Surrogate)
      * Co (Other, Private Use)
      * Cn (Other, Not Assigned)
      * Zl Separator, Line ('\u2028', LINE SEPARATOR)
      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
      * Zs (Separator, Space) other than ASCII space('\x20').
*/
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000158int _PyUnicode_IsPrintable(Py_UCS4 ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000159{
160 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
161
Georg Brandld52429f2008-07-04 15:55:02 +0000162 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000163}
164
Guido van Rossum603484d2000-03-10 22:52:46 +0000165/* Returns 1 for Unicode characters having the category 'Ll', 0
166 otherwise. */
167
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000168int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000169{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000170 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
171
172 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000173}
174
175/* Returns 1 for Unicode characters having the category 'Lu', 0
176 otherwise. */
177
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000178int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000179{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000180 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
181
182 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000183}
184
185/* Returns the uppercase Unicode characters corresponding to ch or just
186 ch if no uppercase mapping is known. */
187
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000188Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000189{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000190 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500191
192 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500193 return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500194 return ch + ctype->upper;
Guido van Rossum603484d2000-03-10 22:52:46 +0000195}
196
197/* Returns the lowercase Unicode characters corresponding to ch or just
198 ch if no lowercase mapping is known. */
199
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000200Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000201{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000202 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500203
204 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500205 return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500206 return ch + ctype->lower;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500207}
208
209int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
210{
211 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
212
213 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500214 int index = ctype->lower & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500215 int n = ctype->lower >> 24;
216 int i;
217 for (i = 0; i < n; i++)
218 res[i] = _PyUnicode_ExtendedCase[index + i];
219 return n;
220 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500221 res[0] = ch + ctype->lower;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500222 return 1;
223}
224
225int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
226{
227 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
228
229 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500230 int index = ctype->title & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500231 int n = ctype->title >> 24;
232 int i;
233 for (i = 0; i < n; i++)
234 res[i] = _PyUnicode_ExtendedCase[index + i];
235 return n;
236 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500237 res[0] = ch + ctype->title;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500238 return 1;
239}
240
241int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
242{
243 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
244
245 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500246 int index = ctype->upper & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500247 int n = ctype->upper >> 24;
248 int i;
249 for (i = 0; i < n; i++)
250 res[i] = _PyUnicode_ExtendedCase[index + i];
251 return n;
252 }
Benjamin Petersonad9c5692012-01-15 21:19:20 -0500253 res[0] = ch + ctype->upper;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500254 return 1;
255}
256
Benjamin Petersond5890c82012-01-14 13:23:30 -0500257int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
258{
259 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
260
261 if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
262 int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
263 int n = (ctype->lower >> 20) & 7;
264 int i;
265 for (i = 0; i < n; i++)
266 res[i] = _PyUnicode_ExtendedCase[index + i];
267 return n;
268 }
269 return _PyUnicode_ToLowerFull(ch, res);
270}
271
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500272int _PyUnicode_IsCased(Py_UCS4 ch)
273{
274 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
275
276 return (ctype->flags & CASED_MASK) != 0;
277}
278
279int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
280{
281 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
282
283 return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000284}
285
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000286/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
287 'Lo' or 'Lm', 0 otherwise. */
288
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000289int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000290{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000291 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000292
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000293 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000294}
295