blob: 0ebdedbe9f7dd733968a25b174fb714f9472eef5 [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012
/* Bit masks for the `flags` field of _PyUnicode_TypeRecord.  Every mask
   occupies a distinct bit, so a record may carry any combination. */
#define ALPHA_MASK          0x0001
#define DECIMAL_MASK        0x0002
#define DIGIT_MASK          0x0004
#define LOWER_MASK          0x0008
#define LINEBREAK_MASK      0x0010
#define SPACE_MASK          0x0020
#define TITLE_MASK          0x0040
#define UPPER_MASK          0x0080
#define XID_START_MASK      0x0100
#define XID_CONTINUE_MASK   0x0200
#define PRINTABLE_MASK      0x0400
#define NUMERIC_MASK        0x0800
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK          0x2000
/* When set, the upper/lower/title fields hold packed references into
   _PyUnicode_ExtendedCase instead of plain code points. */
#define EXTENDED_CASE_MASK  0x4000
Jack Jansen56cdce32000-07-06 13:57:38 +000028
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000029typedef struct {
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000030 const Py_UCS4 upper;
31 const Py_UCS4 lower;
32 const Py_UCS4 title;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000033 const unsigned char decimal;
34 const unsigned char digit;
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000035 const unsigned short flags;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000036} _PyUnicode_TypeRecord;
37
38#include "unicodetype_db.h"
39
40static const _PyUnicode_TypeRecord *
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000041gettyperecord(Py_UCS4 code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000042{
43 int index;
44
Martin v. Löwis9def6a32002-10-18 16:11:54 +000045 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000047 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000048 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000049 index = index1[(code>>SHIFT)];
50 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
51 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000052
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000053 return &_PyUnicode_TypeRecords[index];
54}
Jack Jansen56cdce32000-07-06 13:57:38 +000055
Guido van Rossum603484d2000-03-10 22:52:46 +000056/* Returns the titlecase Unicode characters corresponding to ch or just
57 ch if no titlecase mapping is known. */
58
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000059Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000060{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000061 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000062
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050063 return ctype->title ? ctype->title : ch;
Guido van Rossum603484d2000-03-10 22:52:46 +000064}
65
66/* Returns 1 for Unicode characters having the category 'Lt', 0
67 otherwise. */
68
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000069int _PyUnicode_IsTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000070{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000071 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
72
73 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000074}
75
Martin v. Löwis13c3e382007-08-14 22:37:03 +000076/* Returns 1 for Unicode characters having the XID_Start property, 0
77 otherwise. */
78
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000079int _PyUnicode_IsXidStart(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000080{
81 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
82
83 return (ctype->flags & XID_START_MASK) != 0;
84}
85
86/* Returns 1 for Unicode characters having the XID_Continue property,
87 0 otherwise. */
88
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000089int _PyUnicode_IsXidContinue(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000090{
91 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
92
93 return (ctype->flags & XID_CONTINUE_MASK) != 0;
94}
95
Guido van Rossum603484d2000-03-10 22:52:46 +000096/* Returns the integer decimal (0-9) for Unicode characters having
97 this property, -1 otherwise. */
98
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000099int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000100{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000101 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
102
103 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000104}
105
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000106int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000107{
108 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000109 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000110 return 1;
111}
112
113/* Returns the integer digit (0-9) for Unicode characters having
114 this property, -1 otherwise. */
115
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000116int _PyUnicode_ToDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000117{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000118 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
119
120 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000121}
122
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000123int _PyUnicode_IsDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000124{
125 if (_PyUnicode_ToDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000126 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000127 return 1;
128}
129
/* Returns 1 for Unicode characters flagged as numeric (NUMERIC_MASK),
   0 otherwise. */
132
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000133int _PyUnicode_IsNumeric(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000134{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000135 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
136
137 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000138}
139
/* Returns 1 for printable Unicode characters, 0 for characters that are
   to be hex-escaped when repr()ed.
   All characters except those characters defined in the Unicode character
   database as following categories are considered printable.
      * Cc (Other, Control)
      * Cf (Other, Format)
      * Cs (Other, Surrogate)
      * Co (Other, Private Use)
      * Cn (Other, Not Assigned)
      * Zl Separator, Line ('\u2028', LINE SEPARATOR)
      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
      * Zs (Separator, Space) other than ASCII space('\x20').
*/
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000153int _PyUnicode_IsPrintable(Py_UCS4 ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000154{
155 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
156
Georg Brandld52429f2008-07-04 15:55:02 +0000157 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000158}
159
Guido van Rossum603484d2000-03-10 22:52:46 +0000160/* Returns 1 for Unicode characters having the category 'Ll', 0
161 otherwise. */
162
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000163int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000164{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000165 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
166
167 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000168}
169
170/* Returns 1 for Unicode characters having the category 'Lu', 0
171 otherwise. */
172
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000173int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000174{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000175 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
176
177 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000178}
179
180/* Returns the uppercase Unicode characters corresponding to ch or just
181 ch if no uppercase mapping is known. */
182
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000183Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000184{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000185 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500186
187 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500188 return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500189 return ctype->upper ? ctype->upper : ch;
Guido van Rossum603484d2000-03-10 22:52:46 +0000190}
191
192/* Returns the lowercase Unicode characters corresponding to ch or just
193 ch if no lowercase mapping is known. */
194
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000195Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000196{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000197 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500198
199 if (ctype->flags & EXTENDED_CASE_MASK)
Benjamin Petersond5890c82012-01-14 13:23:30 -0500200 return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500201 return ctype->lower ? ctype->lower : ch;
202}
203
204int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
205{
206 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
207
208 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500209 int index = ctype->lower & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500210 int n = ctype->lower >> 24;
211 int i;
212 for (i = 0; i < n; i++)
213 res[i] = _PyUnicode_ExtendedCase[index + i];
214 return n;
215 }
216 res[0] = ctype->lower ? ctype->lower : ch;
217 return 1;
218}
219
220int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
221{
222 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
223
224 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500225 int index = ctype->title & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500226 int n = ctype->title >> 24;
227 int i;
228 for (i = 0; i < n; i++)
229 res[i] = _PyUnicode_ExtendedCase[index + i];
230 return n;
231 }
232 res[0] = ctype->title ? ctype->title : ch;
233 return 1;
234}
235
236int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
237{
238 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
239
240 if (ctype->flags & EXTENDED_CASE_MASK) {
Benjamin Petersond5890c82012-01-14 13:23:30 -0500241 int index = ctype->upper & 0xFFFF;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500242 int n = ctype->upper >> 24;
243 int i;
244 for (i = 0; i < n; i++)
245 res[i] = _PyUnicode_ExtendedCase[index + i];
246 return n;
247 }
248 res[0] = ctype->upper ? ctype->upper : ch;
249 return 1;
250}
251
Benjamin Petersond5890c82012-01-14 13:23:30 -0500252int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
253{
254 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
255
256 if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
257 int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
258 int n = (ctype->lower >> 20) & 7;
259 int i;
260 for (i = 0; i < n; i++)
261 res[i] = _PyUnicode_ExtendedCase[index + i];
262 return n;
263 }
264 return _PyUnicode_ToLowerFull(ch, res);
265}
266
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -0500267int _PyUnicode_IsCased(Py_UCS4 ch)
268{
269 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
270
271 return (ctype->flags & CASED_MASK) != 0;
272}
273
274int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
275{
276 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
277
278 return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000279}
280
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000281/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
282 'Lo' or 'Lm', 0 otherwise. */
283
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000284int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000285{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000286 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000287
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000288 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000289}
290