blob: f6e325082457b07e551a76d9b5dac466a952ef2c [file] [log] [blame]
Guido van Rossum603484d2000-03-10 22:52:46 +00001/*
2 Unicode character type helpers.
3
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00004 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Guido van Rossum603484d2000-03-10 22:52:46 +00006
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00007 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum603484d2000-03-10 22:52:46 +00008
9*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags stored in the `flags` member of a _PyUnicode_TypeRecord. */
#define ALPHA_MASK          0x01    /* tested by _PyUnicode_IsAlpha() */
#define DECIMAL_MASK        0x02    /* record's `decimal` member is meaningful */
#define DIGIT_MASK          0x04    /* record's `digit` member is meaningful */
#define LOWER_MASK          0x08    /* tested by _PyUnicode_IsLowercase() */
#define LINEBREAK_MASK      0x10    /* not tested in this part of the file */
#define SPACE_MASK          0x20    /* not tested in this part of the file */
#define TITLE_MASK          0x40    /* tested by _PyUnicode_IsTitlecase() */
#define UPPER_MASK          0x80    /* tested by _PyUnicode_IsUppercase() */
#define XID_START_MASK      0x100   /* tested by _PyUnicode_IsXidStart() */
#define XID_CONTINUE_MASK   0x200   /* tested by _PyUnicode_IsXidContinue() */
#define PRINTABLE_MASK      0x400   /* tested by _PyUnicode_IsPrintable() */
#define NODELTA_MASK        0x800   /* upper/lower/title hold the mapped code
                                       point itself rather than a delta */
#define NUMERIC_MASK        0x1000  /* tested by _PyUnicode_IsNumeric() */
Jack Jansen56cdce32000-07-06 13:57:38 +000027
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000028typedef struct {
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000029 const Py_UCS4 upper;
30 const Py_UCS4 lower;
31 const Py_UCS4 title;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000032 const unsigned char decimal;
33 const unsigned char digit;
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000034 const unsigned short flags;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000035} _PyUnicode_TypeRecord;
36
37#include "unicodetype_db.h"
38
39static const _PyUnicode_TypeRecord *
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000040gettyperecord(Py_UCS4 code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000041{
42 int index;
43
Martin v. Löwis9def6a32002-10-18 16:11:54 +000044 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000045 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000046 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000047 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000048 index = index1[(code>>SHIFT)];
49 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
50 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000051
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000052 return &_PyUnicode_TypeRecords[index];
53}
Jack Jansen56cdce32000-07-06 13:57:38 +000054
Guido van Rossum603484d2000-03-10 22:52:46 +000055/* Returns the titlecase Unicode characters corresponding to ch or just
56 ch if no titlecase mapping is known. */
57
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000058Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000059{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000060 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwis71efeb72009-04-26 01:02:07 +000061 int delta = ctype->title;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000062
Martin v. Löwis93cbca32008-09-10 14:08:48 +000063 if (ctype->flags & NODELTA_MASK)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +000064 return delta;
Martin v. Löwis93cbca32008-09-10 14:08:48 +000065
Martin v. Löwisedf368c2002-10-18 16:40:36 +000066 if (delta >= 32768)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +000067 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +000068
69 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000070}
71
72/* Returns 1 for Unicode characters having the category 'Lt', 0
73 otherwise. */
74
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000075int _PyUnicode_IsTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000076{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000077 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
78
79 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000080}
81
Martin v. Löwis13c3e382007-08-14 22:37:03 +000082/* Returns 1 for Unicode characters having the XID_Start property, 0
83 otherwise. */
84
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000085int _PyUnicode_IsXidStart(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000086{
87 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
88
89 return (ctype->flags & XID_START_MASK) != 0;
90}
91
92/* Returns 1 for Unicode characters having the XID_Continue property,
93 0 otherwise. */
94
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000095int _PyUnicode_IsXidContinue(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000096{
97 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
98
99 return (ctype->flags & XID_CONTINUE_MASK) != 0;
100}
101
Guido van Rossum603484d2000-03-10 22:52:46 +0000102/* Returns the integer decimal (0-9) for Unicode characters having
103 this property, -1 otherwise. */
104
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000105int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000106{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000107 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
108
109 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000110}
111
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000112int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000113{
114 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000115 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000116 return 1;
117}
118
119/* Returns the integer digit (0-9) for Unicode characters having
120 this property, -1 otherwise. */
121
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000122int _PyUnicode_ToDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000123{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000124 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
125
126 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000127}
128
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000129int _PyUnicode_IsDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000130{
131 if (_PyUnicode_ToDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000132 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000133 return 1;
134}
135
136/* Returns the numeric value as double for Unicode characters having
137 this property, -1.0 otherwise. */
138
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000139int _PyUnicode_IsNumeric(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000140{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000141 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
142
143 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000144}
145
Georg Brandl559e5d72008-06-11 18:37:52 +0000146/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
147 0 otherwise.
148 All characters except those characters defined in the Unicode character
149 database as following categories are considered printable.
150 * Cc (Other, Control)
151 * Cf (Other, Format)
152 * Cs (Other, Surrogate)
153 * Co (Other, Private Use)
154 * Cn (Other, Not Assigned)
155 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
156 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
157 * Zs (Separator, Space) other than ASCII space('\x20').
158*/
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000159int _PyUnicode_IsPrintable(Py_UCS4 ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000160{
161 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
162
Georg Brandld52429f2008-07-04 15:55:02 +0000163 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000164}
165
Guido van Rossum603484d2000-03-10 22:52:46 +0000166#ifndef WANT_WCTYPE_FUNCTIONS
167
Guido van Rossum603484d2000-03-10 22:52:46 +0000168/* Returns 1 for Unicode characters having the category 'Ll', 0
169 otherwise. */
170
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000171int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000172{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000173 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
174
175 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000176}
177
178/* Returns 1 for Unicode characters having the category 'Lu', 0
179 otherwise. */
180
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000181int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000182{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000183 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
184
185 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000186}
187
188/* Returns the uppercase Unicode characters corresponding to ch or just
189 ch if no uppercase mapping is known. */
190
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000191Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000192{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000193 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000194 int delta = ctype->upper;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000195 if (ctype->flags & NODELTA_MASK)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000196 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000197 if (delta >= 32768)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000198 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000199 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000200}
201
202/* Returns the lowercase Unicode characters corresponding to ch or just
203 ch if no lowercase mapping is known. */
204
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000205Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000206{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000207 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000208 int delta = ctype->lower;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000209 if (ctype->flags & NODELTA_MASK)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000210 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000211 if (delta >= 32768)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000212 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000213 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000214}
215
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000216/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
217 'Lo' or 'Lm', 0 otherwise. */
218
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000219int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000220{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000221 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000222
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000223 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000224}
225
Guido van Rossum603484d2000-03-10 22:52:46 +0000226#else
227
228/* Export the interfaces using the wchar_t type for portability
229 reasons: */
230
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000231int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000232{
233 return iswlower(ch);
234}
235
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000236int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000237{
238 return iswupper(ch);
239}
240
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000241Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000242{
243 return towlower(ch);
244}
245
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000246Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000247{
248 return towupper(ch);
249}
250
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000251int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000252{
253 return iswalpha(ch);
254}
255
Guido van Rossum603484d2000-03-10 22:52:46 +0000256#endif