blob: 1849831e84d50134d5acacc34c43defa870e3290 [file] [log] [blame]
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012#include "unicodeobject.h"
13
/* Bit flags stored in the `flags` member of a _PyUnicode_TypeRecord.
   Each flag is tested by the helper named in its comment. */
#define ALPHA_MASK        0x0001  /* _PyUnicode_IsAlpha */
#define DECIMAL_MASK      0x0002  /* `decimal` member is valid */
#define DIGIT_MASK        0x0004  /* `digit` member is valid */
#define LOWER_MASK        0x0008  /* _PyUnicode_IsLowercase */
#define LINEBREAK_MASK    0x0010  /* not tested in this part of the file */
#define SPACE_MASK        0x0020  /* not tested in this part of the file */
#define TITLE_MASK        0x0040  /* _PyUnicode_IsTitlecase */
#define UPPER_MASK        0x0080  /* _PyUnicode_IsUppercase */
#define XID_START_MASK    0x0100  /* _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK 0x0200  /* _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK    0x0400  /* _PyUnicode_IsPrintable */
#define NODELTA_MASK      0x0800  /* case members hold the mapped character
                                     itself rather than a delta */
#define NUMERIC_MASK      0x1000  /* _PyUnicode_IsNumeric */
Jack Jansen56cdce32000-07-06 13:57:38 +000027
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000028typedef struct {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000029 const Py_UNICODE upper;
30 const Py_UNICODE lower;
31 const Py_UNICODE title;
32 const unsigned char decimal;
33 const unsigned char digit;
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000034 const unsigned short flags;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000035} _PyUnicode_TypeRecord;
36
37#include "unicodetype_db.h"
38
39static const _PyUnicode_TypeRecord *
Fredrik Lundhee13dba2001-06-26 20:36:12 +000040gettyperecord(Py_UNICODE code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000041{
42 int index;
43
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000044#ifdef Py_UNICODE_WIDE
Martin v. Löwis9def6a32002-10-18 16:11:54 +000045 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000046 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000047 else
48#endif
49 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000050 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000053
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000054 return &_PyUnicode_TypeRecords[index];
55}
Jack Jansen56cdce32000-07-06 13:57:38 +000056
Guido van Rossum603484d2000-03-10 22:52:46 +000057/* Returns the titlecase Unicode characters corresponding to ch or just
58 ch if no titlecase mapping is known. */
59
Martin v. Löwisce9b5a52001-06-27 06:28:56 +000060Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000061{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000062 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwis71efeb72009-04-26 01:02:07 +000063 int delta = ctype->title;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000064
Martin v. Löwis93cbca32008-09-10 14:08:48 +000065 if (ctype->flags & NODELTA_MASK)
66 return delta;
67
Martin v. Löwisedf368c2002-10-18 16:40:36 +000068 if (delta >= 32768)
69 delta -= 65536;
70
71 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000072}
73
74/* Returns 1 for Unicode characters having the category 'Lt', 0
75 otherwise. */
76
Fredrik Lundh72b06852001-06-27 22:08:26 +000077int _PyUnicode_IsTitlecase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000078{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000079 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
80
81 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000082}
83
Martin v. Löwis13c3e382007-08-14 22:37:03 +000084/* Returns 1 for Unicode characters having the XID_Start property, 0
85 otherwise. */
86
87int _PyUnicode_IsXidStart(Py_UNICODE ch)
88{
89 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
90
91 return (ctype->flags & XID_START_MASK) != 0;
92}
93
94/* Returns 1 for Unicode characters having the XID_Continue property,
95 0 otherwise. */
96
97int _PyUnicode_IsXidContinue(Py_UNICODE ch)
98{
99 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
100
101 return (ctype->flags & XID_CONTINUE_MASK) != 0;
102}
103
Guido van Rossum603484d2000-03-10 22:52:46 +0000104/* Returns the integer decimal (0-9) for Unicode characters having
105 this property, -1 otherwise. */
106
Fredrik Lundh72b06852001-06-27 22:08:26 +0000107int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000108{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000109 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
110
111 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000112}
113
Fredrik Lundh72b06852001-06-27 22:08:26 +0000114int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000115{
116 if (_PyUnicode_ToDecimalDigit(ch) < 0)
117 return 0;
118 return 1;
119}
120
121/* Returns the integer digit (0-9) for Unicode characters having
122 this property, -1 otherwise. */
123
Fredrik Lundh72b06852001-06-27 22:08:26 +0000124int _PyUnicode_ToDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000125{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000126 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
127
128 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000129}
130
Fredrik Lundh72b06852001-06-27 22:08:26 +0000131int _PyUnicode_IsDigit(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000132{
133 if (_PyUnicode_ToDigit(ch) < 0)
134 return 0;
135 return 1;
136}
137
138/* Returns the numeric value as double for Unicode characters having
139 this property, -1.0 otherwise. */
140
Fredrik Lundh72b06852001-06-27 22:08:26 +0000141int _PyUnicode_IsNumeric(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000142{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000143 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
144
145 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000146}
147
Georg Brandl559e5d72008-06-11 18:37:52 +0000148/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
149 0 otherwise.
150 All characters except those characters defined in the Unicode character
151 database as following categories are considered printable.
152 * Cc (Other, Control)
153 * Cf (Other, Format)
154 * Cs (Other, Surrogate)
155 * Co (Other, Private Use)
156 * Cn (Other, Not Assigned)
157 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
158 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
159 * Zs (Separator, Space) other than ASCII space('\x20').
160*/
161int _PyUnicode_IsPrintable(Py_UNICODE ch)
162{
163 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
164
Georg Brandld52429f2008-07-04 15:55:02 +0000165 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000166}
167
Guido van Rossum603484d2000-03-10 22:52:46 +0000168#ifndef WANT_WCTYPE_FUNCTIONS
169
Guido van Rossum603484d2000-03-10 22:52:46 +0000170/* Returns 1 for Unicode characters having the category 'Ll', 0
171 otherwise. */
172
Fredrik Lundh72b06852001-06-27 22:08:26 +0000173int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000174{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000175 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
176
177 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000178}
179
180/* Returns 1 for Unicode characters having the category 'Lu', 0
181 otherwise. */
182
Fredrik Lundh72b06852001-06-27 22:08:26 +0000183int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000184{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000185 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
186
187 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000188}
189
190/* Returns the uppercase Unicode characters corresponding to ch or just
191 ch if no uppercase mapping is known. */
192
Fredrik Lundh72b06852001-06-27 22:08:26 +0000193Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000194{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000195 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000196 int delta = ctype->upper;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000197 if (ctype->flags & NODELTA_MASK)
198 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000199 if (delta >= 32768)
200 delta -= 65536;
201 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000202}
203
204/* Returns the lowercase Unicode characters corresponding to ch or just
205 ch if no lowercase mapping is known. */
206
Fredrik Lundh72b06852001-06-27 22:08:26 +0000207Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000208{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000209 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000210 int delta = ctype->lower;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000211 if (ctype->flags & NODELTA_MASK)
212 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000213 if (delta >= 32768)
214 delta -= 65536;
215 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000216}
217
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000218/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
219 'Lo' or 'Lm', 0 otherwise. */
220
Fredrik Lundh72b06852001-06-27 22:08:26 +0000221int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000222{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000223 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000224
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000225 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000226}
227
Guido van Rossum603484d2000-03-10 22:52:46 +0000228#else
229
/* Export the interfaces using the wchar_t type for portability
   reasons: */
232
Fredrik Lundh72b06852001-06-27 22:08:26 +0000233int _PyUnicode_IsLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000234{
235 return iswlower(ch);
236}
237
Fredrik Lundh72b06852001-06-27 22:08:26 +0000238int _PyUnicode_IsUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000239{
240 return iswupper(ch);
241}
242
Fredrik Lundh72b06852001-06-27 22:08:26 +0000243Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000244{
245 return towlower(ch);
246}
247
Fredrik Lundh72b06852001-06-27 22:08:26 +0000248Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000249{
250 return towupper(ch);
251}
252
Fredrik Lundh72b06852001-06-27 22:08:26 +0000253int _PyUnicode_IsAlpha(Py_UNICODE ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000254{
255 return iswalpha(ch);
256}
257
Guido van Rossum603484d2000-03-10 22:52:46 +0000258#endif