blob: 9f6ac89b9f2a0e062b772e311e9efe078e437178 [file] [log] [blame]
Guido van Rossum603484d2000-03-10 22:52:46 +00001/*
2 Unicode character type helpers.
3
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00004 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Guido van Rossum603484d2000-03-10 22:52:46 +00006
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +00007 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum603484d2000-03-10 22:52:46 +00008
9*/
10
11#include "Python.h"
Guido van Rossum603484d2000-03-10 22:52:46 +000012
/* Bit flags stored in the `flags` member of _PyUnicode_TypeRecord.
   Each predicate below tests exactly one of these bits. */
#define ALPHA_MASK 0x01         /* tested by _PyUnicode_IsAlpha() */
#define DECIMAL_MASK 0x02       /* record's `decimal` field is valid */
#define DIGIT_MASK 0x04         /* record's `digit` field is valid */
#define LOWER_MASK 0x08         /* tested by _PyUnicode_IsLowercase() */
#define LINEBREAK_MASK 0x10     /* not tested in this part of the file;
                                   presumably marks line-break characters
                                   -- confirm against the table generator */
#define SPACE_MASK 0x20         /* not tested in this part of the file;
                                   presumably marks whitespace characters
                                   -- confirm against the table generator */
#define TITLE_MASK 0x40         /* tested by _PyUnicode_IsTitlecase() */
#define UPPER_MASK 0x80         /* tested by _PyUnicode_IsUppercase() */
#define XID_START_MASK 0x100    /* tested by _PyUnicode_IsXidStart() */
#define XID_CONTINUE_MASK 0x200 /* tested by _PyUnicode_IsXidContinue() */
#define PRINTABLE_MASK 0x400    /* tested by _PyUnicode_IsPrintable() */
#define NODELTA_MASK 0x800      /* upper/lower/title hold the mapped code
                                   point itself instead of a delta */
#define NUMERIC_MASK 0x1000     /* tested by _PyUnicode_IsNumeric() */
Jack Jansen56cdce32000-07-06 13:57:38 +000026
/* One entry of the character type table (_PyUnicode_TypeRecords), looked
   up per code point by gettyperecord(). */
typedef struct {
    /* Case mappings.  When NODELTA_MASK is set in `flags` each field is
       the mapped code point itself; otherwise it is a delta that the
       To*case functions add to the code point (values >= 32768 are
       reduced by 65536 first, i.e. treated as negative deltas). */
    const Py_UCS4 upper;
    const Py_UCS4 lower;
    const Py_UCS4 title;
    /* Decimal value; only returned when DECIMAL_MASK is set in `flags`. */
    const unsigned char decimal;
    /* Digit value; only returned when DIGIT_MASK is set in `flags`. */
    const unsigned char digit;
    /* Bitwise combination of the *_MASK constants defined above. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
35
36#include "unicodetype_db.h"
37
38static const _PyUnicode_TypeRecord *
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000039gettyperecord(Py_UCS4 code)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000040{
41 int index;
42
Martin v. Löwis9def6a32002-10-18 16:11:54 +000043 if (code >= 0x110000)
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000044 index = 0;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000045 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +000046 {
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000047 index = index1[(code>>SHIFT)];
48 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
49 }
Fredrik Lundhee13dba2001-06-26 20:36:12 +000050
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000051 return &_PyUnicode_TypeRecords[index];
52}
Jack Jansen56cdce32000-07-06 13:57:38 +000053
Guido van Rossum603484d2000-03-10 22:52:46 +000054/* Returns the titlecase Unicode characters corresponding to ch or just
55 ch if no titlecase mapping is known. */
56
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000057Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000058{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000059 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwis71efeb72009-04-26 01:02:07 +000060 int delta = ctype->title;
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000061
Martin v. Löwis93cbca32008-09-10 14:08:48 +000062 if (ctype->flags & NODELTA_MASK)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +000063 return delta;
Martin v. Löwis93cbca32008-09-10 14:08:48 +000064
Martin v. Löwisedf368c2002-10-18 16:40:36 +000065 if (delta >= 32768)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +000066 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +000067
68 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +000069}
70
71/* Returns 1 for Unicode characters having the category 'Lt', 0
72 otherwise. */
73
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000074int _PyUnicode_IsTitlecase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +000075{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +000076 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
77
78 return (ctype->flags & TITLE_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +000079}
80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000081/* Returns 1 for Unicode characters having the XID_Start property, 0
82 otherwise. */
83
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000084int _PyUnicode_IsXidStart(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000085{
86 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88 return (ctype->flags & XID_START_MASK) != 0;
89}
90
91/* Returns 1 for Unicode characters having the XID_Continue property,
92 0 otherwise. */
93
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000094int _PyUnicode_IsXidContinue(Py_UCS4 ch)
Martin v. Löwis13c3e382007-08-14 22:37:03 +000095{
96 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97
98 return (ctype->flags & XID_CONTINUE_MASK) != 0;
99}
100
Guido van Rossum603484d2000-03-10 22:52:46 +0000101/* Returns the integer decimal (0-9) for Unicode characters having
102 this property, -1 otherwise. */
103
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000104int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000105{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000106 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
107
108 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000109}
110
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000111int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000112{
113 if (_PyUnicode_ToDecimalDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000114 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000115 return 1;
116}
117
118/* Returns the integer digit (0-9) for Unicode characters having
119 this property, -1 otherwise. */
120
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000121int _PyUnicode_ToDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000122{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000123 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
124
125 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
Guido van Rossum603484d2000-03-10 22:52:46 +0000126}
127
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000128int _PyUnicode_IsDigit(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000129{
130 if (_PyUnicode_ToDigit(ch) < 0)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000131 return 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000132 return 1;
133}
134
135/* Returns the numeric value as double for Unicode characters having
136 this property, -1.0 otherwise. */
137
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000138int _PyUnicode_IsNumeric(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000139{
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000140 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
141
142 return (ctype->flags & NUMERIC_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000143}
144
Georg Brandl559e5d72008-06-11 18:37:52 +0000145/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
146 0 otherwise.
147 All characters except those characters defined in the Unicode character
148 database as following categories are considered printable.
149 * Cc (Other, Control)
150 * Cf (Other, Format)
151 * Cs (Other, Surrogate)
152 * Co (Other, Private Use)
153 * Cn (Other, Not Assigned)
154 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
155 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
156 * Zs (Separator, Space) other than ASCII space('\x20').
157*/
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000158int _PyUnicode_IsPrintable(Py_UCS4 ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000159{
160 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
161
Georg Brandld52429f2008-07-04 15:55:02 +0000162 return (ctype->flags & PRINTABLE_MASK) != 0;
Georg Brandl559e5d72008-06-11 18:37:52 +0000163}
164
Guido van Rossum603484d2000-03-10 22:52:46 +0000165/* Returns 1 for Unicode characters having the category 'Ll', 0
166 otherwise. */
167
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000168int _PyUnicode_IsLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000169{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000170 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
171
172 return (ctype->flags & LOWER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000173}
174
175/* Returns 1 for Unicode characters having the category 'Lu', 0
176 otherwise. */
177
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000178int _PyUnicode_IsUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000179{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000180 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
181
182 return (ctype->flags & UPPER_MASK) != 0;
Guido van Rossum603484d2000-03-10 22:52:46 +0000183}
184
185/* Returns the uppercase Unicode characters corresponding to ch or just
186 ch if no uppercase mapping is known. */
187
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000188Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000189{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000190 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000191 int delta = ctype->upper;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000192 if (ctype->flags & NODELTA_MASK)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000193 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000194 if (delta >= 32768)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000195 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000196 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000197}
198
199/* Returns the lowercase Unicode characters corresponding to ch or just
200 ch if no lowercase mapping is known. */
201
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000202Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
Guido van Rossum603484d2000-03-10 22:52:46 +0000203{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000204 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000205 int delta = ctype->lower;
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000206 if (ctype->flags & NODELTA_MASK)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000207 return delta;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000208 if (delta >= 32768)
Alexander Belopolskyf0f45142010-08-11 17:31:17 +0000209 delta -= 65536;
Martin v. Löwisedf368c2002-10-18 16:40:36 +0000210 return ch + delta;
Guido van Rossum603484d2000-03-10 22:52:46 +0000211}
212
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000213/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
214 'Lo' or 'Lm', 0 otherwise. */
215
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +0000216int _PyUnicode_IsAlpha(Py_UCS4 ch)
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000217{
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000218 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000219
Fredrik Lundh9e7dd4c2000-09-25 21:48:13 +0000220 return (ctype->flags & ALPHA_MASK) != 0;
Marc-André Lemburgf3938f52000-07-05 09:48:59 +0000221}
222