Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 1 | /* |
| 2 | Unicode character type helpers. |
| 3 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 4 | Written by Marc-Andre Lemburg (mal@lemburg.com). |
| 5 | Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 6 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 7 | Copyright (c) Corporation for National Research Initiatives. |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 8 | |
| 9 | */ |
| 10 | |
| 11 | #include "Python.h" |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 12 | #include "unicodeobject.h" |
| 13 | |
/* Property bits tested against the `flags` member of a
   _PyUnicode_TypeRecord.  Each mask occupies exactly one bit; the
   equivalent hexadecimal value is given alongside. */
#define ALPHA_MASK          (1 << 0)    /* 0x01 */
#define DECIMAL_MASK        (1 << 1)    /* 0x02 */
#define DIGIT_MASK          (1 << 2)    /* 0x04 */
#define LOWER_MASK          (1 << 3)    /* 0x08 */
#define LINEBREAK_MASK      (1 << 4)    /* 0x10 */
#define SPACE_MASK          (1 << 5)    /* 0x20 */
#define TITLE_MASK          (1 << 6)    /* 0x40 */
#define UPPER_MASK          (1 << 7)    /* 0x80 */
#define XID_START_MASK      (1 << 8)    /* 0x100 */
#define XID_CONTINUE_MASK   (1 << 9)    /* 0x200 */
#define PRINTABLE_MASK      (1 << 10)   /* 0x400 */
#define NODELTA_MASK        (1 << 11)   /* 0x800 */
#define NUMERIC_MASK        (1 << 12)   /* 0x1000 */
Jack Jansen | 56cdce3 | 2000-07-06 13:57:38 +0000 | [diff] [blame] | 27 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 28 | typedef struct { |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 29 | const Py_UCS4 upper; |
| 30 | const Py_UCS4 lower; |
| 31 | const Py_UCS4 title; |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 32 | const unsigned char decimal; |
| 33 | const unsigned char digit; |
Hye-Shik Chang | 974ed7c | 2004-06-02 16:49:17 +0000 | [diff] [blame] | 34 | const unsigned short flags; |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 35 | } _PyUnicode_TypeRecord; |
| 36 | |
| 37 | #include "unicodetype_db.h" |
| 38 | |
| 39 | static const _PyUnicode_TypeRecord * |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 40 | gettyperecord(Py_UCS4 code) |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 41 | { |
| 42 | int index; |
| 43 | |
Martin v. Löwis | 9def6a3 | 2002-10-18 16:11:54 +0000 | [diff] [blame] | 44 | if (code >= 0x110000) |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 45 | index = 0; |
Hye-Shik Chang | 7db07e6 | 2003-12-29 01:36:01 +0000 | [diff] [blame] | 46 | else |
Hye-Shik Chang | 7db07e6 | 2003-12-29 01:36:01 +0000 | [diff] [blame] | 47 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 48 | index = index1[(code>>SHIFT)]; |
| 49 | index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; |
| 50 | } |
Fredrik Lundh | ee13dba | 2001-06-26 20:36:12 +0000 | [diff] [blame] | 51 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 52 | return &_PyUnicode_TypeRecords[index]; |
| 53 | } |
Jack Jansen | 56cdce3 | 2000-07-06 13:57:38 +0000 | [diff] [blame] | 54 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 55 | /* Returns the titlecase Unicode characters corresponding to ch or just |
| 56 | ch if no titlecase mapping is known. */ |
| 57 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 58 | Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 59 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 60 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Martin v. Löwis | 71efeb7 | 2009-04-26 01:02:07 +0000 | [diff] [blame] | 61 | int delta = ctype->title; |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 62 | |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 63 | if (ctype->flags & NODELTA_MASK) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 64 | return delta; |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 65 | |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 66 | if (delta >= 32768) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 67 | delta -= 65536; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 68 | |
| 69 | return ch + delta; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 70 | } |
| 71 | |
| 72 | /* Returns 1 for Unicode characters having the category 'Lt', 0 |
| 73 | otherwise. */ |
| 74 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 75 | int _PyUnicode_IsTitlecase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 76 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 77 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 78 | |
| 79 | return (ctype->flags & TITLE_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 80 | } |
| 81 | |
Martin v. Löwis | 13c3e38 | 2007-08-14 22:37:03 +0000 | [diff] [blame] | 82 | /* Returns 1 for Unicode characters having the XID_Start property, 0 |
| 83 | otherwise. */ |
| 84 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 85 | int _PyUnicode_IsXidStart(Py_UCS4 ch) |
Martin v. Löwis | 13c3e38 | 2007-08-14 22:37:03 +0000 | [diff] [blame] | 86 | { |
| 87 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 88 | |
| 89 | return (ctype->flags & XID_START_MASK) != 0; |
| 90 | } |
| 91 | |
| 92 | /* Returns 1 for Unicode characters having the XID_Continue property, |
| 93 | 0 otherwise. */ |
| 94 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 95 | int _PyUnicode_IsXidContinue(Py_UCS4 ch) |
Martin v. Löwis | 13c3e38 | 2007-08-14 22:37:03 +0000 | [diff] [blame] | 96 | { |
| 97 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 98 | |
| 99 | return (ctype->flags & XID_CONTINUE_MASK) != 0; |
| 100 | } |
| 101 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 102 | /* Returns the integer decimal (0-9) for Unicode characters having |
| 103 | this property, -1 otherwise. */ |
| 104 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 105 | int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 106 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 107 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 108 | |
| 109 | return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 110 | } |
| 111 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 112 | int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 113 | { |
| 114 | if (_PyUnicode_ToDecimalDigit(ch) < 0) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 115 | return 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 116 | return 1; |
| 117 | } |
| 118 | |
| 119 | /* Returns the integer digit (0-9) for Unicode characters having |
| 120 | this property, -1 otherwise. */ |
| 121 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 122 | int _PyUnicode_ToDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 123 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 124 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 125 | |
| 126 | return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 127 | } |
| 128 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 129 | int _PyUnicode_IsDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 130 | { |
| 131 | if (_PyUnicode_ToDigit(ch) < 0) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 132 | return 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 133 | return 1; |
| 134 | } |
| 135 | |
/* Returns 1 for Unicode characters having the numeric property, 0
   otherwise. */
| 138 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 139 | int _PyUnicode_IsNumeric(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 140 | { |
Amaury Forgeot d'Arc | 7d52079 | 2009-10-06 21:03:20 +0000 | [diff] [blame] | 141 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 142 | |
| 143 | return (ctype->flags & NUMERIC_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 144 | } |
| 145 | |
/* Returns 1 for printable Unicode characters, 0 otherwise.  Characters
   for which 0 is returned are the ones to be hex-escaped when repr()ed.
   All characters except those defined in the Unicode character
   database as belonging to the following categories are considered
   printable.
      * Cc (Other, Control)
      * Cf (Other, Format)
      * Cs (Other, Surrogate)
      * Co (Other, Private Use)
      * Cn (Other, Not Assigned)
      * Zl (Separator, Line) ('\u2028', LINE SEPARATOR)
      * Zp (Separator, Paragraph) ('\u2029', PARAGRAPH SEPARATOR)
      * Zs (Separator, Space) other than ASCII space ('\x20').
*/
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 159 | int _PyUnicode_IsPrintable(Py_UCS4 ch) |
Georg Brandl | 559e5d7 | 2008-06-11 18:37:52 +0000 | [diff] [blame] | 160 | { |
| 161 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 162 | |
Georg Brandl | d52429f | 2008-07-04 15:55:02 +0000 | [diff] [blame] | 163 | return (ctype->flags & PRINTABLE_MASK) != 0; |
Georg Brandl | 559e5d7 | 2008-06-11 18:37:52 +0000 | [diff] [blame] | 164 | } |
| 165 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 166 | #ifndef WANT_WCTYPE_FUNCTIONS |
| 167 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 168 | /* Returns 1 for Unicode characters having the category 'Ll', 0 |
| 169 | otherwise. */ |
| 170 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 171 | int _PyUnicode_IsLowercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 172 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 173 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 174 | |
| 175 | return (ctype->flags & LOWER_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 176 | } |
| 177 | |
| 178 | /* Returns 1 for Unicode characters having the category 'Lu', 0 |
| 179 | otherwise. */ |
| 180 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 181 | int _PyUnicode_IsUppercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 182 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 183 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 184 | |
| 185 | return (ctype->flags & UPPER_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 186 | } |
| 187 | |
| 188 | /* Returns the uppercase Unicode characters corresponding to ch or just |
| 189 | ch if no uppercase mapping is known. */ |
| 190 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 191 | Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 192 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 193 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 194 | int delta = ctype->upper; |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 195 | if (ctype->flags & NODELTA_MASK) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 196 | return delta; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 197 | if (delta >= 32768) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 198 | delta -= 65536; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 199 | return ch + delta; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 200 | } |
| 201 | |
| 202 | /* Returns the lowercase Unicode characters corresponding to ch or just |
| 203 | ch if no lowercase mapping is known. */ |
| 204 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 205 | Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 206 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 207 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 208 | int delta = ctype->lower; |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 209 | if (ctype->flags & NODELTA_MASK) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 210 | return delta; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 211 | if (delta >= 32768) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 212 | delta -= 65536; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 213 | return ch + delta; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 214 | } |
| 215 | |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 216 | /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |
| 217 | 'Lo' or 'Lm', 0 otherwise. */ |
| 218 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 219 | int _PyUnicode_IsAlpha(Py_UCS4 ch) |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 220 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 221 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 222 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 223 | return (ctype->flags & ALPHA_MASK) != 0; |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 224 | } |
| 225 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 226 | #else |
| 227 | |
| 228 | /* Export the interfaces using the wchar_t type for portability |
| 229 | reasons: */ |
| 230 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 231 | int _PyUnicode_IsLowercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 232 | { |
| 233 | return iswlower(ch); |
| 234 | } |
| 235 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 236 | int _PyUnicode_IsUppercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 237 | { |
| 238 | return iswupper(ch); |
| 239 | } |
| 240 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 241 | Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 242 | { |
| 243 | return towlower(ch); |
| 244 | } |
| 245 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 246 | Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 247 | { |
| 248 | return towupper(ch); |
| 249 | } |
| 250 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame^] | 251 | int _PyUnicode_IsAlpha(Py_UCS4 ch) |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 252 | { |
| 253 | return iswalpha(ch); |
| 254 | } |
| 255 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 256 | #endif |