/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
| 10 | |
| 11 | #include "Python.h" |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 12 | |
/* Bit flags stored in the `flags` field of a _PyUnicode_TypeRecord.
   Each flag occupies a distinct bit so they can be OR-ed together. */

/* Character-class bits. */
#define ALPHA_MASK          0x0001
#define DECIMAL_MASK        0x0002  /* record's `decimal` field is valid */
#define DIGIT_MASK          0x0004  /* record's `digit` field is valid */
#define LOWER_MASK          0x0008
#define LINEBREAK_MASK      0x0010
#define SPACE_MASK          0x0020
#define TITLE_MASK          0x0040
#define UPPER_MASK          0x0080

/* Identifier-syntax bits. */
#define XID_START_MASK      0x0100
#define XID_CONTINUE_MASK   0x0200

#define PRINTABLE_MASK      0x0400

/* When set, the upper/lower/title fields hold the mapped code point
   itself rather than a delta to add to the original code point. */
#define NODELTA_MASK        0x0800

#define NUMERIC_MASK        0x1000
Jack Jansen | 56cdce3 | 2000-07-06 13:57:38 +0000 | [diff] [blame] | 26 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 27 | typedef struct { |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 28 | const Py_UCS4 upper; |
| 29 | const Py_UCS4 lower; |
| 30 | const Py_UCS4 title; |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 31 | const unsigned char decimal; |
| 32 | const unsigned char digit; |
Hye-Shik Chang | 974ed7c | 2004-06-02 16:49:17 +0000 | [diff] [blame] | 33 | const unsigned short flags; |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 34 | } _PyUnicode_TypeRecord; |
| 35 | |
| 36 | #include "unicodetype_db.h" |
| 37 | |
| 38 | static const _PyUnicode_TypeRecord * |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 39 | gettyperecord(Py_UCS4 code) |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 40 | { |
| 41 | int index; |
| 42 | |
Martin v. Löwis | 9def6a3 | 2002-10-18 16:11:54 +0000 | [diff] [blame] | 43 | if (code >= 0x110000) |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 44 | index = 0; |
Hye-Shik Chang | 7db07e6 | 2003-12-29 01:36:01 +0000 | [diff] [blame] | 45 | else |
Hye-Shik Chang | 7db07e6 | 2003-12-29 01:36:01 +0000 | [diff] [blame] | 46 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 47 | index = index1[(code>>SHIFT)]; |
| 48 | index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; |
| 49 | } |
Fredrik Lundh | ee13dba | 2001-06-26 20:36:12 +0000 | [diff] [blame] | 50 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 51 | return &_PyUnicode_TypeRecords[index]; |
| 52 | } |
Jack Jansen | 56cdce3 | 2000-07-06 13:57:38 +0000 | [diff] [blame] | 53 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 54 | /* Returns the titlecase Unicode characters corresponding to ch or just |
| 55 | ch if no titlecase mapping is known. */ |
| 56 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 57 | Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 58 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 59 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Martin v. Löwis | 71efeb7 | 2009-04-26 01:02:07 +0000 | [diff] [blame] | 60 | int delta = ctype->title; |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 61 | |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 62 | if (ctype->flags & NODELTA_MASK) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 63 | return delta; |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 64 | |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 65 | if (delta >= 32768) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 66 | delta -= 65536; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 67 | |
| 68 | return ch + delta; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 69 | } |
| 70 | |
| 71 | /* Returns 1 for Unicode characters having the category 'Lt', 0 |
| 72 | otherwise. */ |
| 73 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 74 | int _PyUnicode_IsTitlecase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 75 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 76 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 77 | |
| 78 | return (ctype->flags & TITLE_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 79 | } |
| 80 | |
Martin v. Löwis | 13c3e38 | 2007-08-14 22:37:03 +0000 | [diff] [blame] | 81 | /* Returns 1 for Unicode characters having the XID_Start property, 0 |
| 82 | otherwise. */ |
| 83 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 84 | int _PyUnicode_IsXidStart(Py_UCS4 ch) |
Martin v. Löwis | 13c3e38 | 2007-08-14 22:37:03 +0000 | [diff] [blame] | 85 | { |
| 86 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 87 | |
| 88 | return (ctype->flags & XID_START_MASK) != 0; |
| 89 | } |
| 90 | |
| 91 | /* Returns 1 for Unicode characters having the XID_Continue property, |
| 92 | 0 otherwise. */ |
| 93 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 94 | int _PyUnicode_IsXidContinue(Py_UCS4 ch) |
Martin v. Löwis | 13c3e38 | 2007-08-14 22:37:03 +0000 | [diff] [blame] | 95 | { |
| 96 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 97 | |
| 98 | return (ctype->flags & XID_CONTINUE_MASK) != 0; |
| 99 | } |
| 100 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 101 | /* Returns the integer decimal (0-9) for Unicode characters having |
| 102 | this property, -1 otherwise. */ |
| 103 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 104 | int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 105 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 106 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 107 | |
| 108 | return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 109 | } |
| 110 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 111 | int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 112 | { |
| 113 | if (_PyUnicode_ToDecimalDigit(ch) < 0) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 114 | return 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 115 | return 1; |
| 116 | } |
| 117 | |
| 118 | /* Returns the integer digit (0-9) for Unicode characters having |
| 119 | this property, -1 otherwise. */ |
| 120 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 121 | int _PyUnicode_ToDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 122 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 123 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 124 | |
| 125 | return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 126 | } |
| 127 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 128 | int _PyUnicode_IsDigit(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 129 | { |
| 130 | if (_PyUnicode_ToDigit(ch) < 0) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 131 | return 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 132 | return 1; |
| 133 | } |
| 134 | |
| 135 | /* Returns the numeric value as double for Unicode characters having |
| 136 | this property, -1.0 otherwise. */ |
| 137 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 138 | int _PyUnicode_IsNumeric(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 139 | { |
Amaury Forgeot d'Arc | 7d52079 | 2009-10-06 21:03:20 +0000 | [diff] [blame] | 140 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 141 | |
| 142 | return (ctype->flags & NUMERIC_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 143 | } |
| 144 | |
Georg Brandl | 559e5d7 | 2008-06-11 18:37:52 +0000 | [diff] [blame] | 145 | /* Returns 1 for Unicode characters to be hex-escaped when repr()ed, |
| 146 | 0 otherwise. |
| 147 | All characters except those characters defined in the Unicode character |
| 148 | database as following categories are considered printable. |
| 149 | * Cc (Other, Control) |
| 150 | * Cf (Other, Format) |
| 151 | * Cs (Other, Surrogate) |
| 152 | * Co (Other, Private Use) |
| 153 | * Cn (Other, Not Assigned) |
| 154 | * Zl Separator, Line ('\u2028', LINE SEPARATOR) |
| 155 | * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) |
| 156 | * Zs (Separator, Space) other than ASCII space('\x20'). |
| 157 | */ |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 158 | int _PyUnicode_IsPrintable(Py_UCS4 ch) |
Georg Brandl | 559e5d7 | 2008-06-11 18:37:52 +0000 | [diff] [blame] | 159 | { |
| 160 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 161 | |
Georg Brandl | d52429f | 2008-07-04 15:55:02 +0000 | [diff] [blame] | 162 | return (ctype->flags & PRINTABLE_MASK) != 0; |
Georg Brandl | 559e5d7 | 2008-06-11 18:37:52 +0000 | [diff] [blame] | 163 | } |
| 164 | |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 165 | /* Returns 1 for Unicode characters having the category 'Ll', 0 |
| 166 | otherwise. */ |
| 167 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 168 | int _PyUnicode_IsLowercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 169 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 170 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 171 | |
| 172 | return (ctype->flags & LOWER_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 173 | } |
| 174 | |
| 175 | /* Returns 1 for Unicode characters having the category 'Lu', 0 |
| 176 | otherwise. */ |
| 177 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 178 | int _PyUnicode_IsUppercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 179 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 180 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
| 181 | |
| 182 | return (ctype->flags & UPPER_MASK) != 0; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 183 | } |
| 184 | |
| 185 | /* Returns the uppercase Unicode characters corresponding to ch or just |
| 186 | ch if no uppercase mapping is known. */ |
| 187 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 188 | Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 189 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 190 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 191 | int delta = ctype->upper; |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 192 | if (ctype->flags & NODELTA_MASK) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 193 | return delta; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 194 | if (delta >= 32768) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 195 | delta -= 65536; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 196 | return ch + delta; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 197 | } |
| 198 | |
| 199 | /* Returns the lowercase Unicode characters corresponding to ch or just |
| 200 | ch if no lowercase mapping is known. */ |
| 201 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 202 | Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 203 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 204 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 205 | int delta = ctype->lower; |
Martin v. Löwis | 93cbca3 | 2008-09-10 14:08:48 +0000 | [diff] [blame] | 206 | if (ctype->flags & NODELTA_MASK) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 207 | return delta; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 208 | if (delta >= 32768) |
Alexander Belopolsky | f0f4514 | 2010-08-11 17:31:17 +0000 | [diff] [blame] | 209 | delta -= 65536; |
Martin v. Löwis | edf368c | 2002-10-18 16:40:36 +0000 | [diff] [blame] | 210 | return ch + delta; |
Guido van Rossum | 603484d | 2000-03-10 22:52:46 +0000 | [diff] [blame] | 211 | } |
| 212 | |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 213 | /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |
| 214 | 'Lo' or 'Lm', 0 otherwise. */ |
| 215 | |
Amaury Forgeot d'Arc | 324ac65 | 2010-08-18 20:44:58 +0000 | [diff] [blame] | 216 | int _PyUnicode_IsAlpha(Py_UCS4 ch) |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 217 | { |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 218 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 219 | |
Fredrik Lundh | 9e7dd4c | 2000-09-25 21:48:13 +0000 | [diff] [blame] | 220 | return (ctype->flags & ALPHA_MASK) != 0; |
Marc-André Lemburg | f3938f5 | 2000-07-05 09:48:59 +0000 | [diff] [blame] | 221 | } |
| 222 | |