| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 1 | /* ------------------------------------------------------------------------ | 
 | 2 |  | 
| Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 3 |    unicodedata -- Provides access to the Unicode 4.1 data base. | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 4 |  | 
| Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 5 |    Data was extracted from the Unicode 4.1 UnicodeData.txt file. | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 6 |  | 
| Fredrik Lundh | cfcea49 | 2000-09-25 08:07:06 +0000 | [diff] [blame] | 7 |    Written by Marc-Andre Lemburg (mal@lemburg.com). | 
 | 8 |    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 9 |    Modified by Martin v. Löwis (martin@v.loewis.de) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 10 |  | 
| Fredrik Lundh | cfcea49 | 2000-09-25 08:07:06 +0000 | [diff] [blame] | 11 |    Copyright (c) Corporation for National Research Initiatives. | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 12 |  | 
 | 13 |    ------------------------------------------------------------------------ */ | 
 | 14 |  | 
 | 15 | #include "Python.h" | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 16 | #include "ucnhash.h" | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 17 | #include "structmember.h" | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 18 |  | 
 | 19 | /* character properties */ | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 20 |  | 
/* One row of per-character properties.  Rows are stored in the generated
   _PyUnicode_Database_Records table and located via _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
 | 31 |  | 
/* Describes how a character's properties differed in a previous version of
   the database.  The accessors in this file treat category_changed == 0 as
   "unassigned in the old version" and treat 0xFF in the unsigned char
   fields as "no override; use the current database value".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
 | 39 |  | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 40 | /* data file generated by Tools/unicode/makeunicodedata.py */ | 
 | 41 | #include "unicodedata_db.h" | 
 | 42 |  | 
 | 43 | static const _PyUnicode_DatabaseRecord* | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 44 | _getrecord_ex(Py_UCS4 code) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 45 | { | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 46 |     int index; | 
| Neal Norwitz | e9c571f | 2003-02-28 03:14:37 +0000 | [diff] [blame] | 47 |     if (code >= 0x110000) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 48 |         index = 0; | 
 | 49 |     else { | 
 | 50 |         index = index1[(code>>SHIFT)]; | 
 | 51 |         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; | 
 | 52 |     } | 
 | 53 |  | 
 | 54 |     return &_PyUnicode_Database_Records[index]; | 
 | 55 | } | 
 | 56 |  | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 57 | static const _PyUnicode_DatabaseRecord* | 
 | 58 | _getrecord(PyUnicodeObject* v) | 
 | 59 | { | 
 | 60 |     return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); | 
 | 61 | } | 
 | 62 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 63 | /* ------------- Previous-version API ------------------------------------- */ | 
 | 64 | typedef struct previous_version { | 
 | 65 |     PyObject_HEAD | 
 | 66 |     const char *name; | 
 | 67 |     const change_record* (*getrecord)(Py_UCS4); | 
 | 68 |     Py_UCS4 (*normalization)(Py_UCS4); | 
 | 69 | } PreviousDBVersion; | 
 | 70 |  | 
 | 71 | #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v)) | 
 | 72 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 73 | static PyMemberDef DB_members[] = { | 
 | 74 | 	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, | 
 | 75 |         {NULL} | 
 | 76 | }; | 
 | 77 |  | 
| Thomas Wouters | 89f507f | 2006-12-13 04:49:30 +0000 | [diff] [blame] | 78 | /* forward declaration */ | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 79 | static PyTypeObject UCD_Type; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 80 |  | 
 | 81 | static PyObject* | 
 | 82 | new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), | 
 | 83 |                      Py_UCS4 (*normalization)(Py_UCS4)) | 
 | 84 | { | 
 | 85 | 	PreviousDBVersion *self; | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 86 | 	self = PyObject_New(PreviousDBVersion, &UCD_Type); | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 87 | 	if (self == NULL) | 
 | 88 | 		return NULL; | 
 | 89 | 	self->name = name; | 
 | 90 | 	self->getrecord = getrecord; | 
 | 91 |         self->normalization = normalization; | 
 | 92 | 	return (PyObject*)self; | 
 | 93 | } | 
 | 94 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 95 | /* --- Module API --------------------------------------------------------- */ | 
 | 96 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 97 | PyDoc_STRVAR(unicodedata_decimal__doc__, | 
 | 98 | "decimal(unichr[, default])\n\ | 
 | 99 | \n\ | 
 | 100 | Returns the decimal value assigned to the Unicode character unichr\n\ | 
 | 101 | as integer. If no such value is defined, default is returned, or, if\n\ | 
 | 102 | not given, ValueError is raised."); | 
 | 103 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 104 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 105 | unicodedata_decimal(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 106 | { | 
 | 107 |     PyUnicodeObject *v; | 
 | 108 |     PyObject *defobj = NULL; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 109 |     int have_old = 0; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 110 |     long rc; | 
 | 111 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 112 |     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 113 |         return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 114 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 115 | 	PyErr_SetString(PyExc_TypeError, | 
 | 116 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 117 |         return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 118 |     } | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 119 |  | 
 | 120 |     if (self) { | 
 | 121 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 122 |         if (old->category_changed == 0) { | 
 | 123 |             /* unassigned */ | 
 | 124 |             have_old = 1; | 
 | 125 |             rc = -1; | 
 | 126 |         }  | 
 | 127 |         else if (old->decimal_changed != 0xFF) { | 
 | 128 |             have_old = 1; | 
 | 129 |             rc = old->decimal_changed; | 
 | 130 |         } | 
 | 131 |     } | 
 | 132 |  | 
 | 133 |     if (!have_old) | 
 | 134 |         rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 135 |     if (rc < 0) { | 
 | 136 | 	if (defobj == NULL) { | 
 | 137 | 	    PyErr_SetString(PyExc_ValueError, | 
 | 138 | 			    "not a decimal"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 139 |             return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 140 | 	} | 
 | 141 | 	else { | 
 | 142 | 	    Py_INCREF(defobj); | 
 | 143 | 	    return defobj; | 
 | 144 | 	} | 
 | 145 |     } | 
 | 146 |     return PyInt_FromLong(rc); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 147 | } | 
 | 148 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 149 | PyDoc_STRVAR(unicodedata_digit__doc__, | 
 | 150 | "digit(unichr[, default])\n\ | 
 | 151 | \n\ | 
 | 152 | Returns the digit value assigned to the Unicode character unichr as\n\ | 
 | 153 | integer. If no such value is defined, default is returned, or, if\n\ | 
 | 154 | not given, ValueError is raised."); | 
 | 155 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 156 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 157 | unicodedata_digit(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 158 | { | 
 | 159 |     PyUnicodeObject *v; | 
 | 160 |     PyObject *defobj = NULL; | 
 | 161 |     long rc; | 
 | 162 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 163 |     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 164 |         return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 165 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 166 | 	PyErr_SetString(PyExc_TypeError, | 
 | 167 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 168 |         return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 169 |     } | 
 | 170 |     rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); | 
 | 171 |     if (rc < 0) { | 
 | 172 | 	if (defobj == NULL) { | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 173 | 	    PyErr_SetString(PyExc_ValueError, "not a digit"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 174 |             return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 175 | 	} | 
 | 176 | 	else { | 
 | 177 | 	    Py_INCREF(defobj); | 
 | 178 | 	    return defobj; | 
 | 179 | 	} | 
 | 180 |     } | 
 | 181 |     return PyInt_FromLong(rc); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 182 | } | 
 | 183 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 184 | PyDoc_STRVAR(unicodedata_numeric__doc__, | 
 | 185 | "numeric(unichr[, default])\n\ | 
 | 186 | \n\ | 
 | 187 | Returns the numeric value assigned to the Unicode character unichr\n\ | 
 | 188 | as float. If no such value is defined, default is returned, or, if\n\ | 
 | 189 | not given, ValueError is raised."); | 
 | 190 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 191 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 192 | unicodedata_numeric(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 193 | { | 
 | 194 |     PyUnicodeObject *v; | 
 | 195 |     PyObject *defobj = NULL; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 196 |     int have_old = 0; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 197 |     double rc; | 
 | 198 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 199 |     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 200 |         return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 201 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 202 | 	PyErr_SetString(PyExc_TypeError, | 
 | 203 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 204 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 205 |     } | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 206 |  | 
 | 207 |     if (self) { | 
 | 208 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 209 |         if (old->category_changed == 0) { | 
 | 210 |             /* unassigned */ | 
 | 211 |             have_old = 1; | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 212 |             rc = -1.0; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 213 |         }  | 
 | 214 |         else if (old->decimal_changed != 0xFF) { | 
 | 215 |             have_old = 1; | 
 | 216 |             rc = old->decimal_changed; | 
 | 217 |         } | 
 | 218 |     } | 
 | 219 |  | 
 | 220 |     if (!have_old) | 
 | 221 |         rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); | 
| Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 222 |     if (rc == -1.0) { | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 223 | 	if (defobj == NULL) { | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 224 | 	    PyErr_SetString(PyExc_ValueError, "not a numeric character"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 225 | 	    return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 226 | 	} | 
 | 227 | 	else { | 
 | 228 | 	    Py_INCREF(defobj); | 
 | 229 | 	    return defobj; | 
 | 230 | 	} | 
 | 231 |     } | 
 | 232 |     return PyFloat_FromDouble(rc); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 233 | } | 
 | 234 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 235 | PyDoc_STRVAR(unicodedata_category__doc__, | 
 | 236 | "category(unichr)\n\ | 
 | 237 | \n\ | 
 | 238 | Returns the general category assigned to the Unicode character\n\ | 
 | 239 | unichr as string."); | 
 | 240 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 241 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 242 | unicodedata_category(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 243 | { | 
 | 244 |     PyUnicodeObject *v; | 
 | 245 |     int index; | 
 | 246 |  | 
 | 247 |     if (!PyArg_ParseTuple(args, "O!:category", | 
 | 248 | 			  &PyUnicode_Type, &v)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 249 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 250 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 251 | 	PyErr_SetString(PyExc_TypeError, | 
 | 252 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 253 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 254 |     } | 
| Fredrik Lundh | b95896b | 2001-02-18 22:06:17 +0000 | [diff] [blame] | 255 |     index = (int) _getrecord(v)->category; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 256 |     if (self) { | 
 | 257 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 258 |         if (old->category_changed != 0xFF) | 
 | 259 |             index = old->category_changed; | 
 | 260 |     } | 
| Walter Dörwald | 4254e76 | 2007-06-05 16:04:09 +0000 | [diff] [blame] | 261 |     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 262 | } | 
 | 263 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 264 | PyDoc_STRVAR(unicodedata_bidirectional__doc__, | 
 | 265 | "bidirectional(unichr)\n\ | 
 | 266 | \n\ | 
 | 267 | Returns the bidirectional category assigned to the Unicode character\n\ | 
 | 268 | unichr as string. If no such value is defined, an empty string is\n\ | 
 | 269 | returned."); | 
 | 270 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 271 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 272 | unicodedata_bidirectional(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 273 | { | 
 | 274 |     PyUnicodeObject *v; | 
 | 275 |     int index; | 
 | 276 |  | 
 | 277 |     if (!PyArg_ParseTuple(args, "O!:bidirectional", | 
 | 278 | 			  &PyUnicode_Type, &v)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 279 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 280 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 281 | 	PyErr_SetString(PyExc_TypeError, | 
 | 282 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 283 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 284 |     } | 
| Fredrik Lundh | b95896b | 2001-02-18 22:06:17 +0000 | [diff] [blame] | 285 |     index = (int) _getrecord(v)->bidirectional; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 286 |     if (self) { | 
 | 287 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 288 |         if (old->category_changed == 0) | 
 | 289 |             index = 0; /* unassigned */ | 
 | 290 |         else if (old->bidir_changed != 0xFF) | 
 | 291 |             index = old->bidir_changed; | 
 | 292 |     } | 
| Walter Dörwald | 4254e76 | 2007-06-05 16:04:09 +0000 | [diff] [blame] | 293 |     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 294 | } | 
 | 295 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 296 | PyDoc_STRVAR(unicodedata_combining__doc__, | 
 | 297 | "combining(unichr)\n\ | 
 | 298 | \n\ | 
 | 299 | Returns the canonical combining class assigned to the Unicode\n\ | 
 | 300 | character unichr as integer. Returns 0 if no combining class is\n\ | 
 | 301 | defined."); | 
 | 302 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 303 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 304 | unicodedata_combining(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 305 | { | 
 | 306 |     PyUnicodeObject *v; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 307 |     int index; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 308 |  | 
 | 309 |     if (!PyArg_ParseTuple(args, "O!:combining", | 
 | 310 | 			  &PyUnicode_Type, &v)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 311 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 312 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 313 | 	PyErr_SetString(PyExc_TypeError, | 
 | 314 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 315 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 316 |     } | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 317 |     index = (int) _getrecord(v)->combining; | 
 | 318 |     if (self) { | 
 | 319 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 320 |         if (old->category_changed == 0) | 
 | 321 |             index = 0; /* unassigned */ | 
 | 322 |     } | 
 | 323 |     return PyInt_FromLong(index); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 324 | } | 
 | 325 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 326 | PyDoc_STRVAR(unicodedata_mirrored__doc__, | 
 | 327 | "mirrored(unichr)\n\ | 
 | 328 | \n\ | 
 | 329 | Returns the mirrored property assigned to the Unicode character\n\ | 
 | 330 | unichr as integer. Returns 1 if the character has been identified as\n\ | 
 | 331 | a \"mirrored\" character in bidirectional text, 0 otherwise."); | 
 | 332 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 333 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 334 | unicodedata_mirrored(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 335 | { | 
 | 336 |     PyUnicodeObject *v; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 337 |     int index; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 338 |  | 
 | 339 |     if (!PyArg_ParseTuple(args, "O!:mirrored", | 
 | 340 | 			  &PyUnicode_Type, &v)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 341 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 342 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 343 | 	PyErr_SetString(PyExc_TypeError, | 
 | 344 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 345 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 346 |     } | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 347 |     index = (int) _getrecord(v)->mirrored; | 
 | 348 |     if (self) { | 
 | 349 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 350 |         if (old->category_changed == 0) | 
 | 351 |             index = 0; /* unassigned */ | 
 | 352 |     } | 
 | 353 |     return PyInt_FromLong(index); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 354 | } | 
 | 355 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 356 | PyDoc_STRVAR(unicodedata_east_asian_width__doc__, | 
 | 357 | "east_asian_width(unichr)\n\ | 
 | 358 | \n\ | 
 | 359 | Returns the east asian width assigned to the Unicode character\n\ | 
 | 360 | unichr as string."); | 
 | 361 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 362 | static PyObject * | 
| Hye-Shik Chang | e9ddfbb | 2004-08-04 07:38:35 +0000 | [diff] [blame] | 363 | unicodedata_east_asian_width(PyObject *self, PyObject *args) | 
 | 364 | { | 
 | 365 |     PyUnicodeObject *v; | 
 | 366 |     int index; | 
 | 367 |  | 
 | 368 |     if (!PyArg_ParseTuple(args, "O!:east_asian_width", | 
 | 369 | 			  &PyUnicode_Type, &v)) | 
 | 370 | 	return NULL; | 
 | 371 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 372 | 	PyErr_SetString(PyExc_TypeError, | 
 | 373 | 			"need a single Unicode character as parameter"); | 
 | 374 | 	return NULL; | 
 | 375 |     } | 
 | 376 |     index = (int) _getrecord(v)->east_asian_width; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 377 |     if (self) { | 
 | 378 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 379 |         if (old->category_changed == 0) | 
 | 380 |             index = 0; /* unassigned */ | 
 | 381 |     } | 
| Walter Dörwald | 4254e76 | 2007-06-05 16:04:09 +0000 | [diff] [blame] | 382 |     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); | 
| Hye-Shik Chang | e9ddfbb | 2004-08-04 07:38:35 +0000 | [diff] [blame] | 383 | } | 
 | 384 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 385 | PyDoc_STRVAR(unicodedata_decomposition__doc__, | 
 | 386 | "decomposition(unichr)\n\ | 
 | 387 | \n\ | 
 | 388 | Returns the character decomposition mapping assigned to the Unicode\n\ | 
 | 389 | character unichr as string. An empty string is returned in case no\n\ | 
 | 390 | such mapping is defined."); | 
 | 391 |  | 
| Hye-Shik Chang | e9ddfbb | 2004-08-04 07:38:35 +0000 | [diff] [blame] | 392 | static PyObject * | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 393 | unicodedata_decomposition(PyObject *self, PyObject *args) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 394 | { | 
 | 395 |     PyUnicodeObject *v; | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 396 |     char decomp[256]; | 
 | 397 |     int code, index, count, i; | 
| Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 398 |     unsigned int prefix_index; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 399 |  | 
 | 400 |     if (!PyArg_ParseTuple(args, "O!:decomposition", | 
 | 401 | 			  &PyUnicode_Type, &v)) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 402 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 403 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 404 | 	PyErr_SetString(PyExc_TypeError, | 
 | 405 | 			"need a single Unicode character as parameter"); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 406 | 	return NULL; | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 407 |     } | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 408 |  | 
 | 409 |     code = (int) *PyUnicode_AS_UNICODE(v); | 
 | 410 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 411 |     if (self) { | 
 | 412 |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | 
 | 413 |         if (old->category_changed == 0) | 
| Walter Dörwald | 4254e76 | 2007-06-05 16:04:09 +0000 | [diff] [blame] | 414 |             return PyUnicode_FromString(""); /* unassigned */ | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 415 |     } | 
 | 416 |  | 
| Martin v. Löwis | 9def6a3 | 2002-10-18 16:11:54 +0000 | [diff] [blame] | 417 |     if (code < 0 || code >= 0x110000) | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 418 |         index = 0; | 
 | 419 |     else { | 
 | 420 |         index = decomp_index1[(code>>DECOMP_SHIFT)]; | 
 | 421 |         index = decomp_index2[(index<<DECOMP_SHIFT)+ | 
 | 422 |                              (code&((1<<DECOMP_SHIFT)-1))]; | 
 | 423 |     } | 
 | 424 |  | 
| Tim Peters | 69b83b1 | 2001-11-30 07:23:05 +0000 | [diff] [blame] | 425 |     /* high byte is number of hex bytes (usually one or two), low byte | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 426 |        is prefix code (from*/ | 
 | 427 |     count = decomp_data[index] >> 8; | 
 | 428 |  | 
 | 429 |     /* XXX: could allocate the PyString up front instead | 
 | 430 |        (strlen(prefix) + 5 * count + 1 bytes) */ | 
 | 431 |  | 
| Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 432 |     /* Based on how index is calculated above and decomp_data is generated | 
 | 433 |        from Tools/unicode/makeunicodedata.py, it should not be possible | 
 | 434 |        to overflow decomp_prefix. */ | 
 | 435 |     prefix_index = decomp_data[index] & 255; | 
 | 436 |     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix))); | 
 | 437 |  | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 438 |     /* copy prefix */ | 
| Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 439 |     i = strlen(decomp_prefix[prefix_index]); | 
 | 440 |     memcpy(decomp, decomp_prefix[prefix_index], i); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 441 |  | 
 | 442 |     while (count-- > 0) { | 
 | 443 |         if (i) | 
 | 444 |             decomp[i++] = ' '; | 
| Tim Peters | 69b83b1 | 2001-11-30 07:23:05 +0000 | [diff] [blame] | 445 |         assert((size_t)i < sizeof(decomp)); | 
 | 446 |         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", | 
 | 447 |                       decomp_data[++index]); | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 448 |         i += strlen(decomp + i); | 
 | 449 |     } | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 450 |      | 
| Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 451 |     decomp[i] = '\0'; | 
 | 452 |  | 
| Walter Dörwald | 4254e76 | 2007-06-05 16:04:09 +0000 | [diff] [blame] | 453 |     return PyUnicode_FromString(decomp); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 454 | } | 
 | 455 |  | 
| Thomas Wouters | 49fd7fa | 2006-04-21 10:40:58 +0000 | [diff] [blame] | 456 | static void | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 457 | get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 458 | { | 
| Neal Norwitz | e9c571f | 2003-02-28 03:14:37 +0000 | [diff] [blame] | 459 |     if (code >= 0x110000) { | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 460 |         *index = 0; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 461 |     } else if (self && get_old_record(self, code)->category_changed==0) { | 
 | 462 |         /* unassigned in old version */ | 
 | 463 |         *index = 0; | 
 | 464 |     } | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 465 |     else { | 
 | 466 |         *index = decomp_index1[(code>>DECOMP_SHIFT)]; | 
 | 467 |         *index = decomp_index2[(*index<<DECOMP_SHIFT)+ | 
 | 468 |                                (code&((1<<DECOMP_SHIFT)-1))]; | 
 | 469 |     } | 
 | 470 | 	 | 
 | 471 |     /* high byte is number of hex bytes (usually one or two), low byte | 
 | 472 |        is prefix code (from*/ | 
 | 473 |     *count = decomp_data[*index] >> 8; | 
 | 474 |     *prefix = decomp_data[*index] & 255; | 
 | 475 |  | 
 | 476 |     (*index)++; | 
 | 477 | } | 
 | 478 |  | 
 | 479 | #define SBase   0xAC00 | 
 | 480 | #define LBase   0x1100 | 
 | 481 | #define VBase   0x1161 | 
 | 482 | #define TBase   0x11A7 | 
 | 483 | #define LCount  19 | 
 | 484 | #define VCount  21 | 
 | 485 | #define TCount  28 | 
 | 486 | #define NCount  (VCount*TCount) | 
 | 487 | #define SCount  (LCount*NCount) | 
 | 488 |  | 
 | 489 | static PyObject* | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 490 | nfd_nfkd(PyObject *self, PyObject *input, int k) | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 491 | { | 
 | 492 |     PyObject *result; | 
 | 493 |     Py_UNICODE *i, *end, *o; | 
 | 494 |     /* Longest decomposition in Unicode 3.2: U+FDFA */ | 
 | 495 |     Py_UNICODE stack[20];  | 
| Thomas Wouters | 49fd7fa | 2006-04-21 10:40:58 +0000 | [diff] [blame] | 496 |     Py_ssize_t space, isize; | 
 | 497 |     int index, prefix, count, stackptr; | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 498 |     unsigned char prev, cur; | 
 | 499 | 	 | 
 | 500 |     stackptr = 0; | 
 | 501 |     isize = PyUnicode_GET_SIZE(input); | 
 | 502 |     /* Overallocate atmost 10 characters. */ | 
 | 503 |     space = (isize > 10 ? 10 : isize) + isize; | 
 | 504 |     result = PyUnicode_FromUnicode(NULL, space); | 
 | 505 |     if (!result) | 
 | 506 |         return NULL; | 
 | 507 |     i = PyUnicode_AS_UNICODE(input); | 
 | 508 |     end = i + isize; | 
 | 509 |     o = PyUnicode_AS_UNICODE(result); | 
 | 510 |  | 
 | 511 |     while (i < end) { | 
 | 512 |         stack[stackptr++] = *i++; | 
 | 513 |         while(stackptr) { | 
 | 514 |             Py_UNICODE code = stack[--stackptr]; | 
| Martin v. Löwis | d2171d2 | 2003-11-06 20:47:57 +0000 | [diff] [blame] | 515 |             /* Hangul Decomposition adds three characters in | 
 | 516 |                a single step, so we need atleast that much room. */ | 
 | 517 |             if (space < 3) { | 
| Martin v. Löwis | 5b22213 | 2007-06-10 09:51:05 +0000 | [diff] [blame] | 518 |                 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10; | 
| Martin v. Löwis | d2171d2 | 2003-11-06 20:47:57 +0000 | [diff] [blame] | 519 |                 space += 10; | 
 | 520 |                 if (PyUnicode_Resize(&result, newsize) == -1) | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 521 |                     return NULL; | 
| Martin v. Löwis | d2171d2 | 2003-11-06 20:47:57 +0000 | [diff] [blame] | 522 |                 o = PyUnicode_AS_UNICODE(result) + newsize - space; | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 523 |             } | 
 | 524 |             /* Hangul Decomposition. */ | 
 | 525 |             if (SBase <= code && code < (SBase+SCount)) { | 
 | 526 |                 int SIndex = code - SBase; | 
 | 527 |                 int L = LBase + SIndex / NCount; | 
 | 528 |                 int V = VBase + (SIndex % NCount) / TCount; | 
 | 529 |                 int T = TBase + SIndex % TCount; | 
 | 530 |                 *o++ = L; | 
 | 531 |                 *o++ = V; | 
 | 532 |                 space -= 2; | 
 | 533 |                 if (T != TBase) { | 
 | 534 |                     *o++ = T; | 
 | 535 |                     space --; | 
 | 536 |                 } | 
 | 537 |                 continue; | 
 | 538 |             } | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 539 |             /* normalization changes */ | 
 | 540 |             if (self) { | 
 | 541 |                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); | 
 | 542 |                 if (value != 0) { | 
 | 543 |                     stack[stackptr++] = value; | 
 | 544 |                     continue; | 
 | 545 |                 } | 
 | 546 |             } | 
 | 547 |  | 
 | 548 |             /* Other decompositions. */ | 
 | 549 |             get_decomp_record(self, code, &index, &prefix, &count); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 550 |  | 
 | 551 |             /* Copy character if it is not decomposable, or has a | 
 | 552 |                compatibility decomposition, but we do NFD. */ | 
 | 553 |             if (!count || (prefix && !k)) { | 
 | 554 |                 *o++ = code; | 
 | 555 |                 space--; | 
 | 556 |                 continue; | 
 | 557 |             } | 
 | 558 |             /* Copy decomposition onto the stack, in reverse | 
 | 559 |                order.  */ | 
 | 560 |             while(count) { | 
 | 561 |                 code = decomp_data[index + (--count)]; | 
 | 562 |                 stack[stackptr++] = code; | 
 | 563 |             } | 
 | 564 |         } | 
 | 565 |     } | 
 | 566 |  | 
 | 567 |     /* Drop overallocation. Cannot fail. */ | 
 | 568 |     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space); | 
 | 569 |  | 
 | 570 |     /* Sort canonically. */ | 
 | 571 |     i = PyUnicode_AS_UNICODE(result); | 
 | 572 |     prev = _getrecord_ex(*i)->combining; | 
 | 573 |     end = i + PyUnicode_GET_SIZE(result); | 
 | 574 |     for (i++; i < end; i++) { | 
 | 575 |         cur = _getrecord_ex(*i)->combining; | 
 | 576 |         if (prev == 0 || cur == 0 || prev <= cur) { | 
 | 577 |             prev = cur; | 
 | 578 |             continue; | 
 | 579 |         } | 
 | 580 |         /* Non-canonical order. Need to switch *i with previous. */ | 
 | 581 |         o = i - 1; | 
 | 582 |         while (1) { | 
 | 583 |             Py_UNICODE tmp = o[1]; | 
 | 584 |             o[1] = o[0]; | 
 | 585 |             o[0] = tmp; | 
 | 586 |             o--; | 
 | 587 |             if (o < PyUnicode_AS_UNICODE(result)) | 
 | 588 |                 break; | 
 | 589 |             prev = _getrecord_ex(*o)->combining; | 
 | 590 |             if (prev == 0 || prev <= cur) | 
 | 591 |                 break; | 
 | 592 |         } | 
 | 593 |         prev = _getrecord_ex(*i)->combining; | 
 | 594 |     } | 
 | 595 |     return result; | 
 | 596 | } | 
 | 597 |  | 
 | 598 | static int | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 599 | find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 600 | { | 
 | 601 |     int index; | 
 | 602 |     for (index = 0; nfc[index].start; index++) { | 
 | 603 |         int start = nfc[index].start; | 
 | 604 |         if (code < start) | 
 | 605 |             return -1; | 
 | 606 |         if (code <= start + nfc[index].count) { | 
 | 607 |             int delta = code - start; | 
 | 608 |             return nfc[index].index + delta; | 
 | 609 |         } | 
 | 610 |     } | 
 | 611 |     return -1; | 
 | 612 | } | 
 | 613 |  | 
 | 614 | static PyObject* | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 615 | nfc_nfkc(PyObject *self, PyObject *input, int k) | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 616 | { | 
 | 617 |     PyObject *result; | 
 | 618 |     Py_UNICODE *i, *i1, *o, *end; | 
 | 619 |     int f,l,index,index1,comb; | 
 | 620 |     Py_UNICODE code; | 
 | 621 |     Py_UNICODE *skipped[20]; | 
 | 622 |     int cskipped = 0; | 
 | 623 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 624 |     result = nfd_nfkd(self, input, k); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 625 |     if (!result) | 
 | 626 |         return NULL; | 
 | 627 |  | 
 | 628 |     /* We are going to modify result in-place. | 
 | 629 |        If nfd_nfkd is changed to sometimes return the input, | 
 | 630 |        this code needs to be reviewed. */ | 
 | 631 |     assert(result != input); | 
 | 632 |  | 
 | 633 |     i = PyUnicode_AS_UNICODE(result); | 
 | 634 |     end = i + PyUnicode_GET_SIZE(result); | 
 | 635 |     o = PyUnicode_AS_UNICODE(result); | 
 | 636 | 	 | 
 | 637 |   again: | 
 | 638 |     while (i < end) { | 
 | 639 |       for (index = 0; index < cskipped; index++) { | 
 | 640 |           if (skipped[index] == i) { | 
 | 641 |               /* *i character is skipped.  | 
 | 642 |                  Remove from list. */ | 
 | 643 |               skipped[index] = skipped[cskipped-1]; | 
 | 644 |               cskipped--; | 
 | 645 |               i++; | 
| Martin v. Löwis | 2fb661f | 2002-12-07 14:56:36 +0000 | [diff] [blame] | 646 |               goto again; /* continue while */ | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 647 |           } | 
 | 648 |       } | 
 | 649 |       /* Hangul Composition. We don't need to check for <LV,T> | 
 | 650 |          pairs, since we always have decomposed data. */ | 
 | 651 |       if (LBase <= *i && *i < (LBase+LCount) && | 
 | 652 |           i + 1 < end &&  | 
 | 653 |           VBase <= i[1] && i[1] <= (VBase+VCount)) { | 
 | 654 |           int LIndex, VIndex; | 
 | 655 |           LIndex = i[0] - LBase; | 
 | 656 |           VIndex = i[1] - VBase; | 
 | 657 |           code = SBase + (LIndex*VCount+VIndex)*TCount; | 
 | 658 |           i+=2; | 
 | 659 |           if (i < end && | 
 | 660 |               TBase <= *i && *i <= (TBase+TCount)) { | 
 | 661 |               code += *i-TBase; | 
 | 662 |               i++; | 
 | 663 |           } | 
 | 664 |           *o++ = code; | 
 | 665 |           continue; | 
 | 666 |       } | 
 | 667 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 668 |       f = find_nfc_index(self, nfc_first, *i); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 669 |       if (f == -1) { | 
 | 670 |           *o++ = *i++; | 
 | 671 |           continue; | 
 | 672 |       } | 
 | 673 |       /* Find next unblocked character. */ | 
 | 674 |       i1 = i+1; | 
 | 675 |       comb = 0; | 
 | 676 |       while (i1 < end) { | 
 | 677 |           int comb1 = _getrecord_ex(*i1)->combining; | 
 | 678 |           if (comb1 && comb == comb1) { | 
 | 679 |               /* Character is blocked. */ | 
 | 680 |               i1++; | 
 | 681 |               continue; | 
 | 682 |           } | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 683 |           l = find_nfc_index(self, nfc_last, *i1); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 684 |           /* *i1 cannot be combined with *i. If *i1 | 
 | 685 |              is a starter, we don't need to look further. | 
 | 686 |              Otherwise, record the combining class. */ | 
 | 687 |           if (l == -1) { | 
 | 688 |             not_combinable: | 
 | 689 |               if (comb1 == 0) | 
 | 690 |                   break; | 
 | 691 |               comb = comb1; | 
 | 692 |               i1++; | 
 | 693 |               continue; | 
 | 694 |           } | 
 | 695 |           index = f*TOTAL_LAST + l; | 
 | 696 |           index1 = comp_index[index >> COMP_SHIFT]; | 
 | 697 |           code = comp_data[(index1<<COMP_SHIFT)+ | 
 | 698 |                            (index&((1<<COMP_SHIFT)-1))]; | 
 | 699 |           if (code == 0) | 
 | 700 |               goto not_combinable; | 
 | 701 | 			 | 
 | 702 |           /* Replace the original character. */ | 
 | 703 |           *i = code; | 
 | 704 |           /* Mark the second character unused. */ | 
 | 705 |           skipped[cskipped++] = i1; | 
 | 706 |           i1++; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 707 |           f = find_nfc_index(self, nfc_first, *i); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 708 |           if (f == -1) | 
 | 709 |               break; | 
 | 710 |       } | 
 | 711 |       *o++ = *i++; | 
 | 712 |     } | 
 | 713 |     if (o != end) | 
 | 714 |         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result)); | 
 | 715 |     return result; | 
 | 716 | } | 
 | 717 | 		 | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 718 | PyDoc_STRVAR(unicodedata_normalize__doc__, | 
 | 719 | "normalize(form, unistr)\n\ | 
 | 720 | \n\ | 
 | 721 | Return the normal form 'form' for the Unicode string unistr.  Valid\n\ | 
 | 722 | values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'."); | 
 | 723 |  | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 724 | static PyObject* | 
 | 725 | unicodedata_normalize(PyObject *self, PyObject *args) | 
 | 726 | { | 
 | 727 |     char *form; | 
 | 728 |     PyObject *input; | 
 | 729 |  | 
| Hye-Shik Chang | 69dc1c8 | 2004-07-15 04:30:25 +0000 | [diff] [blame] | 730 |     if(!PyArg_ParseTuple(args, "sO!:normalize", | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 731 |                          &form, &PyUnicode_Type, &input)) | 
 | 732 |         return NULL; | 
 | 733 |  | 
| Martin v. Löwis | 61e40bd | 2004-04-17 19:36:48 +0000 | [diff] [blame] | 734 |     if (PyUnicode_GetSize(input) == 0) { | 
 | 735 |         /* Special case empty input strings, since resizing | 
 | 736 |            them  later would cause internal errors. */ | 
 | 737 |         Py_INCREF(input); | 
 | 738 |         return input; | 
 | 739 |     } | 
 | 740 |  | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 741 |     if (strcmp(form, "NFC") == 0) | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 742 |         return nfc_nfkc(self, input, 0); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 743 |     if (strcmp(form, "NFKC") == 0) | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 744 |         return nfc_nfkc(self, input, 1); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 745 |     if (strcmp(form, "NFD") == 0) | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 746 |         return nfd_nfkd(self, input, 0); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 747 |     if (strcmp(form, "NFKD") == 0) | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 748 |         return nfd_nfkd(self, input, 1); | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 749 |     PyErr_SetString(PyExc_ValueError, "invalid normalization form"); | 
 | 750 |     return NULL; | 
 | 751 | } | 
 | 752 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 753 | /* -------------------------------------------------------------------- */ | 
 | 754 | /* unicode character name tables */ | 
 | 755 |  | 
 | 756 | /* data file generated by Tools/unicode/makeunicodedata.py */ | 
 | 757 | #include "unicodename_db.h" | 
 | 758 |  | 
 | 759 | /* -------------------------------------------------------------------- */ | 
 | 760 | /* database code (cut and pasted from the unidb package) */ | 
 | 761 |  | 
 | 762 | static unsigned long | 
| Fredrik Lundh | b95896b | 2001-02-18 22:06:17 +0000 | [diff] [blame] | 763 | _gethash(const char *s, int len, int scale) | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 764 | { | 
 | 765 |     int i; | 
 | 766 |     unsigned long h = 0; | 
 | 767 |     unsigned long ix; | 
 | 768 |     for (i = 0; i < len; i++) { | 
| Thomas Wouters | 49fd7fa | 2006-04-21 10:40:58 +0000 | [diff] [blame] | 769 |         h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i])); | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 770 |         ix = h & 0xff000000; | 
 | 771 |         if (ix) | 
 | 772 |             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; | 
 | 773 |     } | 
 | 774 |     return h; | 
 | 775 | } | 
 | 776 |  | 
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Row r gives the fragment for value r of each part: column 0 is the
   L part, column 1 the V part and column 2 the T part.  The columns have
   different lengths (LCount, VCount and TCount rows respectively); rows
   past the end of a column hold NULL.  An empty string is a valid
   fragment (L value 11, and T value 0). */
static char *hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
 | 807 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 808 | static int | 
| Martin v. Löwis | 8d93ca1 | 2002-11-23 22:10:29 +0000 | [diff] [blame] | 809 | is_unified_ideograph(Py_UCS4 code) | 
 | 810 | { | 
 | 811 |     return ( | 
 | 812 |         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ | 
| Martin v. Löwis | c350912 | 2006-03-11 12:16:23 +0000 | [diff] [blame] | 813 |         (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */ | 
| Martin v. Löwis | 8d93ca1 | 2002-11-23 22:10:29 +0000 | [diff] [blame] | 814 |         (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */ | 
 | 815 | } | 
 | 816 |  | 
 | 817 | static int | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 818 | _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 819 | { | 
 | 820 |     int offset; | 
 | 821 |     int i; | 
 | 822 |     int word; | 
 | 823 |     unsigned char* w; | 
 | 824 |  | 
| Martin v. Löwis | c350912 | 2006-03-11 12:16:23 +0000 | [diff] [blame] | 825 |     if (code >= 0x110000) | 
 | 826 |         return 0; | 
 | 827 |  | 
 | 828 |     if (self) { | 
 | 829 |         const change_record *old = get_old_record(self, code); | 
 | 830 |         if (old->category_changed == 0) { | 
 | 831 |             /* unassigned */ | 
 | 832 |             return 0; | 
 | 833 |         }  | 
 | 834 |     } | 
 | 835 |  | 
| Martin v. Löwis | 2f4be4e | 2002-11-23 17:11:06 +0000 | [diff] [blame] | 836 |     if (SBase <= code && code < SBase+SCount) { | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 837 | 	/* Hangul syllable. */ | 
 | 838 | 	int SIndex = code - SBase; | 
 | 839 | 	int L = SIndex / NCount; | 
 | 840 | 	int V = (SIndex % NCount) / TCount; | 
 | 841 | 	int T = SIndex % TCount; | 
 | 842 |  | 
 | 843 | 	if (buflen < 27) | 
 | 844 | 	    /* Worst case: HANGUL SYLLABLE <10chars>. */ | 
 | 845 | 	    return 0; | 
 | 846 | 	strcpy(buffer, "HANGUL SYLLABLE "); | 
 | 847 | 	buffer += 16; | 
 | 848 | 	strcpy(buffer, hangul_syllables[L][0]); | 
 | 849 | 	buffer += strlen(hangul_syllables[L][0]); | 
 | 850 | 	strcpy(buffer, hangul_syllables[V][1]); | 
 | 851 | 	buffer += strlen(hangul_syllables[V][1]); | 
 | 852 | 	strcpy(buffer, hangul_syllables[T][2]); | 
 | 853 | 	buffer += strlen(hangul_syllables[T][2]); | 
 | 854 | 	*buffer = '\0'; | 
 | 855 | 	return 1; | 
 | 856 |     } | 
 | 857 |  | 
| Martin v. Löwis | 8d93ca1 | 2002-11-23 22:10:29 +0000 | [diff] [blame] | 858 |     if (is_unified_ideograph(code)) { | 
| Martin v. Löwis | ef7fe2e | 2002-11-23 18:01:32 +0000 | [diff] [blame] | 859 |         if (buflen < 28) | 
 | 860 |             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ | 
 | 861 |             return 0; | 
 | 862 |         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); | 
 | 863 |         return 1; | 
 | 864 |     } | 
 | 865 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 866 |     /* get offset into phrasebook */ | 
 | 867 |     offset = phrasebook_offset1[(code>>phrasebook_shift)]; | 
 | 868 |     offset = phrasebook_offset2[(offset<<phrasebook_shift) + | 
 | 869 |                                (code&((1<<phrasebook_shift)-1))]; | 
 | 870 |     if (!offset) | 
 | 871 |         return 0; | 
 | 872 |  | 
 | 873 |     i = 0; | 
 | 874 |  | 
 | 875 |     for (;;) { | 
 | 876 |         /* get word index */ | 
 | 877 |         word = phrasebook[offset] - phrasebook_short; | 
 | 878 |         if (word >= 0) { | 
 | 879 |             word = (word << 8) + phrasebook[offset+1]; | 
 | 880 |             offset += 2; | 
 | 881 |         } else | 
 | 882 |             word = phrasebook[offset++]; | 
 | 883 |         if (i) { | 
 | 884 |             if (i > buflen) | 
 | 885 |                 return 0; /* buffer overflow */ | 
 | 886 |             buffer[i++] = ' '; | 
 | 887 |         } | 
 | 888 |         /* copy word string from lexicon.  the last character in the | 
 | 889 |            word has bit 7 set.  the last word in a string ends with | 
 | 890 |            0x80 */ | 
 | 891 |         w = lexicon + lexicon_offset[word]; | 
 | 892 |         while (*w < 128) { | 
 | 893 |             if (i >= buflen) | 
 | 894 |                 return 0; /* buffer overflow */ | 
 | 895 |             buffer[i++] = *w++; | 
 | 896 |         } | 
 | 897 |         if (i >= buflen) | 
 | 898 |             return 0; /* buffer overflow */ | 
 | 899 |         buffer[i++] = *w & 127; | 
 | 900 |         if (*w == 128) | 
 | 901 |             break; /* end of word */ | 
 | 902 |     } | 
 | 903 |  | 
 | 904 |     return 1; | 
 | 905 | } | 
 | 906 |  | 
 | 907 | static int | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 908 | _cmpname(PyObject *self, int code, const char* name, int namelen) | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 909 | { | 
 | 910 |     /* check if code corresponds to the given name */ | 
 | 911 |     int i; | 
 | 912 |     char buffer[NAME_MAXLEN]; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 913 |     if (!_getucname(self, code, buffer, sizeof(buffer))) | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 914 |         return 0; | 
 | 915 |     for (i = 0; i < namelen; i++) { | 
| Thomas Wouters | 49fd7fa | 2006-04-21 10:40:58 +0000 | [diff] [blame] | 916 |         if (toupper(Py_CHARMASK(name[i])) != buffer[i]) | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 917 |             return 0; | 
 | 918 |     } | 
 | 919 |     return buffer[namelen] == '\0'; | 
 | 920 | } | 
 | 921 |  | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 922 | static void  | 
 | 923 | find_syllable(const char *str, int *len, int *pos, int count, int column) | 
 | 924 | { | 
 | 925 |     int i, len1; | 
 | 926 |     *len = -1; | 
 | 927 |     for (i = 0; i < count; i++) { | 
 | 928 | 	char *s = hangul_syllables[i][column]; | 
 | 929 | 	len1 = strlen(s); | 
 | 930 | 	if (len1 <= *len) | 
 | 931 | 	    continue; | 
 | 932 | 	if (strncmp(str, s, len1) == 0) { | 
 | 933 | 	    *len = len1; | 
 | 934 | 	    *pos = i; | 
 | 935 | 	} | 
 | 936 |     } | 
 | 937 |     if (*len == -1) { | 
 | 938 | 	*len = 0; | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 939 |     } | 
 | 940 | } | 
 | 941 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 942 | static int | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 943 | _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 944 | { | 
 | 945 |     unsigned int h, v; | 
 | 946 |     unsigned int mask = code_size-1; | 
 | 947 |     unsigned int i, incr; | 
 | 948 |  | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 949 |     /* Check for hangul syllables. */ | 
 | 950 |     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { | 
| Thomas Wouters | 1e365b2 | 2006-03-01 21:58:30 +0000 | [diff] [blame] | 951 | 	int len, L = -1, V = -1, T = -1; | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 952 | 	const char *pos = name + 16; | 
 | 953 | 	find_syllable(pos, &len, &L, LCount, 0); | 
 | 954 | 	pos += len; | 
 | 955 | 	find_syllable(pos, &len, &V, VCount, 1); | 
 | 956 | 	pos += len; | 
 | 957 | 	find_syllable(pos, &len, &T, TCount, 2); | 
 | 958 | 	pos += len; | 
| Martin v. Löwis | 8b291e2 | 2005-09-18 08:17:56 +0000 | [diff] [blame] | 959 | 	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 960 | 	    *code = SBase + (L*VCount+V)*TCount + T; | 
 | 961 | 	    return 1; | 
 | 962 | 	} | 
| Martin v. Löwis | ef7fe2e | 2002-11-23 18:01:32 +0000 | [diff] [blame] | 963 |         /* Otherwise, it's an illegal syllable name. */ | 
 | 964 |         return 0; | 
 | 965 |     } | 
 | 966 |  | 
 | 967 |     /* Check for unified ideographs. */ | 
 | 968 |     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { | 
 | 969 |         /* Four or five hexdigits must follow. */ | 
 | 970 |         v = 0; | 
 | 971 |         name += 22; | 
 | 972 |         namelen -= 22; | 
 | 973 |         if (namelen != 4 && namelen != 5) | 
 | 974 |             return 0; | 
 | 975 |         while (namelen--) { | 
 | 976 |             v *= 16; | 
 | 977 |             if (*name >= '0' && *name <= '9') | 
 | 978 |                 v += *name - '0'; | 
 | 979 |             else if (*name >= 'A' && *name <= 'F') | 
 | 980 |                 v += *name - 'A' + 10; | 
 | 981 |             else | 
 | 982 |                 return 0; | 
 | 983 |             name++; | 
 | 984 |         } | 
| Martin v. Löwis | 8d93ca1 | 2002-11-23 22:10:29 +0000 | [diff] [blame] | 985 |         if (!is_unified_ideograph(v)) | 
 | 986 |             return 0; | 
| Martin v. Löwis | ef7fe2e | 2002-11-23 18:01:32 +0000 | [diff] [blame] | 987 |         *code = v; | 
 | 988 |         return 1; | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 989 |     } | 
 | 990 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 991 |     /* the following is the same as python's dictionary lookup, with | 
 | 992 |        only minor changes.  see the makeunicodedata script for more | 
 | 993 |        details */ | 
 | 994 |  | 
| Fredrik Lundh | b95896b | 2001-02-18 22:06:17 +0000 | [diff] [blame] | 995 |     h = (unsigned int) _gethash(name, namelen, code_magic); | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 996 |     i = (~h) & mask; | 
 | 997 |     v = code_hash[i]; | 
 | 998 |     if (!v) | 
 | 999 |         return 0; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1000 |     if (_cmpname(self, v, name, namelen)) { | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1001 |         *code = v; | 
 | 1002 |         return 1; | 
 | 1003 |     } | 
 | 1004 |     incr = (h ^ (h >> 3)) & mask; | 
 | 1005 |     if (!incr) | 
 | 1006 |         incr = mask; | 
 | 1007 |     for (;;) { | 
 | 1008 |         i = (i + incr) & mask; | 
 | 1009 |         v = code_hash[i]; | 
 | 1010 |         if (!v) | 
| Fredrik Lundh | ae76367 | 2001-02-18 11:41:49 +0000 | [diff] [blame] | 1011 |             return 0; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1012 |         if (_cmpname(self, v, name, namelen)) { | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1013 |             *code = v; | 
 | 1014 |             return 1; | 
 | 1015 |         } | 
 | 1016 |         incr = incr << 1; | 
 | 1017 |         if (incr > mask) | 
 | 1018 |             incr = incr ^ code_poly; | 
 | 1019 |     } | 
 | 1020 | } | 
 | 1021 |  | 
 | 1022 | static const _PyUnicode_Name_CAPI hashAPI =  | 
 | 1023 | { | 
 | 1024 |     sizeof(_PyUnicode_Name_CAPI), | 
| Andrew MacIntyre | 74a3bec | 2002-06-13 11:55:14 +0000 | [diff] [blame] | 1025 |     _getucname, | 
| Fredrik Lundh | b95896b | 2001-02-18 22:06:17 +0000 | [diff] [blame] | 1026 |     _getcode | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1027 | }; | 
 | 1028 |  | 
 | 1029 | /* -------------------------------------------------------------------- */ | 
 | 1030 | /* Python bindings */ | 
 | 1031 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 1032 | PyDoc_STRVAR(unicodedata_name__doc__, | 
 | 1033 | "name(unichr[, default])\n\ | 
 | 1034 | Returns the name assigned to the Unicode character unichr as a\n\ | 
 | 1035 | string. If no name is defined, default is returned, or, if not\n\ | 
 | 1036 | given, ValueError is raised."); | 
 | 1037 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1038 | static PyObject * | 
 | 1039 | unicodedata_name(PyObject* self, PyObject* args) | 
 | 1040 | { | 
 | 1041 |     char name[NAME_MAXLEN]; | 
 | 1042 |  | 
 | 1043 |     PyUnicodeObject* v; | 
 | 1044 |     PyObject* defobj = NULL; | 
 | 1045 |     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) | 
 | 1046 |         return NULL; | 
 | 1047 |  | 
 | 1048 |     if (PyUnicode_GET_SIZE(v) != 1) { | 
 | 1049 | 	PyErr_SetString(PyExc_TypeError, | 
 | 1050 | 			"need a single Unicode character as parameter"); | 
 | 1051 | 	return NULL; | 
 | 1052 |     } | 
 | 1053 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1054 |     if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v), | 
 | 1055 |                     name, sizeof(name))) { | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1056 | 	if (defobj == NULL) { | 
 | 1057 | 	    PyErr_SetString(PyExc_ValueError, "no such name"); | 
 | 1058 |             return NULL; | 
 | 1059 | 	} | 
 | 1060 | 	else { | 
 | 1061 | 	    Py_INCREF(defobj); | 
 | 1062 | 	    return defobj; | 
 | 1063 | 	} | 
 | 1064 |     } | 
 | 1065 |  | 
| Walter Dörwald | 4254e76 | 2007-06-05 16:04:09 +0000 | [diff] [blame] | 1066 |     return PyUnicode_FromString(name); | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1067 | } | 
 | 1068 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 1069 | PyDoc_STRVAR(unicodedata_lookup__doc__, | 
 | 1070 | "lookup(name)\n\ | 
 | 1071 | \n\ | 
 | 1072 | Look up character by name.  If a character with the\n\ | 
 | 1073 | given name is found, return the corresponding Unicode\n\ | 
 | 1074 | character.  If not found, KeyError is raised."); | 
 | 1075 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1076 | static PyObject * | 
 | 1077 | unicodedata_lookup(PyObject* self, PyObject* args) | 
 | 1078 | { | 
 | 1079 |     Py_UCS4 code; | 
| Guido van Rossum | 806c246 | 2007-08-06 23:33:07 +0000 | [diff] [blame] | 1080 |     Py_UNICODE str[2]; | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1081 |  | 
 | 1082 |     char* name; | 
 | 1083 |     int namelen; | 
 | 1084 |     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) | 
 | 1085 |         return NULL; | 
 | 1086 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1087 |     if (!_getcode(self, name, namelen, &code)) { | 
| Guido van Rossum | 806c246 | 2007-08-06 23:33:07 +0000 | [diff] [blame] | 1088 |         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", | 
 | 1089 |                      name); | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1090 |         return NULL; | 
 | 1091 |     } | 
 | 1092 |  | 
| Guido van Rossum | 806c246 | 2007-08-06 23:33:07 +0000 | [diff] [blame] | 1093 | #ifndef Py_UNICODE_WIDE | 
 | 1094 |     if (code >= 0x10000) { | 
 | 1095 |         str[0] = 0xd800 + ((code - 0x10000) >> 10); | 
 | 1096 |         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff); | 
 | 1097 |         return PyUnicode_FromUnicode(str, 2); | 
 | 1098 |     } | 
 | 1099 | #endif | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1100 |     str[0] = (Py_UNICODE) code; | 
| Guido van Rossum | 806c246 | 2007-08-06 23:33:07 +0000 | [diff] [blame] | 1101 |     return PyUnicode_FromUnicode(str, 1);     | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1102 | } | 
 | 1103 |  | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 1104 | /* XXX Add doc strings. */ | 
 | 1105 |  | 
 | 1106 | static PyMethodDef unicodedata_functions[] = { | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 1107 |     {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__}, | 
 | 1108 |     {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__}, | 
 | 1109 |     {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__}, | 
 | 1110 |     {"category", unicodedata_category, METH_VARARGS, | 
 | 1111 |                  unicodedata_category__doc__}, | 
 | 1112 |     {"bidirectional", unicodedata_bidirectional, METH_VARARGS, | 
 | 1113 |                       unicodedata_bidirectional__doc__}, | 
 | 1114 |     {"combining", unicodedata_combining, METH_VARARGS, | 
 | 1115 |                   unicodedata_combining__doc__}, | 
 | 1116 |     {"mirrored", unicodedata_mirrored, METH_VARARGS, | 
 | 1117 |                  unicodedata_mirrored__doc__}, | 
 | 1118 |     {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS, | 
 | 1119 |                          unicodedata_east_asian_width__doc__}, | 
 | 1120 |     {"decomposition", unicodedata_decomposition, METH_VARARGS, | 
 | 1121 |                       unicodedata_decomposition__doc__}, | 
 | 1122 |     {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__}, | 
 | 1123 |     {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, | 
 | 1124 |     {"normalize", unicodedata_normalize, METH_VARARGS, | 
 | 1125 |                   unicodedata_normalize__doc__}, | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 1126 |     {NULL, NULL}		/* sentinel */ | 
 | 1127 | }; | 
 | 1128 |  | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 1129 | static PyTypeObject UCD_Type = { | 
 | 1130 | 	/* The ob_type field must be initialized in the module init function | 
 | 1131 | 	 * to be portable to Windows without using C++. */ | 
| Martin v. Löwis | 9f2e346 | 2007-07-21 17:22:18 +0000 | [diff] [blame] | 1132 | 	PyVarObject_HEAD_INIT(NULL, 0) | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 1133 | 	"unicodedata.UCD",		/*tp_name*/ | 
 | 1134 | 	sizeof(PreviousDBVersion),	/*tp_basicsize*/ | 
 | 1135 | 	0,			/*tp_itemsize*/ | 
 | 1136 | 	/* methods */ | 
 | 1137 | 	(destructor)PyObject_Del, /*tp_dealloc*/ | 
 | 1138 | 	0,			/*tp_print*/ | 
 | 1139 | 	0,                      /*tp_getattr*/ | 
 | 1140 | 	0,			/*tp_setattr*/ | 
 | 1141 | 	0,			/*tp_compare*/ | 
 | 1142 | 	0,			/*tp_repr*/ | 
 | 1143 | 	0,			/*tp_as_number*/ | 
 | 1144 | 	0,			/*tp_as_sequence*/ | 
 | 1145 | 	0,			/*tp_as_mapping*/ | 
 | 1146 | 	0,			/*tp_hash*/ | 
 | 1147 |         0,                      /*tp_call*/ | 
 | 1148 |         0,                      /*tp_str*/ | 
 | 1149 |         PyObject_GenericGetAttr,/*tp_getattro*/ | 
 | 1150 |         0,                      /*tp_setattro*/ | 
 | 1151 |         0,                      /*tp_as_buffer*/ | 
 | 1152 |         Py_TPFLAGS_DEFAULT,     /*tp_flags*/ | 
 | 1153 |         0,                      /*tp_doc*/ | 
 | 1154 |         0,                      /*tp_traverse*/ | 
 | 1155 |         0,                      /*tp_clear*/ | 
 | 1156 |         0,                      /*tp_richcompare*/ | 
 | 1157 |         0,                      /*tp_weaklistoffset*/ | 
 | 1158 |         0,                      /*tp_iter*/ | 
 | 1159 |         0,                      /*tp_iternext*/ | 
 | 1160 |         unicodedata_functions,  /*tp_methods*/ | 
 | 1161 |         DB_members,             /*tp_members*/ | 
 | 1162 |         0,                      /*tp_getset*/ | 
 | 1163 |         0,                      /*tp_base*/ | 
 | 1164 |         0,                      /*tp_dict*/ | 
 | 1165 |         0,                      /*tp_descr_get*/ | 
 | 1166 |         0,                      /*tp_descr_set*/ | 
 | 1167 |         0,                      /*tp_dictoffset*/ | 
 | 1168 |         0,                      /*tp_init*/ | 
 | 1169 |         0,                      /*tp_alloc*/ | 
 | 1170 |         0,                      /*tp_new*/ | 
 | 1171 |         0,                      /*tp_free*/ | 
 | 1172 |         0,                      /*tp_is_gc*/ | 
 | 1173 | }; | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1174 |  | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 1175 | PyDoc_STRVAR(unicodedata_docstring, | 
 | 1176 | "This module provides access to the Unicode Character Database which\n\ | 
 | 1177 | defines character properties for all Unicode characters. The data in\n\ | 
 | 1178 | this database is based on the UnicodeData.txt file version\n\ | 
| Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1179 | 4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\ | 
| Hye-Shik Chang | cf18a5d | 2005-04-04 16:32:07 +0000 | [diff] [blame] | 1180 | \n\ | 
 | 1181 | The module uses the same names and symbols as defined by the\n\ | 
| Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1182 | UnicodeData File Format 4.1.0 (see\n\ | 
 | 1183 | http://www.unicode.org/Public/4.1.0/ucd/UCD.html)."); | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1184 |  | 
| Mark Hammond | 62b1ab1 | 2002-07-23 06:31:15 +0000 | [diff] [blame] | 1185 | PyMODINIT_FUNC | 
| Thomas Wouters | f3f33dc | 2000-07-21 06:00:07 +0000 | [diff] [blame] | 1186 | initunicodedata(void) | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 1187 | { | 
| Fred Drake | a2bd8d3 | 2002-04-03 21:39:26 +0000 | [diff] [blame] | 1188 |     PyObject *m, *v; | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1189 |  | 
| Martin v. Löwis | 9f2e346 | 2007-07-21 17:22:18 +0000 | [diff] [blame] | 1190 |     Py_Type(&UCD_Type) = &PyType_Type; | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 1191 |  | 
| Fred Drake | f585bef | 2001-03-03 19:41:55 +0000 | [diff] [blame] | 1192 |     m = Py_InitModule3( | 
 | 1193 |         "unicodedata", unicodedata_functions, unicodedata_docstring); | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1194 |     if (!m) | 
 | 1195 |         return; | 
 | 1196 |  | 
| Martin v. Löwis | b5c980b | 2002-11-25 09:13:37 +0000 | [diff] [blame] | 1197 |     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); | 
| Martin v. Löwis | 0e2f9b2 | 2006-03-10 11:29:32 +0000 | [diff] [blame] | 1198 |     Py_INCREF(&UCD_Type); | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 1199 |     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); | 
| Martin v. Löwis | b5c980b | 2002-11-25 09:13:37 +0000 | [diff] [blame] | 1200 |  | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1201 |     /* Previous versions */ | 
 | 1202 |     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); | 
 | 1203 |     if (v != NULL) | 
| Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 1204 |         PyModule_AddObject(m, "ucd_3_2_0", v); | 
| Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 1205 |  | 
| Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 1206 |     /* Export C API */ | 
 | 1207 |     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); | 
| Fred Drake | a2bd8d3 | 2002-04-03 21:39:26 +0000 | [diff] [blame] | 1208 |     if (v != NULL) | 
 | 1209 |         PyModule_AddObject(m, "ucnhash_CAPI", v); | 
| Guido van Rossum | 2a70a3a | 2000-03-10 23:10:21 +0000 | [diff] [blame] | 1210 | } | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 1211 |  | 
 | 1212 | /*  | 
 | 1213 | Local variables: | 
 | 1214 | c-basic-offset: 4 | 
| Martin v. Löwis | 677bde2 | 2002-11-23 22:08:15 +0000 | [diff] [blame] | 1215 | indent-tabs-mode: nil | 
| Martin v. Löwis | 7d41e29 | 2002-11-23 12:22:32 +0000 | [diff] [blame] | 1216 | End: | 
 | 1217 | */ |