blob: 021bacf7b6a3f2baf806b3a796097537dd877b42 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Martin v. Löwis7d41e292002-11-23 12:22:32 +00003 unicodedata -- Provides access to the Unicode 3.2 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Martin v. Löwis7d41e292002-11-23 12:22:32 +00005 Data was extracted from the Unicode 3.2 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Martin v. Löwis7d41e292002-11-23 12:22:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
17
18/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000019
/* Per-character property record.  Every field is a single byte; three of
   them are indices into name tables rather than the values themselves. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
30
31/* data file generated by Tools/unicode/makeunicodedata.py */
32#include "unicodedata_db.h"
33
34static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000035_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000036{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000037 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000038 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000039 index = 0;
40 else {
41 index = index1[(code>>SHIFT)];
42 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
43 }
44
45 return &_PyUnicode_Database_Records[index];
46}
47
Martin v. Löwis677bde22002-11-23 22:08:15 +000048static const _PyUnicode_DatabaseRecord*
49_getrecord(PyUnicodeObject* v)
50{
51 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
52}
53
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000054/* --- Module API --------------------------------------------------------- */
55
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +000056PyDoc_STRVAR(unicodedata_decimal__doc__,
57"decimal(unichr[, default])\n\
58\n\
59Returns the decimal value assigned to the Unicode character unichr\n\
60as integer. If no such value is defined, default is returned, or, if\n\
61not given, ValueError is raised.");
62
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000063static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000065{
66 PyUnicodeObject *v;
67 PyObject *defobj = NULL;
68 long rc;
69
Fredrik Lundh06d12682001-01-24 07:59:11 +000070 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000071 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000072 if (PyUnicode_GET_SIZE(v) != 1) {
73 PyErr_SetString(PyExc_TypeError,
74 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000075 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000076 }
77 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
78 if (rc < 0) {
79 if (defobj == NULL) {
80 PyErr_SetString(PyExc_ValueError,
81 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000082 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000083 }
84 else {
85 Py_INCREF(defobj);
86 return defobj;
87 }
88 }
89 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000090}
91
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +000092PyDoc_STRVAR(unicodedata_digit__doc__,
93"digit(unichr[, default])\n\
94\n\
95Returns the digit value assigned to the Unicode character unichr as\n\
96integer. If no such value is defined, default is returned, or, if\n\
97not given, ValueError is raised.");
98
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000099static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000100unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000101{
102 PyUnicodeObject *v;
103 PyObject *defobj = NULL;
104 long rc;
105
Fredrik Lundh06d12682001-01-24 07:59:11 +0000106 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000107 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108 if (PyUnicode_GET_SIZE(v) != 1) {
109 PyErr_SetString(PyExc_TypeError,
110 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000111 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000112 }
113 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
114 if (rc < 0) {
115 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000116 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118 }
119 else {
120 Py_INCREF(defobj);
121 return defobj;
122 }
123 }
124 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125}
126
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000127PyDoc_STRVAR(unicodedata_numeric__doc__,
128"numeric(unichr[, default])\n\
129\n\
130Returns the numeric value assigned to the Unicode character unichr\n\
131as float. If no such value is defined, default is returned, or, if\n\
132not given, ValueError is raised.");
133
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000134static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000135unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000136{
137 PyUnicodeObject *v;
138 PyObject *defobj = NULL;
139 double rc;
140
Fredrik Lundh06d12682001-01-24 07:59:11 +0000141 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000142 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000143 if (PyUnicode_GET_SIZE(v) != 1) {
144 PyErr_SetString(PyExc_TypeError,
145 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000146 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 }
148 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
149 if (rc < 0) {
150 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000151 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000152 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153 }
154 else {
155 Py_INCREF(defobj);
156 return defobj;
157 }
158 }
159 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000160}
161
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000162PyDoc_STRVAR(unicodedata_category__doc__,
163"category(unichr)\n\
164\n\
165Returns the general category assigned to the Unicode character\n\
166unichr as string.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 int index;
173
174 if (!PyArg_ParseTuple(args, "O!:category",
175 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177 if (PyUnicode_GET_SIZE(v) != 1) {
178 PyErr_SetString(PyExc_TypeError,
179 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000182 index = (int) _getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000184}
185
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000186PyDoc_STRVAR(unicodedata_bidirectional__doc__,
187"bidirectional(unichr)\n\
188\n\
189Returns the bidirectional category assigned to the Unicode character\n\
190unichr as string. If no such value is defined, an empty string is\n\
191returned.");
192
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000194unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195{
196 PyUnicodeObject *v;
197 int index;
198
199 if (!PyArg_ParseTuple(args, "O!:bidirectional",
200 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000201 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202 if (PyUnicode_GET_SIZE(v) != 1) {
203 PyErr_SetString(PyExc_TypeError,
204 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000205 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000206 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000207 index = (int) _getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209}
210
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000211PyDoc_STRVAR(unicodedata_combining__doc__,
212"combining(unichr)\n\
213\n\
214Returns the canonical combining class assigned to the Unicode\n\
215character unichr as integer. Returns 0 if no combining class is\n\
216defined.");
217
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000218static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000219unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000220{
221 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000222
223 if (!PyArg_ParseTuple(args, "O!:combining",
224 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000225 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 if (PyUnicode_GET_SIZE(v) != 1) {
227 PyErr_SetString(PyExc_TypeError,
228 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000229 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000230 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000231 return PyInt_FromLong((int) _getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000232}
233
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000234PyDoc_STRVAR(unicodedata_mirrored__doc__,
235"mirrored(unichr)\n\
236\n\
237Returns the mirrored property assigned to the Unicode character\n\
238unichr as integer. Returns 1 if the character has been identified as\n\
239a \"mirrored\" character in bidirectional text, 0 otherwise.");
240
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000242unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243{
244 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000245
246 if (!PyArg_ParseTuple(args, "O!:mirrored",
247 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000248 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000249 if (PyUnicode_GET_SIZE(v) != 1) {
250 PyErr_SetString(PyExc_TypeError,
251 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000254 return PyInt_FromLong((int) _getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000255}
256
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000257PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
258"east_asian_width(unichr)\n\
259\n\
260Returns the east asian width assigned to the Unicode character\n\
261unichr as string.");
262
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000263static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000264unicodedata_east_asian_width(PyObject *self, PyObject *args)
265{
266 PyUnicodeObject *v;
267 int index;
268
269 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
270 &PyUnicode_Type, &v))
271 return NULL;
272 if (PyUnicode_GET_SIZE(v) != 1) {
273 PyErr_SetString(PyExc_TypeError,
274 "need a single Unicode character as parameter");
275 return NULL;
276 }
277 index = (int) _getrecord(v)->east_asian_width;
278 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
279}
280
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000281PyDoc_STRVAR(unicodedata_decomposition__doc__,
282"decomposition(unichr)\n\
283\n\
284Returns the character decomposition mapping assigned to the Unicode\n\
285character unichr as string. An empty string is returned in case no\n\
286such mapping is defined.");
287
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000288static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000289unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000290{
291 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000292 char decomp[256];
293 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294
295 if (!PyArg_ParseTuple(args, "O!:decomposition",
296 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000297 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298 if (PyUnicode_GET_SIZE(v) != 1) {
299 PyErr_SetString(PyExc_TypeError,
300 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000301 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000303
304 code = (int) *PyUnicode_AS_UNICODE(v);
305
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000306 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000307 index = 0;
308 else {
309 index = decomp_index1[(code>>DECOMP_SHIFT)];
310 index = decomp_index2[(index<<DECOMP_SHIFT)+
311 (code&((1<<DECOMP_SHIFT)-1))];
312 }
313
Tim Peters69b83b12001-11-30 07:23:05 +0000314 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000315 is prefix code (from*/
316 count = decomp_data[index] >> 8;
317
318 /* XXX: could allocate the PyString up front instead
319 (strlen(prefix) + 5 * count + 1 bytes) */
320
321 /* copy prefix */
322 i = strlen(decomp_prefix[decomp_data[index] & 255]);
323 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
324
325 while (count-- > 0) {
326 if (i)
327 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000328 assert((size_t)i < sizeof(decomp));
329 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
330 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000331 i += strlen(decomp + i);
332 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000334 decomp[i] = '\0';
335
336 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000337}
338
Martin v. Löwis677bde22002-11-23 22:08:15 +0000339void
340get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
341{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000342 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000343 *index = 0;
344 }
345 else {
346 *index = decomp_index1[(code>>DECOMP_SHIFT)];
347 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
348 (code&((1<<DECOMP_SHIFT)-1))];
349 }
350
351 /* high byte is number of hex bytes (usually one or two), low byte
352 is prefix code (from*/
353 *count = decomp_data[*index] >> 8;
354 *prefix = decomp_data[*index] & 255;
355
356 (*index)++;
357}
358
/* Constants for arithmetic Hangul syllable (de)composition. */
#define SBase   0xAC00                  /* first precomposed syllable */
#define LBase   0x1100                  /* base of the L jamo range */
#define VBase   0x1161                  /* base of the V jamo range */
#define TBase   0x11A7                  /* base of the T jamo range */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)         /* syllables per L value */
#define SCount  (LCount*NCount)         /* total precomposed syllables */
368
369static PyObject*
370nfd_nfkd(PyObject *input, int k)
371{
372 PyObject *result;
373 Py_UNICODE *i, *end, *o;
374 /* Longest decomposition in Unicode 3.2: U+FDFA */
375 Py_UNICODE stack[20];
376 int space, stackptr, isize;
377 int index, prefix, count;
378 unsigned char prev, cur;
379
380 stackptr = 0;
381 isize = PyUnicode_GET_SIZE(input);
382 /* Overallocate atmost 10 characters. */
383 space = (isize > 10 ? 10 : isize) + isize;
384 result = PyUnicode_FromUnicode(NULL, space);
385 if (!result)
386 return NULL;
387 i = PyUnicode_AS_UNICODE(input);
388 end = i + isize;
389 o = PyUnicode_AS_UNICODE(result);
390
391 while (i < end) {
392 stack[stackptr++] = *i++;
393 while(stackptr) {
394 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000395 /* Hangul Decomposition adds three characters in
396 a single step, so we need atleast that much room. */
397 if (space < 3) {
398 int newsize = PyString_GET_SIZE(result) + 10;
399 space += 10;
400 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000401 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000402 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000403 }
404 /* Hangul Decomposition. */
405 if (SBase <= code && code < (SBase+SCount)) {
406 int SIndex = code - SBase;
407 int L = LBase + SIndex / NCount;
408 int V = VBase + (SIndex % NCount) / TCount;
409 int T = TBase + SIndex % TCount;
410 *o++ = L;
411 *o++ = V;
412 space -= 2;
413 if (T != TBase) {
414 *o++ = T;
415 space --;
416 }
417 continue;
418 }
419 /* Other decompoistions. */
420 get_decomp_record(code, &index, &prefix, &count);
421
422 /* Copy character if it is not decomposable, or has a
423 compatibility decomposition, but we do NFD. */
424 if (!count || (prefix && !k)) {
425 *o++ = code;
426 space--;
427 continue;
428 }
429 /* Copy decomposition onto the stack, in reverse
430 order. */
431 while(count) {
432 code = decomp_data[index + (--count)];
433 stack[stackptr++] = code;
434 }
435 }
436 }
437
438 /* Drop overallocation. Cannot fail. */
439 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
440
441 /* Sort canonically. */
442 i = PyUnicode_AS_UNICODE(result);
443 prev = _getrecord_ex(*i)->combining;
444 end = i + PyUnicode_GET_SIZE(result);
445 for (i++; i < end; i++) {
446 cur = _getrecord_ex(*i)->combining;
447 if (prev == 0 || cur == 0 || prev <= cur) {
448 prev = cur;
449 continue;
450 }
451 /* Non-canonical order. Need to switch *i with previous. */
452 o = i - 1;
453 while (1) {
454 Py_UNICODE tmp = o[1];
455 o[1] = o[0];
456 o[0] = tmp;
457 o--;
458 if (o < PyUnicode_AS_UNICODE(result))
459 break;
460 prev = _getrecord_ex(*o)->combining;
461 if (prev == 0 || prev <= cur)
462 break;
463 }
464 prev = _getrecord_ex(*i)->combining;
465 }
466 return result;
467}
468
469static int
470find_nfc_index(struct reindex* nfc, Py_UNICODE code)
471{
472 int index;
473 for (index = 0; nfc[index].start; index++) {
474 int start = nfc[index].start;
475 if (code < start)
476 return -1;
477 if (code <= start + nfc[index].count) {
478 int delta = code - start;
479 return nfc[index].index + delta;
480 }
481 }
482 return -1;
483}
484
485static PyObject*
486nfc_nfkc(PyObject *input, int k)
487{
488 PyObject *result;
489 Py_UNICODE *i, *i1, *o, *end;
490 int f,l,index,index1,comb;
491 Py_UNICODE code;
492 Py_UNICODE *skipped[20];
493 int cskipped = 0;
494
495 result = nfd_nfkd(input, k);
496 if (!result)
497 return NULL;
498
499 /* We are going to modify result in-place.
500 If nfd_nfkd is changed to sometimes return the input,
501 this code needs to be reviewed. */
502 assert(result != input);
503
504 i = PyUnicode_AS_UNICODE(result);
505 end = i + PyUnicode_GET_SIZE(result);
506 o = PyUnicode_AS_UNICODE(result);
507
508 again:
509 while (i < end) {
510 for (index = 0; index < cskipped; index++) {
511 if (skipped[index] == i) {
512 /* *i character is skipped.
513 Remove from list. */
514 skipped[index] = skipped[cskipped-1];
515 cskipped--;
516 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000517 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000518 }
519 }
520 /* Hangul Composition. We don't need to check for <LV,T>
521 pairs, since we always have decomposed data. */
522 if (LBase <= *i && *i < (LBase+LCount) &&
523 i + 1 < end &&
524 VBase <= i[1] && i[1] <= (VBase+VCount)) {
525 int LIndex, VIndex;
526 LIndex = i[0] - LBase;
527 VIndex = i[1] - VBase;
528 code = SBase + (LIndex*VCount+VIndex)*TCount;
529 i+=2;
530 if (i < end &&
531 TBase <= *i && *i <= (TBase+TCount)) {
532 code += *i-TBase;
533 i++;
534 }
535 *o++ = code;
536 continue;
537 }
538
539 f = find_nfc_index(nfc_first, *i);
540 if (f == -1) {
541 *o++ = *i++;
542 continue;
543 }
544 /* Find next unblocked character. */
545 i1 = i+1;
546 comb = 0;
547 while (i1 < end) {
548 int comb1 = _getrecord_ex(*i1)->combining;
549 if (comb1 && comb == comb1) {
550 /* Character is blocked. */
551 i1++;
552 continue;
553 }
554 l = find_nfc_index(nfc_last, *i1);
555 /* *i1 cannot be combined with *i. If *i1
556 is a starter, we don't need to look further.
557 Otherwise, record the combining class. */
558 if (l == -1) {
559 not_combinable:
560 if (comb1 == 0)
561 break;
562 comb = comb1;
563 i1++;
564 continue;
565 }
566 index = f*TOTAL_LAST + l;
567 index1 = comp_index[index >> COMP_SHIFT];
568 code = comp_data[(index1<<COMP_SHIFT)+
569 (index&((1<<COMP_SHIFT)-1))];
570 if (code == 0)
571 goto not_combinable;
572
573 /* Replace the original character. */
574 *i = code;
575 /* Mark the second character unused. */
576 skipped[cskipped++] = i1;
577 i1++;
578 f = find_nfc_index(nfc_first, *i);
579 if (f == -1)
580 break;
581 }
582 *o++ = *i++;
583 }
584 if (o != end)
585 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
586 return result;
587}
588
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000589PyDoc_STRVAR(unicodedata_normalize__doc__,
590"normalize(form, unistr)\n\
591\n\
592Return the normal form 'form' for the Unicode string unistr. Valid\n\
593values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
594
Martin v. Löwis677bde22002-11-23 22:08:15 +0000595static PyObject*
596unicodedata_normalize(PyObject *self, PyObject *args)
597{
598 char *form;
599 PyObject *input;
600
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000601 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000602 &form, &PyUnicode_Type, &input))
603 return NULL;
604
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000605 if (PyUnicode_GetSize(input) == 0) {
606 /* Special case empty input strings, since resizing
607 them later would cause internal errors. */
608 Py_INCREF(input);
609 return input;
610 }
611
Martin v. Löwis677bde22002-11-23 22:08:15 +0000612 if (strcmp(form, "NFC") == 0)
613 return nfc_nfkc(input, 0);
614 if (strcmp(form, "NFKC") == 0)
615 return nfc_nfkc(input, 1);
616 if (strcmp(form, "NFD") == 0)
617 return nfd_nfkd(input, 0);
618 if (strcmp(form, "NFKD") == 0)
619 return nfd_nfkd(input, 1);
620 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
621 return NULL;
622}
623
Fredrik Lundh06d12682001-01-24 07:59:11 +0000624/* -------------------------------------------------------------------- */
625/* unicode character name tables */
626
627/* data file generated by Tools/unicode/makeunicodedata.py */
628#include "unicodename_db.h"
629
630/* -------------------------------------------------------------------- */
631/* database code (cut and pasted from the unidb package) */
632
/* Case-insensitive hash of the first `len` bytes of `s`.
   Each byte is upper-cased and folded in as h = h * scale + byte; whenever
   bits 24..31 become non-zero they are XOR-ed into the low byte and the
   value is truncated to 24 bits.  Returns the resulting hash. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values and EOF, so
           convert before the call; a plain char may be negative. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
647
/* Name fragments used to build and parse Hangul syllable names.
   Column 0 is indexed by L, column 1 by V and column 2 by T; columns
   that have fewer entries than rows are padded with NULL. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
678
Fredrik Lundh06d12682001-01-24 07:59:11 +0000679static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000680is_unified_ideograph(Py_UCS4 code)
681{
682 return (
683 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
684 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
685 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
686}
687
688static int
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000689_getucname(Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000690{
691 int offset;
692 int i;
693 int word;
694 unsigned char* w;
695
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000696 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000697 /* Hangul syllable. */
698 int SIndex = code - SBase;
699 int L = SIndex / NCount;
700 int V = (SIndex % NCount) / TCount;
701 int T = SIndex % TCount;
702
703 if (buflen < 27)
704 /* Worst case: HANGUL SYLLABLE <10chars>. */
705 return 0;
706 strcpy(buffer, "HANGUL SYLLABLE ");
707 buffer += 16;
708 strcpy(buffer, hangul_syllables[L][0]);
709 buffer += strlen(hangul_syllables[L][0]);
710 strcpy(buffer, hangul_syllables[V][1]);
711 buffer += strlen(hangul_syllables[V][1]);
712 strcpy(buffer, hangul_syllables[T][2]);
713 buffer += strlen(hangul_syllables[T][2]);
714 *buffer = '\0';
715 return 1;
716 }
717
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000718 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000719 if (buflen < 28)
720 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
721 return 0;
722 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
723 return 1;
724 }
725
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000726 if (code >= 0x110000)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000727 return 0;
728
729 /* get offset into phrasebook */
730 offset = phrasebook_offset1[(code>>phrasebook_shift)];
731 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
732 (code&((1<<phrasebook_shift)-1))];
733 if (!offset)
734 return 0;
735
736 i = 0;
737
738 for (;;) {
739 /* get word index */
740 word = phrasebook[offset] - phrasebook_short;
741 if (word >= 0) {
742 word = (word << 8) + phrasebook[offset+1];
743 offset += 2;
744 } else
745 word = phrasebook[offset++];
746 if (i) {
747 if (i > buflen)
748 return 0; /* buffer overflow */
749 buffer[i++] = ' ';
750 }
751 /* copy word string from lexicon. the last character in the
752 word has bit 7 set. the last word in a string ends with
753 0x80 */
754 w = lexicon + lexicon_offset[word];
755 while (*w < 128) {
756 if (i >= buflen)
757 return 0; /* buffer overflow */
758 buffer[i++] = *w++;
759 }
760 if (i >= buflen)
761 return 0; /* buffer overflow */
762 buffer[i++] = *w & 127;
763 if (*w == 128)
764 break; /* end of word */
765 }
766
767 return 1;
768}
769
770static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000771_cmpname(int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000772{
773 /* check if code corresponds to the given name */
774 int i;
775 char buffer[NAME_MAXLEN];
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000776 if (!_getucname(code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000777 return 0;
778 for (i = 0; i < namelen; i++) {
779 if (toupper(name[i]) != buffer[i])
780 return 0;
781 }
782 return buffer[namelen] == '\0';
783}
784
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000785static void
786find_syllable(const char *str, int *len, int *pos, int count, int column)
787{
788 int i, len1;
789 *len = -1;
790 for (i = 0; i < count; i++) {
791 char *s = hangul_syllables[i][column];
792 len1 = strlen(s);
793 if (len1 <= *len)
794 continue;
795 if (strncmp(str, s, len1) == 0) {
796 *len = len1;
797 *pos = i;
798 }
799 }
800 if (*len == -1) {
801 *len = 0;
802 *pos = -1;
803 }
804}
805
Fredrik Lundh06d12682001-01-24 07:59:11 +0000806static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000807_getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000808{
809 unsigned int h, v;
810 unsigned int mask = code_size-1;
811 unsigned int i, incr;
812
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000813 /* Check for hangul syllables. */
814 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
815 int L, V, T, len;
816 const char *pos = name + 16;
817 find_syllable(pos, &len, &L, LCount, 0);
818 pos += len;
819 find_syllable(pos, &len, &V, VCount, 1);
820 pos += len;
821 find_syllable(pos, &len, &T, TCount, 2);
822 pos += len;
823 if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
824 *code = SBase + (L*VCount+V)*TCount + T;
825 return 1;
826 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000827 /* Otherwise, it's an illegal syllable name. */
828 return 0;
829 }
830
831 /* Check for unified ideographs. */
832 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
833 /* Four or five hexdigits must follow. */
834 v = 0;
835 name += 22;
836 namelen -= 22;
837 if (namelen != 4 && namelen != 5)
838 return 0;
839 while (namelen--) {
840 v *= 16;
841 if (*name >= '0' && *name <= '9')
842 v += *name - '0';
843 else if (*name >= 'A' && *name <= 'F')
844 v += *name - 'A' + 10;
845 else
846 return 0;
847 name++;
848 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000849 if (!is_unified_ideograph(v))
850 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000851 *code = v;
852 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000853 }
854
Fredrik Lundh06d12682001-01-24 07:59:11 +0000855 /* the following is the same as python's dictionary lookup, with
856 only minor changes. see the makeunicodedata script for more
857 details */
858
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000859 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000860 i = (~h) & mask;
861 v = code_hash[i];
862 if (!v)
863 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000864 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000865 *code = v;
866 return 1;
867 }
868 incr = (h ^ (h >> 3)) & mask;
869 if (!incr)
870 incr = mask;
871 for (;;) {
872 i = (i + incr) & mask;
873 v = code_hash[i];
874 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +0000875 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000876 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000877 *code = v;
878 return 1;
879 }
880 incr = incr << 1;
881 if (incr > mask)
882 incr = incr ^ code_poly;
883 }
884}
885
886static const _PyUnicode_Name_CAPI hashAPI =
887{
888 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000889 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000890 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +0000891};
892
893/* -------------------------------------------------------------------- */
894/* Python bindings */
895
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000896PyDoc_STRVAR(unicodedata_name__doc__,
897"name(unichr[, default])\n\
898Returns the name assigned to the Unicode character unichr as a\n\
899string. If no name is defined, default is returned, or, if not\n\
900given, ValueError is raised.");
901
Fredrik Lundh06d12682001-01-24 07:59:11 +0000902static PyObject *
903unicodedata_name(PyObject* self, PyObject* args)
904{
905 char name[NAME_MAXLEN];
906
907 PyUnicodeObject* v;
908 PyObject* defobj = NULL;
909 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
910 return NULL;
911
912 if (PyUnicode_GET_SIZE(v) != 1) {
913 PyErr_SetString(PyExc_TypeError,
914 "need a single Unicode character as parameter");
915 return NULL;
916 }
917
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000918 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000919 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000920 if (defobj == NULL) {
921 PyErr_SetString(PyExc_ValueError, "no such name");
922 return NULL;
923 }
924 else {
925 Py_INCREF(defobj);
926 return defobj;
927 }
928 }
929
930 return Py_BuildValue("s", name);
931}
932
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000933PyDoc_STRVAR(unicodedata_lookup__doc__,
934"lookup(name)\n\
935\n\
936Look up character by name. If a character with the\n\
937given name is found, return the corresponding Unicode\n\
938character. If not found, KeyError is raised.");
939
Fredrik Lundh06d12682001-01-24 07:59:11 +0000940static PyObject *
941unicodedata_lookup(PyObject* self, PyObject* args)
942{
943 Py_UCS4 code;
944 Py_UNICODE str[1];
945
946 char* name;
947 int namelen;
948 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
949 return NULL;
950
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000951 if (!_getcode(name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000952 char fmt[] = "undefined character name '%s'";
953 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
954 sprintf(buf, fmt, name);
955 PyErr_SetString(PyExc_KeyError, buf);
956 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000957 return NULL;
958 }
959
960 str[0] = (Py_UNICODE) code;
961 return PyUnicode_FromUnicode(str, 1);
962}
963
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000964/* XXX Add doc strings. */
965
966static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000967 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
968 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
969 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
970 {"category", unicodedata_category, METH_VARARGS,
971 unicodedata_category__doc__},
972 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
973 unicodedata_bidirectional__doc__},
974 {"combining", unicodedata_combining, METH_VARARGS,
975 unicodedata_combining__doc__},
976 {"mirrored", unicodedata_mirrored, METH_VARARGS,
977 unicodedata_mirrored__doc__},
978 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
979 unicodedata_east_asian_width__doc__},
980 {"decomposition", unicodedata_decomposition, METH_VARARGS,
981 unicodedata_decomposition__doc__},
982 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
983 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
984 {"normalize", unicodedata_normalize, METH_VARARGS,
985 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000986 {NULL, NULL} /* sentinel */
987};
988
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000989PyDoc_STRVAR(unicodedata_docstring,
990"This module provides access to the Unicode Character Database which\n\
991defines character properties for all Unicode characters. The data in\n\
992this database is based on the UnicodeData.txt file version\n\
9933.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
994\n\
995The module uses the same names and symbols as defined by the\n\
996UnicodeData File Format 3.2.0 (see\n\
Hye-Shik Chang4c560ea2005-06-04 07:31:48 +0000997http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +0000998
Mark Hammond62b1ab12002-07-23 06:31:15 +0000999PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001000initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001001{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001002 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001003
Fred Drakef585bef2001-03-03 19:41:55 +00001004 m = Py_InitModule3(
1005 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001006 if (!m)
1007 return;
1008
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001009 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1010
Fredrik Lundh06d12682001-01-24 07:59:11 +00001011 /* Export C API */
1012 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001013 if (v != NULL)
1014 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001015}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001016
1017/*
1018Local variables:
1019c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001020indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001021End:
1022*/