blob: 4a1e94c83c74a1e67d79f00b1e428dcabaebbd50 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
17
18/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000019
/* One row of the character property database.  Every field is a single
   byte: three of them are indexes into a name table, the other two carry
   the property value directly. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
30
31/* data file generated by Tools/unicode/makeunicodedata.py */
32#include "unicodedata_db.h"
33
34static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000035_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000036{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000037 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000038 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000039 index = 0;
40 else {
41 index = index1[(code>>SHIFT)];
42 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
43 }
44
45 return &_PyUnicode_Database_Records[index];
46}
47
Martin v. Löwis677bde22002-11-23 22:08:15 +000048static const _PyUnicode_DatabaseRecord*
49_getrecord(PyUnicodeObject* v)
50{
51 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
52}
53
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000054/* --- Module API --------------------------------------------------------- */
55
56static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000058{
59 PyUnicodeObject *v;
60 PyObject *defobj = NULL;
61 long rc;
62
Fredrik Lundh06d12682001-01-24 07:59:11 +000063 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000065 if (PyUnicode_GET_SIZE(v) != 1) {
66 PyErr_SetString(PyExc_TypeError,
67 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000068 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000069 }
70 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
71 if (rc < 0) {
72 if (defobj == NULL) {
73 PyErr_SetString(PyExc_ValueError,
74 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000075 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000076 }
77 else {
78 Py_INCREF(defobj);
79 return defobj;
80 }
81 }
82 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000083}
84
85static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000086unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000087{
88 PyUnicodeObject *v;
89 PyObject *defobj = NULL;
90 long rc;
91
Fredrik Lundh06d12682001-01-24 07:59:11 +000092 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000093 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000094 if (PyUnicode_GET_SIZE(v) != 1) {
95 PyErr_SetString(PyExc_TypeError,
96 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000097 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000098 }
99 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
100 if (rc < 0) {
101 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000102 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000103 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000104 }
105 else {
106 Py_INCREF(defobj);
107 return defobj;
108 }
109 }
110 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000111}
112
113static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000114unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000115{
116 PyUnicodeObject *v;
117 PyObject *defobj = NULL;
118 double rc;
119
Fredrik Lundh06d12682001-01-24 07:59:11 +0000120 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000121 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 if (PyUnicode_GET_SIZE(v) != 1) {
123 PyErr_SetString(PyExc_TypeError,
124 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000125 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000126 }
127 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
128 if (rc < 0) {
129 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000130 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000131 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000132 }
133 else {
134 Py_INCREF(defobj);
135 return defobj;
136 }
137 }
138 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000139}
140
141static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000142unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000143{
144 PyUnicodeObject *v;
145 int index;
146
147 if (!PyArg_ParseTuple(args, "O!:category",
148 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000149 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000150 if (PyUnicode_GET_SIZE(v) != 1) {
151 PyErr_SetString(PyExc_TypeError,
152 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000153 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000154 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000155 index = (int) _getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157}
158
159static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000160unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000161{
162 PyUnicodeObject *v;
163 int index;
164
165 if (!PyArg_ParseTuple(args, "O!:bidirectional",
166 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000167 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168 if (PyUnicode_GET_SIZE(v) != 1) {
169 PyErr_SetString(PyExc_TypeError,
170 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000173 index = (int) _getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175}
176
177static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000178unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000179{
180 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181
182 if (!PyArg_ParseTuple(args, "O!:combining",
183 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185 if (PyUnicode_GET_SIZE(v) != 1) {
186 PyErr_SetString(PyExc_TypeError,
187 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000188 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000190 return PyInt_FromLong((int) _getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191}
192
193static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000194unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195{
196 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197
198 if (!PyArg_ParseTuple(args, "O!:mirrored",
199 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000200 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 if (PyUnicode_GET_SIZE(v) != 1) {
202 PyErr_SetString(PyExc_TypeError,
203 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000206 return PyInt_FromLong((int) _getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207}
208
209static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000210unicodedata_east_asian_width(PyObject *self, PyObject *args)
211{
212 PyUnicodeObject *v;
213 int index;
214
215 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
216 &PyUnicode_Type, &v))
217 return NULL;
218 if (PyUnicode_GET_SIZE(v) != 1) {
219 PyErr_SetString(PyExc_TypeError,
220 "need a single Unicode character as parameter");
221 return NULL;
222 }
223 index = (int) _getrecord(v)->east_asian_width;
224 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
225}
226
227static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000228unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000229{
230 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000231 char decomp[256];
232 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233
234 if (!PyArg_ParseTuple(args, "O!:decomposition",
235 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000236 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000237 if (PyUnicode_GET_SIZE(v) != 1) {
238 PyErr_SetString(PyExc_TypeError,
239 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000240 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000242
243 code = (int) *PyUnicode_AS_UNICODE(v);
244
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000245 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000246 index = 0;
247 else {
248 index = decomp_index1[(code>>DECOMP_SHIFT)];
249 index = decomp_index2[(index<<DECOMP_SHIFT)+
250 (code&((1<<DECOMP_SHIFT)-1))];
251 }
252
Tim Peters69b83b12001-11-30 07:23:05 +0000253 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000254 is prefix code (from*/
255 count = decomp_data[index] >> 8;
256
257 /* XXX: could allocate the PyString up front instead
258 (strlen(prefix) + 5 * count + 1 bytes) */
259
260 /* copy prefix */
261 i = strlen(decomp_prefix[decomp_data[index] & 255]);
262 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
263
264 while (count-- > 0) {
265 if (i)
266 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000267 assert((size_t)i < sizeof(decomp));
268 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
269 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000270 i += strlen(decomp + i);
271 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000273 decomp[i] = '\0';
274
275 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000276}
277
Martin v. Löwis677bde22002-11-23 22:08:15 +0000278void
279get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
280{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000281 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000282 *index = 0;
283 }
284 else {
285 *index = decomp_index1[(code>>DECOMP_SHIFT)];
286 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
287 (code&((1<<DECOMP_SHIFT)-1))];
288 }
289
290 /* high byte is number of hex bytes (usually one or two), low byte
291 is prefix code (from*/
292 *count = decomp_data[*index] >> 8;
293 *prefix = decomp_data[*index] & 255;
294
295 (*index)++;
296}
297
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one below the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
307
308static PyObject*
309nfd_nfkd(PyObject *input, int k)
310{
311 PyObject *result;
312 Py_UNICODE *i, *end, *o;
313 /* Longest decomposition in Unicode 3.2: U+FDFA */
314 Py_UNICODE stack[20];
315 int space, stackptr, isize;
316 int index, prefix, count;
317 unsigned char prev, cur;
318
319 stackptr = 0;
320 isize = PyUnicode_GET_SIZE(input);
321 /* Overallocate atmost 10 characters. */
322 space = (isize > 10 ? 10 : isize) + isize;
323 result = PyUnicode_FromUnicode(NULL, space);
324 if (!result)
325 return NULL;
326 i = PyUnicode_AS_UNICODE(input);
327 end = i + isize;
328 o = PyUnicode_AS_UNICODE(result);
329
330 while (i < end) {
331 stack[stackptr++] = *i++;
332 while(stackptr) {
333 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000334 /* Hangul Decomposition adds three characters in
335 a single step, so we need atleast that much room. */
336 if (space < 3) {
337 int newsize = PyString_GET_SIZE(result) + 10;
338 space += 10;
339 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000340 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000341 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000342 }
343 /* Hangul Decomposition. */
344 if (SBase <= code && code < (SBase+SCount)) {
345 int SIndex = code - SBase;
346 int L = LBase + SIndex / NCount;
347 int V = VBase + (SIndex % NCount) / TCount;
348 int T = TBase + SIndex % TCount;
349 *o++ = L;
350 *o++ = V;
351 space -= 2;
352 if (T != TBase) {
353 *o++ = T;
354 space --;
355 }
356 continue;
357 }
358 /* Other decompoistions. */
359 get_decomp_record(code, &index, &prefix, &count);
360
361 /* Copy character if it is not decomposable, or has a
362 compatibility decomposition, but we do NFD. */
363 if (!count || (prefix && !k)) {
364 *o++ = code;
365 space--;
366 continue;
367 }
368 /* Copy decomposition onto the stack, in reverse
369 order. */
370 while(count) {
371 code = decomp_data[index + (--count)];
372 stack[stackptr++] = code;
373 }
374 }
375 }
376
377 /* Drop overallocation. Cannot fail. */
378 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
379
380 /* Sort canonically. */
381 i = PyUnicode_AS_UNICODE(result);
382 prev = _getrecord_ex(*i)->combining;
383 end = i + PyUnicode_GET_SIZE(result);
384 for (i++; i < end; i++) {
385 cur = _getrecord_ex(*i)->combining;
386 if (prev == 0 || cur == 0 || prev <= cur) {
387 prev = cur;
388 continue;
389 }
390 /* Non-canonical order. Need to switch *i with previous. */
391 o = i - 1;
392 while (1) {
393 Py_UNICODE tmp = o[1];
394 o[1] = o[0];
395 o[0] = tmp;
396 o--;
397 if (o < PyUnicode_AS_UNICODE(result))
398 break;
399 prev = _getrecord_ex(*o)->combining;
400 if (prev == 0 || prev <= cur)
401 break;
402 }
403 prev = _getrecord_ex(*i)->combining;
404 }
405 return result;
406}
407
408static int
409find_nfc_index(struct reindex* nfc, Py_UNICODE code)
410{
411 int index;
412 for (index = 0; nfc[index].start; index++) {
413 int start = nfc[index].start;
414 if (code < start)
415 return -1;
416 if (code <= start + nfc[index].count) {
417 int delta = code - start;
418 return nfc[index].index + delta;
419 }
420 }
421 return -1;
422}
423
424static PyObject*
425nfc_nfkc(PyObject *input, int k)
426{
427 PyObject *result;
428 Py_UNICODE *i, *i1, *o, *end;
429 int f,l,index,index1,comb;
430 Py_UNICODE code;
431 Py_UNICODE *skipped[20];
432 int cskipped = 0;
433
434 result = nfd_nfkd(input, k);
435 if (!result)
436 return NULL;
437
438 /* We are going to modify result in-place.
439 If nfd_nfkd is changed to sometimes return the input,
440 this code needs to be reviewed. */
441 assert(result != input);
442
443 i = PyUnicode_AS_UNICODE(result);
444 end = i + PyUnicode_GET_SIZE(result);
445 o = PyUnicode_AS_UNICODE(result);
446
447 again:
448 while (i < end) {
449 for (index = 0; index < cskipped; index++) {
450 if (skipped[index] == i) {
451 /* *i character is skipped.
452 Remove from list. */
453 skipped[index] = skipped[cskipped-1];
454 cskipped--;
455 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000456 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000457 }
458 }
459 /* Hangul Composition. We don't need to check for <LV,T>
460 pairs, since we always have decomposed data. */
461 if (LBase <= *i && *i < (LBase+LCount) &&
462 i + 1 < end &&
463 VBase <= i[1] && i[1] <= (VBase+VCount)) {
464 int LIndex, VIndex;
465 LIndex = i[0] - LBase;
466 VIndex = i[1] - VBase;
467 code = SBase + (LIndex*VCount+VIndex)*TCount;
468 i+=2;
469 if (i < end &&
470 TBase <= *i && *i <= (TBase+TCount)) {
471 code += *i-TBase;
472 i++;
473 }
474 *o++ = code;
475 continue;
476 }
477
478 f = find_nfc_index(nfc_first, *i);
479 if (f == -1) {
480 *o++ = *i++;
481 continue;
482 }
483 /* Find next unblocked character. */
484 i1 = i+1;
485 comb = 0;
486 while (i1 < end) {
487 int comb1 = _getrecord_ex(*i1)->combining;
488 if (comb1 && comb == comb1) {
489 /* Character is blocked. */
490 i1++;
491 continue;
492 }
493 l = find_nfc_index(nfc_last, *i1);
494 /* *i1 cannot be combined with *i. If *i1
495 is a starter, we don't need to look further.
496 Otherwise, record the combining class. */
497 if (l == -1) {
498 not_combinable:
499 if (comb1 == 0)
500 break;
501 comb = comb1;
502 i1++;
503 continue;
504 }
505 index = f*TOTAL_LAST + l;
506 index1 = comp_index[index >> COMP_SHIFT];
507 code = comp_data[(index1<<COMP_SHIFT)+
508 (index&((1<<COMP_SHIFT)-1))];
509 if (code == 0)
510 goto not_combinable;
511
512 /* Replace the original character. */
513 *i = code;
514 /* Mark the second character unused. */
515 skipped[cskipped++] = i1;
516 i1++;
517 f = find_nfc_index(nfc_first, *i);
518 if (f == -1)
519 break;
520 }
521 *o++ = *i++;
522 }
523 if (o != end)
524 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
525 return result;
526}
527
528static PyObject*
529unicodedata_normalize(PyObject *self, PyObject *args)
530{
531 char *form;
532 PyObject *input;
533
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000534 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535 &form, &PyUnicode_Type, &input))
536 return NULL;
537
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000538 if (PyUnicode_GetSize(input) == 0) {
539 /* Special case empty input strings, since resizing
540 them later would cause internal errors. */
541 Py_INCREF(input);
542 return input;
543 }
544
Martin v. Löwis677bde22002-11-23 22:08:15 +0000545 if (strcmp(form, "NFC") == 0)
546 return nfc_nfkc(input, 0);
547 if (strcmp(form, "NFKC") == 0)
548 return nfc_nfkc(input, 1);
549 if (strcmp(form, "NFD") == 0)
550 return nfd_nfkd(input, 0);
551 if (strcmp(form, "NFKD") == 0)
552 return nfd_nfkd(input, 1);
553 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
554 return NULL;
555}
556
Fredrik Lundh06d12682001-01-24 07:59:11 +0000557/* -------------------------------------------------------------------- */
558/* unicode character name tables */
559
560/* data file generated by Tools/unicode/makeunicodedata.py */
561#include "unicodename_db.h"
562
563/* -------------------------------------------------------------------- */
564/* database code (cut and pasted from the unidb package) */
565
/* Compute the case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded in as h = h * scale + byte.
   Whenever bits 24-31 of h become non-zero they are XOR-ed back into the
   low byte and h is cut to 24 bits, so the returned value is always
   below 0x1000000. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values (and EOF),
           so convert before the call; plain char may be signed. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
580
/* Name fragments used to build and parse Hangul syllable names.
   Column 0 holds the leading-consonant spellings, column 1 the vowel
   spellings and column 2 the trailing-consonant spellings; the row number
   is the respective L, V or T index.  Columns 0 and 1 are shorter than
   column 2 and are padded with null pointers. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
611
Fredrik Lundh06d12682001-01-24 07:59:11 +0000612static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000613is_unified_ideograph(Py_UCS4 code)
614{
615 return (
616 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
617 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
618 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
619}
620
621static int
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000622_getucname(Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000623{
624 int offset;
625 int i;
626 int word;
627 unsigned char* w;
628
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000629 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000630 /* Hangul syllable. */
631 int SIndex = code - SBase;
632 int L = SIndex / NCount;
633 int V = (SIndex % NCount) / TCount;
634 int T = SIndex % TCount;
635
636 if (buflen < 27)
637 /* Worst case: HANGUL SYLLABLE <10chars>. */
638 return 0;
639 strcpy(buffer, "HANGUL SYLLABLE ");
640 buffer += 16;
641 strcpy(buffer, hangul_syllables[L][0]);
642 buffer += strlen(hangul_syllables[L][0]);
643 strcpy(buffer, hangul_syllables[V][1]);
644 buffer += strlen(hangul_syllables[V][1]);
645 strcpy(buffer, hangul_syllables[T][2]);
646 buffer += strlen(hangul_syllables[T][2]);
647 *buffer = '\0';
648 return 1;
649 }
650
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000651 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000652 if (buflen < 28)
653 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
654 return 0;
655 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
656 return 1;
657 }
658
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000659 if (code >= 0x110000)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000660 return 0;
661
662 /* get offset into phrasebook */
663 offset = phrasebook_offset1[(code>>phrasebook_shift)];
664 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
665 (code&((1<<phrasebook_shift)-1))];
666 if (!offset)
667 return 0;
668
669 i = 0;
670
671 for (;;) {
672 /* get word index */
673 word = phrasebook[offset] - phrasebook_short;
674 if (word >= 0) {
675 word = (word << 8) + phrasebook[offset+1];
676 offset += 2;
677 } else
678 word = phrasebook[offset++];
679 if (i) {
680 if (i > buflen)
681 return 0; /* buffer overflow */
682 buffer[i++] = ' ';
683 }
684 /* copy word string from lexicon. the last character in the
685 word has bit 7 set. the last word in a string ends with
686 0x80 */
687 w = lexicon + lexicon_offset[word];
688 while (*w < 128) {
689 if (i >= buflen)
690 return 0; /* buffer overflow */
691 buffer[i++] = *w++;
692 }
693 if (i >= buflen)
694 return 0; /* buffer overflow */
695 buffer[i++] = *w & 127;
696 if (*w == 128)
697 break; /* end of word */
698 }
699
700 return 1;
701}
702
703static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000704_cmpname(int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000705{
706 /* check if code corresponds to the given name */
707 int i;
708 char buffer[NAME_MAXLEN];
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000709 if (!_getucname(code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000710 return 0;
711 for (i = 0; i < namelen; i++) {
712 if (toupper(name[i]) != buffer[i])
713 return 0;
714 }
715 return buffer[namelen] == '\0';
716}
717
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000718static void
719find_syllable(const char *str, int *len, int *pos, int count, int column)
720{
721 int i, len1;
722 *len = -1;
723 for (i = 0; i < count; i++) {
724 char *s = hangul_syllables[i][column];
725 len1 = strlen(s);
726 if (len1 <= *len)
727 continue;
728 if (strncmp(str, s, len1) == 0) {
729 *len = len1;
730 *pos = i;
731 }
732 }
733 if (*len == -1) {
734 *len = 0;
735 *pos = -1;
736 }
737}
738
Fredrik Lundh06d12682001-01-24 07:59:11 +0000739static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000740_getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000741{
742 unsigned int h, v;
743 unsigned int mask = code_size-1;
744 unsigned int i, incr;
745
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000746 /* Check for hangul syllables. */
747 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
748 int L, V, T, len;
749 const char *pos = name + 16;
750 find_syllable(pos, &len, &L, LCount, 0);
751 pos += len;
752 find_syllable(pos, &len, &V, VCount, 1);
753 pos += len;
754 find_syllable(pos, &len, &T, TCount, 2);
755 pos += len;
756 if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
757 *code = SBase + (L*VCount+V)*TCount + T;
758 return 1;
759 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000760 /* Otherwise, it's an illegal syllable name. */
761 return 0;
762 }
763
764 /* Check for unified ideographs. */
765 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
766 /* Four or five hexdigits must follow. */
767 v = 0;
768 name += 22;
769 namelen -= 22;
770 if (namelen != 4 && namelen != 5)
771 return 0;
772 while (namelen--) {
773 v *= 16;
774 if (*name >= '0' && *name <= '9')
775 v += *name - '0';
776 else if (*name >= 'A' && *name <= 'F')
777 v += *name - 'A' + 10;
778 else
779 return 0;
780 name++;
781 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000782 if (!is_unified_ideograph(v))
783 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000784 *code = v;
785 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000786 }
787
Fredrik Lundh06d12682001-01-24 07:59:11 +0000788 /* the following is the same as python's dictionary lookup, with
789 only minor changes. see the makeunicodedata script for more
790 details */
791
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000792 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000793 i = (~h) & mask;
794 v = code_hash[i];
795 if (!v)
796 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000797 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000798 *code = v;
799 return 1;
800 }
801 incr = (h ^ (h >> 3)) & mask;
802 if (!incr)
803 incr = mask;
804 for (;;) {
805 i = (i + incr) & mask;
806 v = code_hash[i];
807 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +0000808 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000809 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000810 *code = v;
811 return 1;
812 }
813 incr = incr << 1;
814 if (incr > mask)
815 incr = incr ^ code_poly;
816 }
817}
818
819static const _PyUnicode_Name_CAPI hashAPI =
820{
821 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000822 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000823 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +0000824};
825
826/* -------------------------------------------------------------------- */
827/* Python bindings */
828
829static PyObject *
830unicodedata_name(PyObject* self, PyObject* args)
831{
832 char name[NAME_MAXLEN];
833
834 PyUnicodeObject* v;
835 PyObject* defobj = NULL;
836 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
837 return NULL;
838
839 if (PyUnicode_GET_SIZE(v) != 1) {
840 PyErr_SetString(PyExc_TypeError,
841 "need a single Unicode character as parameter");
842 return NULL;
843 }
844
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000845 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000846 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000847 if (defobj == NULL) {
848 PyErr_SetString(PyExc_ValueError, "no such name");
849 return NULL;
850 }
851 else {
852 Py_INCREF(defobj);
853 return defobj;
854 }
855 }
856
857 return Py_BuildValue("s", name);
858}
859
860static PyObject *
861unicodedata_lookup(PyObject* self, PyObject* args)
862{
863 Py_UCS4 code;
864 Py_UNICODE str[1];
865
866 char* name;
867 int namelen;
868 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
869 return NULL;
870
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000871 if (!_getcode(name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000872 char fmt[] = "undefined character name '%s'";
873 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
874 sprintf(buf, fmt, name);
875 PyErr_SetString(PyExc_KeyError, buf);
876 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000877 return NULL;
878 }
879
880 str[0] = (Py_UNICODE) code;
881 return PyUnicode_FromUnicode(str, 1);
882}
883
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000884/* XXX Add doc strings. */
885
886static PyMethodDef unicodedata_functions[] = {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000887 {"decimal", unicodedata_decimal, METH_VARARGS},
888 {"digit", unicodedata_digit, METH_VARARGS},
889 {"numeric", unicodedata_numeric, METH_VARARGS},
890 {"category", unicodedata_category, METH_VARARGS},
891 {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
892 {"combining", unicodedata_combining, METH_VARARGS},
893 {"mirrored", unicodedata_mirrored, METH_VARARGS},
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000894 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS},
Fredrik Lundh06d12682001-01-24 07:59:11 +0000895 {"decomposition",unicodedata_decomposition, METH_VARARGS},
896 {"name", unicodedata_name, METH_VARARGS},
897 {"lookup", unicodedata_lookup, METH_VARARGS},
Martin v. Löwis677bde22002-11-23 22:08:15 +0000898 {"normalize", unicodedata_normalize, METH_VARARGS},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000899 {NULL, NULL} /* sentinel */
900};
901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +0000902PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
Fredrik Lundh06d12682001-01-24 07:59:11 +0000903
Mark Hammond62b1ab12002-07-23 06:31:15 +0000904PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000905initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000906{
Fred Drakea2bd8d32002-04-03 21:39:26 +0000907 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +0000908
Fred Drakef585bef2001-03-03 19:41:55 +0000909 m = Py_InitModule3(
910 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000911 if (!m)
912 return;
913
Martin v. Löwisb5c980b2002-11-25 09:13:37 +0000914 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
915
Fredrik Lundh06d12682001-01-24 07:59:11 +0000916 /* Export C API */
917 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +0000918 if (v != NULL)
919 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000920}
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000921
922/*
923Local variables:
924c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +0000925indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000926End:
927*/