blob: 06e5f04c05cac629189f2e32867d674a0e300b80 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.0 data base.

   Data was extracted from the Unicode 3.0 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
13
14#include "Python.h"
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000015
/* One entry of the per-character property table.  Every field is a
   single unsigned byte. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
24
25/* data file generated by Tools/unicode/makeunicodedata.py */
26#include "unicodedata_db.h"
27
28static const _PyUnicode_DatabaseRecord*
29getrecord(PyUnicodeObject* v)
30{
31 int code;
32 int index;
33
34 code = (int) *PyUnicode_AS_UNICODE(v);
35
36 if (code < 0 || code >= 65536)
37 index = 0;
38 else {
39 index = index1[(code>>SHIFT)];
40 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
41 }
42
43 return &_PyUnicode_Database_Records[index];
44}
45
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000046/* --- Module API --------------------------------------------------------- */
47
48static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000050{
51 PyUnicodeObject *v;
52 PyObject *defobj = NULL;
53 long rc;
54
55 if (!PyArg_ParseTuple(args, "O!|O:decimal",
56 &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000058 if (PyUnicode_GET_SIZE(v) != 1) {
59 PyErr_SetString(PyExc_TypeError,
60 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000061 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000062 }
63 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
64 if (rc < 0) {
65 if (defobj == NULL) {
66 PyErr_SetString(PyExc_ValueError,
67 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000068 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000069 }
70 else {
71 Py_INCREF(defobj);
72 return defobj;
73 }
74 }
75 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000076}
77
78static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000079unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000080{
81 PyUnicodeObject *v;
82 PyObject *defobj = NULL;
83 long rc;
84
85 if (!PyArg_ParseTuple(args, "O!|O:digit",
86 &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000087 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000088 if (PyUnicode_GET_SIZE(v) != 1) {
89 PyErr_SetString(PyExc_TypeError,
90 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000091 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000092 }
93 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
94 if (rc < 0) {
95 if (defobj == NULL) {
96 PyErr_SetString(PyExc_ValueError,
97 "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000098 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000099 }
100 else {
101 Py_INCREF(defobj);
102 return defobj;
103 }
104 }
105 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000106}
107
108static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000109unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110{
111 PyUnicodeObject *v;
112 PyObject *defobj = NULL;
113 double rc;
114
115 if (!PyArg_ParseTuple(args, "O!|O:numeric",
116 &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118 if (PyUnicode_GET_SIZE(v) != 1) {
119 PyErr_SetString(PyExc_TypeError,
120 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000121 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 }
123 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
124 if (rc < 0) {
125 if (defobj == NULL) {
126 PyErr_SetString(PyExc_ValueError,
127 "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000128 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000129 }
130 else {
131 Py_INCREF(defobj);
132 return defobj;
133 }
134 }
135 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000136}
137
138static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000139unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000140{
141 PyUnicodeObject *v;
142 int index;
143
144 if (!PyArg_ParseTuple(args, "O!:category",
145 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000146 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (PyUnicode_GET_SIZE(v) != 1) {
148 PyErr_SetString(PyExc_TypeError,
149 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000152 index = (int) getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000154}
155
156static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158{
159 PyUnicodeObject *v;
160 int index;
161
162 if (!PyArg_ParseTuple(args, "O!:bidirectional",
163 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000164 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165 if (PyUnicode_GET_SIZE(v) != 1) {
166 PyErr_SetString(PyExc_TypeError,
167 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000170 index = (int) getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172}
173
174static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000175unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000176{
177 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178
179 if (!PyArg_ParseTuple(args, "O!:combining",
180 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000181 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (PyUnicode_GET_SIZE(v) != 1) {
183 PyErr_SetString(PyExc_TypeError,
184 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000187 return PyInt_FromLong((int) getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000188}
189
190static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000191unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192{
193 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000194
195 if (!PyArg_ParseTuple(args, "O!:mirrored",
196 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000197 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000198 if (PyUnicode_GET_SIZE(v) != 1) {
199 PyErr_SetString(PyExc_TypeError,
200 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000201 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203 return PyInt_FromLong((int) getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204}
205
206static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000207unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208{
209 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000210 char decomp[256];
211 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000212
213 if (!PyArg_ParseTuple(args, "O!:decomposition",
214 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000215 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000216 if (PyUnicode_GET_SIZE(v) != 1) {
217 PyErr_SetString(PyExc_TypeError,
218 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000219 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000220 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000221
222 code = (int) *PyUnicode_AS_UNICODE(v);
223
224 if (code < 0 || code >= 65536)
225 index = 0;
226 else {
227 index = decomp_index1[(code>>DECOMP_SHIFT)];
228 index = decomp_index2[(index<<DECOMP_SHIFT)+
229 (code&((1<<DECOMP_SHIFT)-1))];
230 }
231
232 /* high byte is of hex bytes (usually one or two), low byte
233 is prefix code (from*/
234 count = decomp_data[index] >> 8;
235
236 /* XXX: could allocate the PyString up front instead
237 (strlen(prefix) + 5 * count + 1 bytes) */
238
239 /* copy prefix */
240 i = strlen(decomp_prefix[decomp_data[index] & 255]);
241 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
242
243 while (count-- > 0) {
244 if (i)
245 decomp[i++] = ' ';
246 sprintf(decomp + i, "%04X", decomp_data[++index]);
247 i += strlen(decomp + i);
248 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000249
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000250 decomp[i] = '\0';
251
252 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253}
254
255/* XXX Add doc strings. */
256
257static PyMethodDef unicodedata_functions[] = {
258 {"decimal", unicodedata_decimal, 1},
259 {"digit", unicodedata_digit, 1},
260 {"numeric", unicodedata_numeric, 1},
261 {"category", unicodedata_category, 1},
262 {"bidirectional", unicodedata_bidirectional, 1},
263 {"combining", unicodedata_combining, 1},
264 {"mirrored", unicodedata_mirrored, 1},
265 {"decomposition", unicodedata_decomposition, 1},
266 {NULL, NULL} /* sentinel */
267};
268
269DL_EXPORT(void)
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000270initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271{
272 Py_InitModule("unicodedata", unicodedata_functions);
273}