blob: d5a1d176e3f30bb017b9625c276856a91c4744ad [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.0 database.

   Data was extracted from the Unicode 3.0 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
13
14#include "Python.h"
15#include "unicodedatabase.h"
16
/* Per-character property record; getrecord() returns a pointer to one
   of these from the _PyUnicode_Database_Records table. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;

    /* combining class value 0 - 255 */
    const unsigned char combining;

    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;

    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
25
26/* data file generated by Tools/unicode/makeunicodedata.py */
27#include "unicodedata_db.h"
28
29static const _PyUnicode_DatabaseRecord*
30getrecord(PyUnicodeObject* v)
31{
32 int code;
33 int index;
34
35 code = (int) *PyUnicode_AS_UNICODE(v);
36
37 if (code < 0 || code >= 65536)
38 index = 0;
39 else {
40 index = index1[(code>>SHIFT)];
41 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
42 }
43
44 return &_PyUnicode_Database_Records[index];
45}
46
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000047/* --- Module API --------------------------------------------------------- */
48
49static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000051{
52 PyUnicodeObject *v;
53 PyObject *defobj = NULL;
54 long rc;
55
56 if (!PyArg_ParseTuple(args, "O!|O:decimal",
57 &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000058 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000059 if (PyUnicode_GET_SIZE(v) != 1) {
60 PyErr_SetString(PyExc_TypeError,
61 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000062 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000063 }
64 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
65 if (rc < 0) {
66 if (defobj == NULL) {
67 PyErr_SetString(PyExc_ValueError,
68 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000069 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000070 }
71 else {
72 Py_INCREF(defobj);
73 return defobj;
74 }
75 }
76 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000077}
78
79static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000080unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000081{
82 PyUnicodeObject *v;
83 PyObject *defobj = NULL;
84 long rc;
85
86 if (!PyArg_ParseTuple(args, "O!|O:digit",
87 &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000088 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000089 if (PyUnicode_GET_SIZE(v) != 1) {
90 PyErr_SetString(PyExc_TypeError,
91 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000092 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000093 }
94 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
95 if (rc < 0) {
96 if (defobj == NULL) {
97 PyErr_SetString(PyExc_ValueError,
98 "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000099 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000100 }
101 else {
102 Py_INCREF(defobj);
103 return defobj;
104 }
105 }
106 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107}
108
109static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000110unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000111{
112 PyUnicodeObject *v;
113 PyObject *defobj = NULL;
114 double rc;
115
116 if (!PyArg_ParseTuple(args, "O!|O:numeric",
117 &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119 if (PyUnicode_GET_SIZE(v) != 1) {
120 PyErr_SetString(PyExc_TypeError,
121 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000122 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 }
124 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
125 if (rc < 0) {
126 if (defobj == NULL) {
127 PyErr_SetString(PyExc_ValueError,
128 "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000130 }
131 else {
132 Py_INCREF(defobj);
133 return defobj;
134 }
135 }
136 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000137}
138
139static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000140unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000141{
142 PyUnicodeObject *v;
143 int index;
144
145 if (!PyArg_ParseTuple(args, "O!:category",
146 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000147 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000148 if (PyUnicode_GET_SIZE(v) != 1) {
149 PyErr_SetString(PyExc_TypeError,
150 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000152 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000153 index = (int) getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000154 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000155}
156
157static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000158unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159{
160 PyUnicodeObject *v;
161 int index;
162
163 if (!PyArg_ParseTuple(args, "O!:bidirectional",
164 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000166 if (PyUnicode_GET_SIZE(v) != 1) {
167 PyErr_SetString(PyExc_TypeError,
168 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171 index = (int) getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000173}
174
175static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177{
178 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000179
180 if (!PyArg_ParseTuple(args, "O!:combining",
181 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000182 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183 if (PyUnicode_GET_SIZE(v) != 1) {
184 PyErr_SetString(PyExc_TypeError,
185 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000188 return PyInt_FromLong((int) getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189}
190
191static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193{
194 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195
196 if (!PyArg_ParseTuple(args, "O!:mirrored",
197 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000198 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000199 if (PyUnicode_GET_SIZE(v) != 1) {
200 PyErr_SetString(PyExc_TypeError,
201 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 return PyInt_FromLong((int) getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205}
206
207static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209{
210 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 char decomp[256];
212 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213
214 if (!PyArg_ParseTuple(args, "O!:decomposition",
215 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000216 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000217 if (PyUnicode_GET_SIZE(v) != 1) {
218 PyErr_SetString(PyExc_TypeError,
219 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000220 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000222
223 code = (int) *PyUnicode_AS_UNICODE(v);
224
225 if (code < 0 || code >= 65536)
226 index = 0;
227 else {
228 index = decomp_index1[(code>>DECOMP_SHIFT)];
229 index = decomp_index2[(index<<DECOMP_SHIFT)+
230 (code&((1<<DECOMP_SHIFT)-1))];
231 }
232
233 /* high byte is of hex bytes (usually one or two), low byte
234 is prefix code (from*/
235 count = decomp_data[index] >> 8;
236
237 /* XXX: could allocate the PyString up front instead
238 (strlen(prefix) + 5 * count + 1 bytes) */
239
240 /* copy prefix */
241 i = strlen(decomp_prefix[decomp_data[index] & 255]);
242 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
243
244 while (count-- > 0) {
245 if (i)
246 decomp[i++] = ' ';
247 sprintf(decomp + i, "%04X", decomp_data[++index]);
248 i += strlen(decomp + i);
249 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251 decomp[i] = '\0';
252
253 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254}
255
256/* XXX Add doc strings. */
257
258static PyMethodDef unicodedata_functions[] = {
259 {"decimal", unicodedata_decimal, 1},
260 {"digit", unicodedata_digit, 1},
261 {"numeric", unicodedata_numeric, 1},
262 {"category", unicodedata_category, 1},
263 {"bidirectional", unicodedata_bidirectional, 1},
264 {"combining", unicodedata_combining, 1},
265 {"mirrored", unicodedata_mirrored, 1},
266 {"decomposition", unicodedata_decomposition, 1},
267 {NULL, NULL} /* sentinel */
268};
269
270DL_EXPORT(void)
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000271initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272{
273 Py_InitModule("unicodedata", unicodedata_functions);
274}