Antoine Pitrou | 0a3229d | 2011-11-21 20:39:13 +0100 | [diff] [blame] | 1 | /* stringlib: codec implementations */ |
| 2 | |
| 3 | #if STRINGLIB_IS_UNICODE |
| 4 | |
| 5 | /* Mask to check or force alignment of a pointer to C 'long' boundaries */ |
| 6 | #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) |
| 7 | |
| 8 | /* Mask to quickly check whether a C 'long' contains a |
| 9 | non-ASCII, UTF8-encoded char. */ |
| 10 | #if (SIZEOF_LONG == 8) |
| 11 | # define ASCII_CHAR_MASK 0x8080808080808080L |
| 12 | #elif (SIZEOF_LONG == 4) |
| 13 | # define ASCII_CHAR_MASK 0x80808080L |
| 14 | #else |
| 15 | # error C 'long' size should be either 4 or 8! |
| 16 | #endif |
| 17 | |
| 18 | Py_LOCAL_INLINE(int) |
| 19 | STRINGLIB(utf8_try_decode)(const char *start, const char *end, |
| 20 | STRINGLIB_CHAR *dest, |
| 21 | const char **src_pos, Py_ssize_t *dest_index) |
| 22 | { |
| 23 | int ret; |
| 24 | Py_ssize_t n; |
| 25 | const char *s = start; |
| 26 | const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK); |
| 27 | STRINGLIB_CHAR *p = dest; |
| 28 | |
| 29 | while (s < end) { |
| 30 | Py_UCS4 ch = (unsigned char)*s; |
| 31 | |
| 32 | if (ch < 0x80) { |
| 33 | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
| 34 | input will consist of an overwhelming majority of ASCII |
| 35 | characters, we try to optimize for this case by checking |
| 36 | as many characters as a C 'long' can contain. |
| 37 | First, check if we can do an aligned read, as most CPUs have |
| 38 | a penalty for unaligned reads. |
| 39 | */ |
| 40 | if (!((size_t) s & LONG_PTR_MASK)) { |
| 41 | /* Help register allocation */ |
| 42 | register const char *_s = s; |
| 43 | register STRINGLIB_CHAR *_p = p; |
| 44 | while (_s < aligned_end) { |
| 45 | /* Read a whole long at a time (either 4 or 8 bytes), |
| 46 | and do a fast unrolled copy if it only contains ASCII |
| 47 | characters. */ |
| 48 | unsigned long value = *(unsigned long *) _s; |
| 49 | if (value & ASCII_CHAR_MASK) |
| 50 | break; |
| 51 | _p[0] = _s[0]; |
| 52 | _p[1] = _s[1]; |
| 53 | _p[2] = _s[2]; |
| 54 | _p[3] = _s[3]; |
| 55 | #if (SIZEOF_LONG == 8) |
| 56 | _p[4] = _s[4]; |
| 57 | _p[5] = _s[5]; |
| 58 | _p[6] = _s[6]; |
| 59 | _p[7] = _s[7]; |
| 60 | #endif |
| 61 | _s += SIZEOF_LONG; |
| 62 | _p += SIZEOF_LONG; |
| 63 | } |
| 64 | s = _s; |
| 65 | p = _p; |
| 66 | if (s == end) |
| 67 | break; |
| 68 | ch = (unsigned char)*s; |
| 69 | } |
| 70 | } |
| 71 | |
| 72 | if (ch < 0x80) { |
| 73 | s++; |
| 74 | *p++ = ch; |
| 75 | continue; |
| 76 | } |
| 77 | |
| 78 | n = utf8_code_length[ch]; |
| 79 | |
| 80 | if (s + n > end) { |
| 81 | /* unexpected end of data: the caller will decide whether |
| 82 | it's an error or not */ |
| 83 | goto _error; |
| 84 | } |
| 85 | |
| 86 | switch (n) { |
| 87 | case 0: |
| 88 | /* invalid start byte */ |
| 89 | goto _error; |
| 90 | case 1: |
| 91 | /* internal error */ |
| 92 | goto _error; |
| 93 | case 2: |
| 94 | if ((s[1] & 0xc0) != 0x80) |
| 95 | /* invalid continuation byte */ |
| 96 | goto _error; |
| 97 | ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); |
| 98 | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
| 99 | s += 2; |
| 100 | *p++ = ch; |
| 101 | break; |
| 102 | |
| 103 | case 3: |
| 104 | /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf |
| 105 | will result in surrogates in range d800-dfff. Surrogates are |
| 106 | not valid UTF-8 so they are rejected. |
| 107 | See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
| 108 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
| 109 | if ((s[1] & 0xc0) != 0x80 || |
| 110 | (s[2] & 0xc0) != 0x80 || |
| 111 | ((unsigned char)s[0] == 0xE0 && |
| 112 | (unsigned char)s[1] < 0xA0) || |
| 113 | ((unsigned char)s[0] == 0xED && |
| 114 | (unsigned char)s[1] > 0x9F)) { |
| 115 | /* invalid continuation byte */ |
| 116 | goto _error; |
| 117 | } |
| 118 | ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); |
| 119 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
| 120 | s += 3; |
| 121 | *p++ = ch; |
| 122 | break; |
| 123 | |
| 124 | case 4: |
| 125 | if ((s[1] & 0xc0) != 0x80 || |
| 126 | (s[2] & 0xc0) != 0x80 || |
| 127 | (s[3] & 0xc0) != 0x80 || |
| 128 | ((unsigned char)s[0] == 0xF0 && |
| 129 | (unsigned char)s[1] < 0x90) || |
| 130 | ((unsigned char)s[0] == 0xF4 && |
| 131 | (unsigned char)s[1] > 0x8F)) { |
| 132 | /* invalid continuation byte */ |
| 133 | goto _error; |
| 134 | } |
| 135 | ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + |
| 136 | ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); |
| 137 | assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); |
| 138 | s += 4; |
| 139 | *p++ = ch; |
| 140 | break; |
| 141 | } |
| 142 | } |
| 143 | ret = 0; |
| 144 | goto _ok; |
| 145 | _error: |
| 146 | ret = -1; |
| 147 | _ok: |
| 148 | *src_pos = s; |
| 149 | *dest_index = p - dest; |
| 150 | return ret; |
| 151 | } |
| 152 | |
| 153 | #undef LONG_PTR_MASK |
| 154 | #undef ASCII_CHAR_MASK |
| 155 | |
Victor Stinner | 6099a03 | 2011-12-18 14:22:26 +0100 | [diff] [blame] | 156 | |
| 157 | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
| 158 | PyUnicode_READ() macro. Delete some parts of the code depending on the kind: |
| 159 | UCS-1 strings don't need to handle surrogates for example. */ |
| 160 | Py_LOCAL_INLINE(PyObject *) |
| 161 | STRINGLIB(utf8_encoder)(PyObject *unicode, |
| 162 | STRINGLIB_CHAR *data, |
| 163 | Py_ssize_t size, |
| 164 | const char *errors) |
| 165 | { |
| 166 | #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ |
| 167 | |
| 168 | Py_ssize_t i; /* index into s of next input byte */ |
| 169 | PyObject *result; /* result string object */ |
| 170 | char *p; /* next free byte in output buffer */ |
| 171 | Py_ssize_t nallocated; /* number of result bytes allocated */ |
| 172 | Py_ssize_t nneeded; /* number of result bytes needed */ |
| 173 | #if STRINGLIB_SIZEOF_CHAR > 1 |
| 174 | PyObject *errorHandler = NULL; |
| 175 | PyObject *exc = NULL; |
| 176 | PyObject *rep = NULL; |
| 177 | #endif |
| 178 | #if STRINGLIB_SIZEOF_CHAR == 1 |
| 179 | const Py_ssize_t max_char_size = 2; |
| 180 | char stackbuf[MAX_SHORT_UNICHARS * 2]; |
| 181 | #elif STRINGLIB_SIZEOF_CHAR == 2 |
| 182 | const Py_ssize_t max_char_size = 3; |
| 183 | char stackbuf[MAX_SHORT_UNICHARS * 3]; |
| 184 | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
| 185 | const Py_ssize_t max_char_size = 4; |
| 186 | char stackbuf[MAX_SHORT_UNICHARS * 4]; |
| 187 | #endif |
| 188 | |
| 189 | assert(size >= 0); |
| 190 | |
| 191 | if (size <= MAX_SHORT_UNICHARS) { |
| 192 | /* Write into the stack buffer; nallocated can't overflow. |
| 193 | * At the end, we'll allocate exactly as much heap space as it |
| 194 | * turns out we need. |
| 195 | */ |
| 196 | nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); |
| 197 | result = NULL; /* will allocate after we're done */ |
| 198 | p = stackbuf; |
| 199 | } |
| 200 | else { |
| 201 | if (size > PY_SSIZE_T_MAX / max_char_size) { |
| 202 | /* integer overflow */ |
| 203 | return PyErr_NoMemory(); |
| 204 | } |
| 205 | /* Overallocate on the heap, and give the excess back at the end. */ |
| 206 | nallocated = size * max_char_size; |
| 207 | result = PyBytes_FromStringAndSize(NULL, nallocated); |
| 208 | if (result == NULL) |
| 209 | return NULL; |
| 210 | p = PyBytes_AS_STRING(result); |
| 211 | } |
| 212 | |
| 213 | for (i = 0; i < size;) { |
| 214 | Py_UCS4 ch = data[i++]; |
| 215 | |
| 216 | if (ch < 0x80) { |
| 217 | /* Encode ASCII */ |
| 218 | *p++ = (char) ch; |
| 219 | |
| 220 | } |
| 221 | else |
| 222 | #if STRINGLIB_SIZEOF_CHAR > 1 |
| 223 | if (ch < 0x0800) |
| 224 | #endif |
| 225 | { |
| 226 | /* Encode Latin-1 */ |
| 227 | *p++ = (char)(0xc0 | (ch >> 6)); |
| 228 | *p++ = (char)(0x80 | (ch & 0x3f)); |
| 229 | } |
| 230 | #if STRINGLIB_SIZEOF_CHAR > 1 |
| 231 | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
| 232 | Py_ssize_t newpos; |
| 233 | Py_ssize_t repsize, k, startpos; |
| 234 | startpos = i-1; |
| 235 | rep = unicode_encode_call_errorhandler( |
| 236 | errors, &errorHandler, "utf-8", "surrogates not allowed", |
| 237 | unicode, &exc, startpos, startpos+1, &newpos); |
| 238 | if (!rep) |
| 239 | goto error; |
| 240 | |
| 241 | if (PyBytes_Check(rep)) |
| 242 | repsize = PyBytes_GET_SIZE(rep); |
| 243 | else |
| 244 | repsize = PyUnicode_GET_LENGTH(rep); |
| 245 | |
| 246 | if (repsize > max_char_size) { |
| 247 | Py_ssize_t offset; |
| 248 | |
| 249 | if (result == NULL) |
| 250 | offset = p - stackbuf; |
| 251 | else |
| 252 | offset = p - PyBytes_AS_STRING(result); |
| 253 | |
| 254 | if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) { |
| 255 | /* integer overflow */ |
| 256 | PyErr_NoMemory(); |
| 257 | goto error; |
| 258 | } |
| 259 | nallocated += repsize - max_char_size; |
| 260 | if (result != NULL) { |
| 261 | if (_PyBytes_Resize(&result, nallocated) < 0) |
| 262 | goto error; |
| 263 | } else { |
| 264 | result = PyBytes_FromStringAndSize(NULL, nallocated); |
| 265 | if (result == NULL) |
| 266 | goto error; |
| 267 | Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); |
| 268 | } |
| 269 | p = PyBytes_AS_STRING(result) + offset; |
| 270 | } |
| 271 | |
| 272 | if (PyBytes_Check(rep)) { |
| 273 | char *prep = PyBytes_AS_STRING(rep); |
| 274 | for(k = repsize; k > 0; k--) |
| 275 | *p++ = *prep++; |
| 276 | } else /* rep is unicode */ { |
| 277 | enum PyUnicode_Kind repkind; |
| 278 | void *repdata; |
| 279 | |
| 280 | if (PyUnicode_READY(rep) < 0) |
| 281 | goto error; |
| 282 | repkind = PyUnicode_KIND(rep); |
| 283 | repdata = PyUnicode_DATA(rep); |
| 284 | |
| 285 | for(k=0; k<repsize; k++) { |
| 286 | Py_UCS4 c = PyUnicode_READ(repkind, repdata, k); |
| 287 | if (0x80 <= c) { |
| 288 | raise_encode_exception(&exc, "utf-8", |
| 289 | unicode, |
| 290 | i-1, i, |
| 291 | "surrogates not allowed"); |
| 292 | goto error; |
| 293 | } |
| 294 | *p++ = (char)c; |
| 295 | } |
| 296 | } |
| 297 | Py_CLEAR(rep); |
| 298 | } |
| 299 | else |
| 300 | #if STRINGLIB_SIZEOF_CHAR > 2 |
| 301 | if (ch < 0x10000) |
| 302 | #endif |
| 303 | { |
| 304 | *p++ = (char)(0xe0 | (ch >> 12)); |
| 305 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
| 306 | *p++ = (char)(0x80 | (ch & 0x3f)); |
| 307 | } |
| 308 | #if STRINGLIB_SIZEOF_CHAR > 2 |
| 309 | else /* ch >= 0x10000 */ |
| 310 | { |
| 311 | assert(ch <= MAX_UNICODE); |
| 312 | /* Encode UCS4 Unicode ordinals */ |
| 313 | *p++ = (char)(0xf0 | (ch >> 18)); |
| 314 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
| 315 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
| 316 | *p++ = (char)(0x80 | (ch & 0x3f)); |
| 317 | } |
| 318 | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
| 319 | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
| 320 | } |
| 321 | |
| 322 | if (result == NULL) { |
| 323 | /* This was stack allocated. */ |
| 324 | nneeded = p - stackbuf; |
| 325 | assert(nneeded <= nallocated); |
| 326 | result = PyBytes_FromStringAndSize(stackbuf, nneeded); |
| 327 | } |
| 328 | else { |
| 329 | /* Cut back to size actually needed. */ |
| 330 | nneeded = p - PyBytes_AS_STRING(result); |
| 331 | assert(nneeded <= nallocated); |
| 332 | _PyBytes_Resize(&result, nneeded); |
| 333 | } |
| 334 | |
| 335 | #if STRINGLIB_SIZEOF_CHAR > 1 |
| 336 | Py_XDECREF(errorHandler); |
| 337 | Py_XDECREF(exc); |
| 338 | #endif |
| 339 | return result; |
| 340 | |
| 341 | #if STRINGLIB_SIZEOF_CHAR > 1 |
| 342 | error: |
| 343 | Py_XDECREF(rep); |
| 344 | Py_XDECREF(errorHandler); |
| 345 | Py_XDECREF(exc); |
| 346 | Py_XDECREF(result); |
| 347 | return NULL; |
| 348 | #endif |
| 349 | |
| 350 | #undef MAX_SHORT_UNICHARS |
| 351 | } |
| 352 | |
Antoine Pitrou | 0a3229d | 2011-11-21 20:39:13 +0100 | [diff] [blame] | 353 | #endif /* STRINGLIB_IS_UNICODE */ |