Blame - Objects/stringlib/codecs.h - platform/external/python/cpython3

blob: f353367013a61b4c776676c81bc1b2999b50f654 [file] [log] [blame]

Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	1	/* stringlib: codec implementations */
				2
				3	#if STRINGLIB_IS_UNICODE
				4
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	5	/* Mask to quickly check whether a C 'long' contains a
				6	non-ASCII, UTF8-encoded char. */
				7	#if (SIZEOF_LONG == 8)
Mark Dickinson	01ac8b6	2012-07-07 14:08:48 +0200	[diff] [blame]	8	# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	9	#elif (SIZEOF_LONG == 4)
Mark Dickinson	01ac8b6	2012-07-07 14:08:48 +0200	[diff] [blame]	10	# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	11	#else
				12	# error C 'long' size should be either 4 or 8!
				13	#endif
				14
/* True for a UTF-8 continuation byte, i.e. one of the form 10xxxxxx
   (0x80..0xBF).  Note that the argument is evaluated twice. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
				17
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	18	Py_LOCAL_INLINE(Py_UCS4)
				19	STRINGLIB(utf8_decode)(const char *inptr, const char end,
				20	STRINGLIB_CHAR *dest,
				21	Py_ssize_t *outpos)
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	22	{
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	23	Py_UCS4 ch;
				24	const char s = inptr;
Antoine Pitrou	ca8aa4a	2012-09-20 20:56:47 +0200	[diff] [blame]	25	const char aligned_end = (const char ) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	26	STRINGLIB_CHAR p = dest + outpos;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	27
				28	while (s < end) {
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	29	ch = (unsigned char)*s;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	30
				31	if (ch < 0x80) {
				32	/* Fast path for runs of ASCII characters. Given that common UTF-8
				33	input will consist of an overwhelming majority of ASCII
				34	characters, we try to optimize for this case by checking
				35	as many characters as a C 'long' can contain.
				36	First, check if we can do an aligned read, as most CPUs have
				37	a penalty for unaligned reads.
				38	*/
Antoine Pitrou	ca8aa4a	2012-09-20 20:56:47 +0200	[diff] [blame]	39	if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	40	/* Help register allocation */
				41	register const char *_s = s;
				42	register STRINGLIB_CHAR *_p = p;
				43	while (_s < aligned_end) {
				44	/* Read a whole long at a time (either 4 or 8 bytes),
				45	and do a fast unrolled copy if it only contains ASCII
				46	characters. */
				47	unsigned long value = (unsigned long ) _s;
				48	if (value & ASCII_CHAR_MASK)
				49	break;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	50	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				51	_p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
				52	_p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
				53	_p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
				54	_p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
				55	# if SIZEOF_LONG == 8
				56	_p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
				57	_p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
				58	_p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
				59	_p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
				60	# endif
				61	#else
				62	# if SIZEOF_LONG == 8
				63	_p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
				64	_p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
				65	_p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
				66	_p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
				67	_p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
				68	_p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
				69	_p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
				70	_p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
				71	# else
				72	_p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
				73	_p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
				74	_p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
				75	_p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
				76	# endif
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	77	#endif
				78	_s += SIZEOF_LONG;
				79	_p += SIZEOF_LONG;
				80	}
				81	s = _s;
				82	p = _p;
				83	if (s == end)
				84	break;
				85	ch = (unsigned char)*s;
				86	}
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	87	if (ch < 0x80) {
				88	s++;
				89	*p++ = ch;
				90	continue;
				91	}
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	92	}
				93
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	94	if (ch < 0xE0) {
				95	/* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinner	ab60de4	2012-11-04 23:59:15 +0100	[diff] [blame]	96	Py_UCS4 ch2;
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	97	if (ch < 0xC2) {
				98	/* invalid sequence
				99	\x80-\xBF -- continuation byte
				100	\xC0-\xC1 -- fake 0000-007F */
				101	goto InvalidStart;
				102	}
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	103	if (end - s < 2) {
				104	/* unexpected end of data: the caller will decide whether
				105	it's an error or not */
				106	break;
				107	}
				108	ch2 = (unsigned char)s[1];
Mark Dickinson	106c414	2012-06-23 21:45:14 +0100	[diff] [blame]	109	if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	110	/* invalid continuation byte */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	111	goto InvalidContinuation1;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	112	ch = (ch << 6) + ch2 -
				113	((0xC0 << 6) + 0x80);
				114	assert ((ch > 0x007F) && (ch <= 0x07FF));
				115	s += 2;
				116	if (STRINGLIB_MAX_CHAR <= 0x007F \|\|
				117	(STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	118	/* Out-of-range */
				119	goto Return;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	120	*p++ = ch;
				121	continue;
				122	}
				123
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	124	if (ch < 0xF0) {
				125	/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
				126	Py_UCS4 ch2, ch3;
				127	if (end - s < 3) {
				128	/* unexpected end of data: the caller will decide whether
				129	it's an error or not */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	130	if (end - s < 2)
				131	break;
				132	ch2 = (unsigned char)s[1];
				133	if (!IS_CONTINUATION_BYTE(ch2) \|\|
				134	(ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
				135	/* for clarification see comments below */
				136	goto InvalidContinuation1;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	137	break;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	138	}
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	139	ch2 = (unsigned char)s[1];
				140	ch3 = (unsigned char)s[2];
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	141	if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	142	/* invalid continuation byte */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	143	goto InvalidContinuation1;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	144	}
				145	if (ch == 0xE0) {
				146	if (ch2 < 0xA0)
				147	/* invalid sequence
				148	\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	149	goto InvalidContinuation1;
				150	} else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	151	/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
				152	will result in surrogates in range D800-DFFF. Surrogates are
				153	not valid UTF-8 so they are rejected.
				154	See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
				155	(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	156	goto InvalidContinuation1;
				157	}
				158	if (!IS_CONTINUATION_BYTE(ch3)) {
				159	/* invalid continuation byte */
				160	goto InvalidContinuation2;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	161	}
				162	ch = (ch << 12) + (ch2 << 6) + ch3 -
				163	((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	164	assert ((ch > 0x07FF) && (ch <= 0xFFFF));
				165	s += 3;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	166	if (STRINGLIB_MAX_CHAR <= 0x07FF \|\|
				167	(STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	168	/* Out-of-range */
				169	goto Return;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	170	*p++ = ch;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	171	continue;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	172	}
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	173
				174	if (ch < 0xF5) {
				175	/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
				176	Py_UCS4 ch2, ch3, ch4;
				177	if (end - s < 4) {
				178	/* unexpected end of data: the caller will decide whether
				179	it's an error or not */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	180	if (end - s < 2)
				181	break;
				182	ch2 = (unsigned char)s[1];
				183	if (!IS_CONTINUATION_BYTE(ch2) \|\|
				184	(ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
				185	/* for clarification see comments below */
				186	goto InvalidContinuation1;
				187	if (end - s < 3)
				188	break;
				189	ch3 = (unsigned char)s[2];
				190	if (!IS_CONTINUATION_BYTE(ch3))
				191	goto InvalidContinuation2;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	192	break;
				193	}
				194	ch2 = (unsigned char)s[1];
				195	ch3 = (unsigned char)s[2];
				196	ch4 = (unsigned char)s[3];
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	197	if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	198	/* invalid continuation byte */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	199	goto InvalidContinuation1;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	200	}
				201	if (ch == 0xF0) {
				202	if (ch2 < 0x90)
				203	/* invalid sequence
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	204	\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
				205	goto InvalidContinuation1;
				206	} else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	207	/* invalid sequence
				208	\xF4\x90\x80\80- -- 110000- overflow */
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	209	goto InvalidContinuation1;
				210	}
				211	if (!IS_CONTINUATION_BYTE(ch3)) {
				212	/* invalid continuation byte */
				213	goto InvalidContinuation2;
				214	}
				215	if (!IS_CONTINUATION_BYTE(ch4)) {
				216	/* invalid continuation byte */
				217	goto InvalidContinuation3;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	218	}
				219	ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
				220	((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
				221	assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
				222	s += 4;
				223	if (STRINGLIB_MAX_CHAR <= 0xFFFF \|\|
				224	(STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	225	/* Out-of-range */
				226	goto Return;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	227	*p++ = ch;
				228	continue;
				229	}
				230	goto InvalidStart;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	231	}
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	232	ch = 0;
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	233	Return:
				234	*inptr = s;
				235	*outpos = p - dest;
				236	return ch;
				237	InvalidStart:
				238	ch = 1;
				239	goto Return;
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	240	InvalidContinuation1:
Antoine Pitrou	ca5f91b	2012-05-10 16:36:02 +0200	[diff] [blame]	241	ch = 2;
				242	goto Return;
Ezio Melotti	f7ed5d1	2012-11-04 23:21:38 +0200	[diff] [blame]	243	InvalidContinuation2:
				244	ch = 3;
				245	goto Return;
				246	InvalidContinuation3:
				247	ch = 4;
				248	goto Return;
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	249	}
				250
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	251	#undef ASCII_CHAR_MASK
				252
Victor Stinner	6099a03	2011-12-18 14:22:26 +0100	[diff] [blame]	253
				254	/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
				255	PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
				256	UCS-1 strings don't need to handle surrogates for example. */
				257	Py_LOCAL_INLINE(PyObject *)
				258	STRINGLIB(utf8_encoder)(PyObject *unicode,
				259	STRINGLIB_CHAR *data,
				260	Py_ssize_t size,
				261	const char *errors)
				262	{
				263	#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
				264
				265	Py_ssize_t i; /* index into s of next input byte */
				266	PyObject result; / result string object */
				267	char p; / next free byte in output buffer */
				268	Py_ssize_t nallocated; /* number of result bytes allocated */
				269	Py_ssize_t nneeded; /* number of result bytes needed */
				270	#if STRINGLIB_SIZEOF_CHAR > 1
				271	PyObject *errorHandler = NULL;
				272	PyObject *exc = NULL;
				273	PyObject *rep = NULL;
				274	#endif
				275	#if STRINGLIB_SIZEOF_CHAR == 1
				276	const Py_ssize_t max_char_size = 2;
				277	char stackbuf[MAX_SHORT_UNICHARS * 2];
				278	#elif STRINGLIB_SIZEOF_CHAR == 2
				279	const Py_ssize_t max_char_size = 3;
				280	char stackbuf[MAX_SHORT_UNICHARS * 3];
				281	#else /* STRINGLIB_SIZEOF_CHAR == 4 */
				282	const Py_ssize_t max_char_size = 4;
				283	char stackbuf[MAX_SHORT_UNICHARS * 4];
				284	#endif
				285
				286	assert(size >= 0);
				287
				288	if (size <= MAX_SHORT_UNICHARS) {
				289	/* Write into the stack buffer; nallocated can't overflow.
				290	* At the end, we'll allocate exactly as much heap space as it
				291	* turns out we need.
				292	*/
				293	nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
				294	result = NULL; /* will allocate after we're done */
				295	p = stackbuf;
				296	}
				297	else {
				298	if (size > PY_SSIZE_T_MAX / max_char_size) {
				299	/* integer overflow */
				300	return PyErr_NoMemory();
				301	}
				302	/* Overallocate on the heap, and give the excess back at the end. */
				303	nallocated = size * max_char_size;
				304	result = PyBytes_FromStringAndSize(NULL, nallocated);
				305	if (result == NULL)
				306	return NULL;
				307	p = PyBytes_AS_STRING(result);
				308	}
				309
				310	for (i = 0; i < size;) {
				311	Py_UCS4 ch = data[i++];
				312
				313	if (ch < 0x80) {
				314	/* Encode ASCII */
				315	*p++ = (char) ch;
				316
				317	}
				318	else
				319	#if STRINGLIB_SIZEOF_CHAR > 1
				320	if (ch < 0x0800)
				321	#endif
				322	{
				323	/* Encode Latin-1 */
				324	*p++ = (char)(0xc0 \| (ch >> 6));
				325	*p++ = (char)(0x80 \| (ch & 0x3f));
				326	}
				327	#if STRINGLIB_SIZEOF_CHAR > 1
				328	else if (Py_UNICODE_IS_SURROGATE(ch)) {
				329	Py_ssize_t newpos;
				330	Py_ssize_t repsize, k, startpos;
				331	startpos = i-1;
				332	rep = unicode_encode_call_errorhandler(
				333	errors, &errorHandler, "utf-8", "surrogates not allowed",
				334	unicode, &exc, startpos, startpos+1, &newpos);
				335	if (!rep)
				336	goto error;
				337
				338	if (PyBytes_Check(rep))
				339	repsize = PyBytes_GET_SIZE(rep);
				340	else
				341	repsize = PyUnicode_GET_LENGTH(rep);
				342
				343	if (repsize > max_char_size) {
				344	Py_ssize_t offset;
				345
				346	if (result == NULL)
				347	offset = p - stackbuf;
				348	else
				349	offset = p - PyBytes_AS_STRING(result);
				350
				351	if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
				352	/* integer overflow */
				353	PyErr_NoMemory();
				354	goto error;
				355	}
				356	nallocated += repsize - max_char_size;
				357	if (result != NULL) {
				358	if (_PyBytes_Resize(&result, nallocated) < 0)
				359	goto error;
				360	} else {
				361	result = PyBytes_FromStringAndSize(NULL, nallocated);
				362	if (result == NULL)
				363	goto error;
				364	Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
				365	}
				366	p = PyBytes_AS_STRING(result) + offset;
				367	}
				368
				369	if (PyBytes_Check(rep)) {
				370	char *prep = PyBytes_AS_STRING(rep);
				371	for(k = repsize; k > 0; k--)
				372	p++ = prep++;
				373	} else /* rep is unicode */ {
				374	enum PyUnicode_Kind repkind;
				375	void *repdata;
				376
				377	if (PyUnicode_READY(rep) < 0)
				378	goto error;
				379	repkind = PyUnicode_KIND(rep);
				380	repdata = PyUnicode_DATA(rep);
				381
				382	for(k=0; k<repsize; k++) {
				383	Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
				384	if (0x80 <= c) {
				385	raise_encode_exception(&exc, "utf-8",
				386	unicode,
				387	i-1, i,
				388	"surrogates not allowed");
				389	goto error;
				390	}
				391	*p++ = (char)c;
				392	}
				393	}
				394	Py_CLEAR(rep);
				395	}
				396	else
				397	#if STRINGLIB_SIZEOF_CHAR > 2
				398	if (ch < 0x10000)
				399	#endif
				400	{
				401	*p++ = (char)(0xe0 \| (ch >> 12));
				402	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				403	*p++ = (char)(0x80 \| (ch & 0x3f));
				404	}
				405	#if STRINGLIB_SIZEOF_CHAR > 2
				406	else /* ch >= 0x10000 */
				407	{
				408	assert(ch <= MAX_UNICODE);
				409	/* Encode UCS4 Unicode ordinals */
				410	*p++ = (char)(0xf0 \| (ch >> 18));
				411	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
				412	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				413	*p++ = (char)(0x80 \| (ch & 0x3f));
				414	}
				415	#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
				416	#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
				417	}
				418
				419	if (result == NULL) {
				420	/* This was stack allocated. */
				421	nneeded = p - stackbuf;
				422	assert(nneeded <= nallocated);
				423	result = PyBytes_FromStringAndSize(stackbuf, nneeded);
				424	}
				425	else {
				426	/* Cut back to size actually needed. */
				427	nneeded = p - PyBytes_AS_STRING(result);
				428	assert(nneeded <= nallocated);
				429	_PyBytes_Resize(&result, nneeded);
				430	}
				431
				432	#if STRINGLIB_SIZEOF_CHAR > 1
				433	Py_XDECREF(errorHandler);
				434	Py_XDECREF(exc);
				435	#endif
				436	return result;
				437
				438	#if STRINGLIB_SIZEOF_CHAR > 1
				439	error:
				440	Py_XDECREF(rep);
				441	Py_XDECREF(errorHandler);
				442	Py_XDECREF(exc);
				443	Py_XDECREF(result);
				444	return NULL;
				445	#endif
				446
				447	#undef MAX_SHORT_UNICHARS
				448	}
				449
Antoine Pitrou	63065d7	2012-05-15 23:48:04 +0200	[diff] [blame]	450	/* The pattern for constructing UCS2-repeated masks. */
				451	#if SIZEOF_LONG == 8
				452	# define UCS2_REPEAT_MASK 0x0001000100010001ul
				453	#elif SIZEOF_LONG == 4
				454	# define UCS2_REPEAT_MASK 0x00010001ul
				455	#else
				456	# error C 'long' size should be either 4 or 8!
				457	#endif
				458
				459	/* The mask for fast checking. */
				460	#if STRINGLIB_SIZEOF_CHAR == 1
				461	/* The mask for fast checking of whether a C 'long' contains a
				462	non-ASCII or non-Latin1 UTF16-encoded characters. */
				463	# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
				464	#else
				465	/* The mask for fast checking of whether a C 'long' may contain
				466	UTF16-encoded surrogate characters. This is an efficient heuristic,
				467	assuming that non-surrogate characters with a code point >= 0x8000 are
				468	rare in most input.
				469	*/
				470	# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
				471	#endif
				472	/* The mask for fast byte-swapping. */
				473	#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
				474	/* Swap bytes. */
				475	#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) \| \
				476	(((value) & STRIPPED_MASK) << 8))
				477
				478	Py_LOCAL_INLINE(Py_UCS4)
				479	STRINGLIB(utf16_decode)(const unsigned char *inptr, const unsigned char e,
				480	STRINGLIB_CHAR dest, Py_ssize_t outpos,
				481	int native_ordering)
				482	{
				483	Py_UCS4 ch;
				484	const unsigned char *aligned_end =
Antoine Pitrou	ca8aa4a	2012-09-20 20:56:47 +0200	[diff] [blame]	485	(const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou	63065d7	2012-05-15 23:48:04 +0200	[diff] [blame]	486	const unsigned char q = inptr;
				487	STRINGLIB_CHAR p = dest + outpos;
				488	/* Offsets from q for retrieving byte pairs in the right order. */
				489	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				490	int ihi = !!native_ordering, ilo = !native_ordering;
				491	#else
				492	int ihi = !native_ordering, ilo = !!native_ordering;
				493	#endif
				494	--e;
				495
				496	while (q < e) {
				497	Py_UCS4 ch2;
				498	/* First check for possible aligned read of a C 'long'. Unaligned
				499	reads are more expensive, better to defer to another iteration. */
Antoine Pitrou	ca8aa4a	2012-09-20 20:56:47 +0200	[diff] [blame]	500	if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou	63065d7	2012-05-15 23:48:04 +0200	[diff] [blame]	501	/* Fast path for runs of in-range non-surrogate chars. */
				502	register const unsigned char *_q = q;
				503	while (_q < aligned_end) {
				504	unsigned long block = * (unsigned long *) _q;
				505	if (native_ordering) {
				506	/* Can use buffer directly */
				507	if (block & FAST_CHAR_MASK)
				508	break;
				509	}
				510	else {
				511	/* Need to byte-swap */
				512	if (block & SWAB(FAST_CHAR_MASK))
				513	break;
				514	#if STRINGLIB_SIZEOF_CHAR == 1
				515	block >>= 8;
				516	#else
				517	block = SWAB(block);
				518	#endif
				519	}
				520	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				521	# if SIZEOF_LONG == 4
				522	p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
				523	p[1] = (STRINGLIB_CHAR)(block >> 16);
				524	# elif SIZEOF_LONG == 8
				525	p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
				526	p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
				527	p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
				528	p[3] = (STRINGLIB_CHAR)(block >> 48);
				529	# endif
				530	#else
				531	# if SIZEOF_LONG == 4
				532	p[0] = (STRINGLIB_CHAR)(block >> 16);
				533	p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
				534	# elif SIZEOF_LONG == 8
				535	p[0] = (STRINGLIB_CHAR)(block >> 48);
				536	p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
				537	p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
				538	p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
				539	# endif
				540	#endif
				541	_q += SIZEOF_LONG;
				542	p += SIZEOF_LONG / 2;
				543	}
				544	q = _q;
				545	if (q >= e)
				546	break;
				547	}
				548
				549	ch = (q[ihi] << 8) \| q[ilo];
				550	q += 2;
				551	if (!Py_UNICODE_IS_SURROGATE(ch)) {
				552	#if STRINGLIB_SIZEOF_CHAR < 2
				553	if (ch > STRINGLIB_MAX_CHAR)
				554	/* Out-of-range */
				555	goto Return;
				556	#endif
				557	*p++ = (STRINGLIB_CHAR)ch;
				558	continue;
				559	}
				560
				561	/* UTF-16 code pair: */
				562	if (q >= e)
				563	goto UnexpectedEnd;
				564	if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
				565	goto IllegalEncoding;
				566	ch2 = (q[ihi] << 8) \| q[ilo];
				567	q += 2;
				568	if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
				569	goto IllegalSurrogate;
				570	ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
				571	#if STRINGLIB_SIZEOF_CHAR < 4
				572	/* Out-of-range */
				573	goto Return;
				574	#else
				575	*p++ = (STRINGLIB_CHAR)ch;
				576	#endif
				577	}
				578	ch = 0;
				579	Return:
				580	*inptr = q;
				581	*outpos = p - dest;
				582	return ch;
				583	UnexpectedEnd:
				584	ch = 1;
				585	goto Return;
				586	IllegalEncoding:
				587	ch = 2;
				588	goto Return;
				589	IllegalSurrogate:
				590	ch = 3;
				591	goto Return;
				592	}
				593	#undef UCS2_REPEAT_MASK
				594	#undef FAST_CHAR_MASK
				595	#undef STRIPPED_MASK
				596	#undef SWAB
Antoine Pitrou	27f6a3b	2012-06-15 22:15:23 +0200	[diff] [blame]	597
				598
				599	Py_LOCAL_INLINE(void)
				600	STRINGLIB(utf16_encode)(unsigned short *out,
				601	const STRINGLIB_CHAR *in,
				602	Py_ssize_t len,
				603	int native_ordering)
				604	{
				605	const STRINGLIB_CHAR *end = in + len;
				606	#if STRINGLIB_SIZEOF_CHAR == 1
				607	# define SWAB2(CH) ((CH) << 8)
				608	#else
				609	# define SWAB2(CH) (((CH) << 8) \| ((CH) >> 8))
				610	#endif
				611	#if STRINGLIB_MAX_CHAR < 0x10000
				612	if (native_ordering) {
				613	# if STRINGLIB_SIZEOF_CHAR == 2
				614	Py_MEMCPY(out, in, 2 * len);
				615	# else
				616	_PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
				617	# endif
				618	} else {
Antoine Pitrou	ca8aa4a	2012-09-20 20:56:47 +0200	[diff] [blame]	619	const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou	27f6a3b	2012-06-15 22:15:23 +0200	[diff] [blame]	620	while (in < unrolled_end) {
				621	out[0] = SWAB2(in[0]);
				622	out[1] = SWAB2(in[1]);
				623	out[2] = SWAB2(in[2]);
				624	out[3] = SWAB2(in[3]);
				625	in += 4; out += 4;
				626	}
				627	while (in < end) {
				628	out++ = SWAB2(in);
				629	++in;
				630	}
				631	}
				632	#else
				633	if (native_ordering) {
				634	while (in < end) {
				635	Py_UCS4 ch = *in++;
				636	if (ch < 0x10000)
				637	*out++ = ch;
				638	else {
				639	out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
				640	out[1] = Py_UNICODE_LOW_SURROGATE(ch);
				641	out += 2;
				642	}
				643	}
				644	} else {
				645	while (in < end) {
				646	Py_UCS4 ch = *in++;
				647	if (ch < 0x10000)
				648	*out++ = SWAB2((Py_UCS2)ch);
				649	else {
				650	Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
				651	Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
				652	out[0] = SWAB2(ch1);
				653	out[1] = SWAB2(ch2);
				654	out += 2;
				655	}
				656	}
				657	}
				658	#endif
				659	#undef SWAB2
				660	}
Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	661	#endif /* STRINGLIB_IS_UNICODE */