Blame - Objects/stringlib/codecs.h - platform/external/python/cpython3

blob: 148bacbaa69ff56914a613678f02e6802b189649 [file] [log] [blame]

Antoine Pitrou	0a3229d	2011-11-21 20:39:13 +0100	[diff] [blame]	1	/* stringlib: codec implementations */
				2
				3	#if STRINGLIB_IS_UNICODE
				4
				5	/* Mask to check or force alignment of a pointer to C 'long' boundaries */
				6	#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
				7
				8	/* Mask to quickly check whether a C 'long' contains a
				9	non-ASCII, UTF8-encoded char. */
				10	#if (SIZEOF_LONG == 8)
				11	# define ASCII_CHAR_MASK 0x8080808080808080L
				12	#elif (SIZEOF_LONG == 4)
				13	# define ASCII_CHAR_MASK 0x80808080L
				14	#else
				15	# error C 'long' size should be either 4 or 8!
				16	#endif
				17
				18	Py_LOCAL_INLINE(int)
				19	STRINGLIB(utf8_try_decode)(const char start, const char end,
				20	STRINGLIB_CHAR *dest,
				21	const char *src_pos, Py_ssize_t dest_index)
				22	{
				23	int ret;
				24	Py_ssize_t n;
				25	const char *s = start;
				26	const char aligned_end = (const char ) ((size_t) end & ~LONG_PTR_MASK);
				27	STRINGLIB_CHAR *p = dest;
				28
				29	while (s < end) {
				30	Py_UCS4 ch = (unsigned char)*s;
				31
				32	if (ch < 0x80) {
				33	/* Fast path for runs of ASCII characters. Given that common UTF-8
				34	input will consist of an overwhelming majority of ASCII
				35	characters, we try to optimize for this case by checking
				36	as many characters as a C 'long' can contain.
				37	First, check if we can do an aligned read, as most CPUs have
				38	a penalty for unaligned reads.
				39	*/
				40	if (!((size_t) s & LONG_PTR_MASK)) {
				41	/* Help register allocation */
				42	register const char *_s = s;
				43	register STRINGLIB_CHAR *_p = p;
				44	while (_s < aligned_end) {
				45	/* Read a whole long at a time (either 4 or 8 bytes),
				46	and do a fast unrolled copy if it only contains ASCII
				47	characters. */
				48	unsigned long value = (unsigned long ) _s;
				49	if (value & ASCII_CHAR_MASK)
				50	break;
				51	_p[0] = _s[0];
				52	_p[1] = _s[1];
				53	_p[2] = _s[2];
				54	_p[3] = _s[3];
				55	#if (SIZEOF_LONG == 8)
				56	_p[4] = _s[4];
				57	_p[5] = _s[5];
				58	_p[6] = _s[6];
				59	_p[7] = _s[7];
				60	#endif
				61	_s += SIZEOF_LONG;
				62	_p += SIZEOF_LONG;
				63	}
				64	s = _s;
				65	p = _p;
				66	if (s == end)
				67	break;
				68	ch = (unsigned char)*s;
				69	}
				70	}
				71
				72	if (ch < 0x80) {
				73	s++;
				74	*p++ = ch;
				75	continue;
				76	}
				77
				78	n = utf8_code_length[ch];
				79
				80	if (s + n > end) {
				81	/* unexpected end of data: the caller will decide whether
				82	it's an error or not */
				83	goto _error;
				84	}
				85
				86	switch (n) {
				87	case 0:
				88	/* invalid start byte */
				89	goto _error;
				90	case 1:
				91	/* internal error */
				92	goto _error;
				93	case 2:
				94	if ((s[1] & 0xc0) != 0x80)
				95	/* invalid continuation byte */
				96	goto _error;
				97	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				98	assert ((ch > 0x007F) && (ch <= 0x07FF));
				99	s += 2;
				100	*p++ = ch;
				101	break;
				102
				103	case 3:
				104	/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
				105	will result in surrogates in range d800-dfff. Surrogates are
				106	not valid UTF-8 so they are rejected.
				107	See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
				108	(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
				109	if ((s[1] & 0xc0) != 0x80 \|\|
				110	(s[2] & 0xc0) != 0x80 \|\|
				111	((unsigned char)s[0] == 0xE0 &&
				112	(unsigned char)s[1] < 0xA0) \|\|
				113	((unsigned char)s[0] == 0xED &&
				114	(unsigned char)s[1] > 0x9F)) {
				115	/* invalid continuation byte */
				116	goto _error;
				117	}
				118	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				119	assert ((ch > 0x07FF) && (ch <= 0xFFFF));
				120	s += 3;
				121	*p++ = ch;
				122	break;
				123
				124	case 4:
				125	if ((s[1] & 0xc0) != 0x80 \|\|
				126	(s[2] & 0xc0) != 0x80 \|\|
				127	(s[3] & 0xc0) != 0x80 \|\|
				128	((unsigned char)s[0] == 0xF0 &&
				129	(unsigned char)s[1] < 0x90) \|\|
				130	((unsigned char)s[0] == 0xF4 &&
				131	(unsigned char)s[1] > 0x8F)) {
				132	/* invalid continuation byte */
				133	goto _error;
				134	}
				135	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				136	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				137	assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
				138	s += 4;
				139	*p++ = ch;
				140	break;
				141	}
				142	}
				143	ret = 0;
				144	goto _ok;
				145	_error:
				146	ret = -1;
				147	_ok:
				148	*src_pos = s;
				149	*dest_index = p - dest;
				150	return ret;
				151	}
				152
				153	#undef LONG_PTR_MASK	/* helper masks are private to this header: drop them */
				154	#undef ASCII_CHAR_MASK	/* so they do not leak into the including file */
				155
				156	#endif /* STRINGLIB_IS_UNICODE */