Blame - src/encoding.c - platform/external/libvterm

blob: 14958554f2ddb5db9f764fab8e7f447633b4518e [file] [log] [blame]

Jeff Sharkey	5b78a3a	2013-02-19 17:28:10 -0800	[diff] [blame]	1	#include "vterm_internal.h"
				2
				3	#define UNICODE_INVALID 0xFFFD
				4
Elliott Hughes	6d78f36	2014-12-04 19:52:44 -0800	[diff] [blame]	5	#if defined(DEBUG) && DEBUG > 1
Jeff Sharkey	5b78a3a	2013-02-19 17:28:10 -0800	[diff] [blame]	6	# define DEBUG_PRINT_UTF8
				7	#endif
				8
				9	struct UTF8DecoderData {
				10	// number of bytes remaining in this codepoint
				11	int bytes_remaining;
				12
				13	// number of bytes total in this codepoint once it's finished
				14	// (for detecting overlongs)
				15	int bytes_total;
				16
				17	int this_cp;
				18	};
				19
				20	static void init_utf8(VTermEncoding enc, void data_)
				21	{
				22	struct UTF8DecoderData *data = data_;
				23
				24	data->bytes_remaining = 0;
				25	data->bytes_total = 0;
				26	}
				27
				28	static void decode_utf8(VTermEncoding enc, void data_,
				29	uint32_t cp[], int *cpi, int cplen,
				30	const char bytes[], size_t *pos, size_t bytelen)
				31	{
				32	struct UTF8DecoderData *data = data_;
				33
				34	#ifdef DEBUG_PRINT_UTF8
				35	printf("BEGIN UTF-8\n");
				36	#endif
				37
				38	for(; pos < bytelen && cpi < cplen; (*pos)++) {
				39	unsigned char c = bytes[*pos];
				40
				41	#ifdef DEBUG_PRINT_UTF8
				42	printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining);
				43	#endif
				44
				45	if(c < 0x20)
				46	return;
				47
				48	else if(c >= 0x20 && c < 0x80) {
				49	if(data->bytes_remaining)
				50	cp[(*cpi)++] = UNICODE_INVALID;
				51
				52	cp[(*cpi)++] = c;
				53	#ifdef DEBUG_PRINT_UTF8
				54	printf(" UTF-8 char: U+%04x\n", c);
				55	#endif
				56	data->bytes_remaining = 0;
				57	}
				58
				59	else if(c >= 0x80 && c < 0xc0) {
				60	if(!data->bytes_remaining) {
				61	cp[(*cpi)++] = UNICODE_INVALID;
				62	continue;
				63	}
				64
				65	data->this_cp <<= 6;
				66	data->this_cp \|= c & 0x3f;
				67	data->bytes_remaining--;
				68
				69	if(!data->bytes_remaining) {
				70	#ifdef DEBUG_PRINT_UTF8
				71	printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total);
				72	#endif
				73	// Check for overlong sequences
				74	switch(data->bytes_total) {
				75	case 2:
				76	if(data->this_cp < 0x0080) data->this_cp = UNICODE_INVALID; break;
				77	case 3:
				78	if(data->this_cp < 0x0800) data->this_cp = UNICODE_INVALID; break;
				79	case 4:
				80	if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; break;
				81	case 5:
				82	if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; break;
				83	case 6:
				84	if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; break;
				85	}
				86	// Now look for plain invalid ones
				87	if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) \|\|
				88	data->this_cp == 0xFFFE \|\|
				89	data->this_cp == 0xFFFF)
				90	data->this_cp = UNICODE_INVALID;
				91	#ifdef DEBUG_PRINT_UTF8
				92	printf(" char: U+%04x\n", data->this_cp);
				93	#endif
				94	cp[(*cpi)++] = data->this_cp;
				95	}
				96	}
				97
				98	else if(c >= 0xc0 && c < 0xe0) {
				99	if(data->bytes_remaining)
				100	cp[(*cpi)++] = UNICODE_INVALID;
				101
				102	data->this_cp = c & 0x1f;
				103	data->bytes_total = 2;
				104	data->bytes_remaining = 1;
				105	}
				106
				107	else if(c >= 0xe0 && c < 0xf0) {
				108	if(data->bytes_remaining)
				109	cp[(*cpi)++] = UNICODE_INVALID;
				110
				111	data->this_cp = c & 0x0f;
				112	data->bytes_total = 3;
				113	data->bytes_remaining = 2;
				114	}
				115
				116	else if(c >= 0xf0 && c < 0xf8) {
				117	if(data->bytes_remaining)
				118	cp[(*cpi)++] = UNICODE_INVALID;
				119
				120	data->this_cp = c & 0x07;
				121	data->bytes_total = 4;
				122	data->bytes_remaining = 3;
				123	}
				124
				125	else if(c >= 0xf8 && c < 0xfc) {
				126	if(data->bytes_remaining)
				127	cp[(*cpi)++] = UNICODE_INVALID;
				128
				129	data->this_cp = c & 0x03;
				130	data->bytes_total = 5;
				131	data->bytes_remaining = 4;
				132	}
				133
				134	else if(c >= 0xfc && c < 0xfe) {
				135	if(data->bytes_remaining)
				136	cp[(*cpi)++] = UNICODE_INVALID;
				137
				138	data->this_cp = c & 0x01;
				139	data->bytes_total = 6;
				140	data->bytes_remaining = 5;
				141	}
				142
				143	else {
				144	cp[(*cpi)++] = UNICODE_INVALID;
				145	}
				146	}
				147	}
				148
				149	static VTermEncoding encoding_utf8 = {
				150	.init = &init_utf8,
				151	.decode = &decode_utf8,
				152	};
				153
				154	static void decode_usascii(VTermEncoding enc, void data,
				155	uint32_t cp[], int *cpi, int cplen,
				156	const char bytes[], size_t *pos, size_t bytelen)
				157	{
				158	int is_gr = bytes[*pos] & 0x80;
				159
				160	for(; pos < bytelen && cpi < cplen; (*pos)++) {
				161	unsigned char c = bytes[*pos] ^ is_gr;
				162
				163	if(c < 0x20 \|\| c >= 0x80)
				164	return;
				165
				166	cp[(*cpi)++] = c;
				167	}
				168	}
				169
				170	static VTermEncoding encoding_usascii = {
				171	.decode = &decode_usascii,
				172	};
				173
				174	struct StaticTableEncoding {
				175	const VTermEncoding enc;
				176	const uint32_t chars[128];
				177	};
				178
				179	static void decode_table(VTermEncoding enc, void data,
				180	uint32_t cp[], int *cpi, int cplen,
				181	const char bytes[], size_t *pos, size_t bytelen)
				182	{
				183	struct StaticTableEncoding table = (struct StaticTableEncoding )enc;
				184	int is_gr = bytes[*pos] & 0x80;
				185
				186	for(; pos < bytelen && cpi < cplen; (*pos)++) {
				187	unsigned char c = bytes[*pos] ^ is_gr;
				188
				189	if(c < 0x20 \|\| c >= 0x80)
				190	return;
				191
				192	if(table->chars[c])
				193	cp[(*cpi)++] = table->chars[c];
				194	else
				195	cp[(*cpi)++] = c;
				196	}
				197	}
				198
				199	#include "encoding/DECdrawing.inc"
				200	#include "encoding/uk.inc"
				201
				202	static struct {
				203	VTermEncodingType type;
				204	char designation;
				205	VTermEncoding *enc;
				206	}
				207	encodings[] = {
				208	{ ENC_UTF8, 'u', &encoding_utf8 },
				209	{ ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing },
				210	{ ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk },
				211	{ ENC_SINGLE_94, 'B', &encoding_usascii },
Jeff Sharkey	73fbfc3	2013-04-23 10:34:06 -0700	[diff] [blame]	212	{ 0 },
Jeff Sharkey	5b78a3a	2013-02-19 17:28:10 -0800	[diff] [blame]	213	};
				214
Elliott Hughes	6d78f36	2014-12-04 19:52:44 -0800	[diff] [blame]	215	/* This ought to be INTERNAL but isn't because it's used by unit testing */
Jeff Sharkey	5b78a3a	2013-02-19 17:28:10 -0800	[diff] [blame]	216	VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
				217	{
				218	for(int i = 0; encodings[i].designation; i++)
				219	if(encodings[i].type == type && encodings[i].designation == designation)
				220	return encodings[i].enc;
				221	return NULL;
				222	}