Blame - Lib/codecs.py - platform/external/python/cpython2

blob: 56b6dcf0dcc38962c3ac39da2c1b1f3d39fc9004 [file] [log] [blame]

Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	1	""" codecs -- Python Codec Registry, API and helpers.
				2
				3
				4	Written by Marc-Andre Lemburg (mal@lemburg.com).
				5
				6	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				7
				8	"""#"
				9
				10	import struct,types,__builtin__
				11
				12	### Registry and builtin stateless codec functions
				13
Guido van Rossum	b95de4f	2000-03-31 17:25:23 +0000	[diff] [blame]	14	try:
				15	from _codecs import *
				16	except ImportError,why:
				17	raise SystemError,\
				18	'Failed to load the builtin codecs: %s' % why
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	19
				20	### Constants
				21
				22	#
				23	# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
				24	#
				25	BOM = struct.pack('=H',0xFEFF)
				26	#
				27	BOM_BE = BOM32_BE = '\376\377'
				28	# corresponds to Unicode U+FEFF in UTF-16 on big endian
				29	# platforms == ZERO WIDTH NO-BREAK SPACE
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	30	BOM_LE = BOM32_LE = '\377\376'
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	31	# corresponds to Unicode U+FFFE in UTF-16 on little endian
				32	# platforms == defined as being an illegal Unicode character
				33
				34	#
				35	# 64-bit Byte Order Marks
				36	#
				37	BOM64_BE = '\000\000\376\377'
				38	# corresponds to Unicode U+0000FEFF in UCS-4
				39	BOM64_LE = '\377\376\000\000'
				40	# corresponds to Unicode U+0000FFFE in UCS-4
				41
				42
				43	### Codec base classes (defining the API)
				44
				45	class Codec:
				46
				47	""" Defines the interface for stateless encoders/decoders.
				48
				49	The .encode()/.decode() methods may implement different error
				50	handling schemes by providing the errors argument. These
				51	string values are defined:
				52
Guido van Rossum	d8855fd	2000-03-24 22:14:19 +0000	[diff] [blame]	53	'strict' - raise a ValueError error (or a subclass)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	54	'ignore' - ignore the character and continue with the next
				55	'replace' - replace with a suitable replacement character;
				56	Python will use the official U+FFFD REPLACEMENT
				57	CHARACTER for the builtin Unicode codecs.
				58
				59	"""
				60	def encode(self,input,errors='strict'):
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	61
Fred Drake	3e74c0d	2000-03-17 15:40:35 +0000	[diff] [blame]	62	""" Encodes the object input and returns a tuple (output
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	63	object, length consumed).
				64
				65	errors defines the error handling to apply. It defaults to
				66	'strict' handling.
				67
				68	The method may not store state in the Codec instance. Use
				69	StreamCodec for codecs which have to keep state in order to
				70	make encoding/decoding efficient.
				71
				72	The encoder must be able to handle zero length input and
				73	return an empty object of the output object type in this
				74	situation.
				75
				76	"""
				77	raise NotImplementedError
				78
				79	def decode(self,input,errors='strict'):
				80
				81	""" Decodes the object input and returns a tuple (output
				82	object, length consumed).
				83
				84	input must be an object which provides the bf_getreadbuf
				85	buffer slot. Python strings, buffer objects and memory
				86	mapped files are examples of objects providing this slot.
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	87
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	88	errors defines the error handling to apply. It defaults to
				89	'strict' handling.
				90
				91	The method may not store state in the Codec instance. Use
				92	StreamCodec for codecs which have to keep state in order to
				93	make encoding/decoding efficient.
				94
				95	The decoder must be able to handle zero length input and
				96	return an empty object of the output object type in this
				97	situation.
				98
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	99	"""
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	100	raise NotImplementedError
				101
				102	#
				103	# The StreamWriter and StreamReader classes provide generic working
				104	# interfaces which can be used to implement new encoding submodules
				105	# very easily. See encodings/utf_8.py for an example of how this is
				106	# done.
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	107	#
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	108
				109	class StreamWriter(Codec):
				110
				111	def __init__(self,stream,errors='strict'):
				112
				113	""" Creates a StreamWriter instance.
				114
				115	stream must be a file-like object open for writing
				116	(binary) data.
				117
				118	The StreamWriter may implement different error handling
				119	schemes by providing the errors keyword argument. These
				120	parameters are defined:
				121
				122	'strict' - raise a ValueError (or a subclass)
				123	'ignore' - ignore the character and continue with the next
				124	'replace'- replace with a suitable replacement character
				125
				126	"""
				127	self.stream = stream
				128	self.errors = errors
				129
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	130	def write(self, object):
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	131
				132	""" Writes the object's contents encoded to self.stream.
				133	"""
				134	data, consumed = self.encode(object,self.errors)
				135	self.stream.write(data)
				136
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	137	def writelines(self, list):
				138
				139	""" Writes the concatenated list of strings to the stream
				140	using .write().
				141	"""
				142	self.write(''.join(list))
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	143
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	144	def reset(self):
				145
				146	""" Flushes and resets the codec buffers used for keeping state.
				147
				148	Calling this method should ensure that the data on the
				149	output is put into a clean state, that allows appending
				150	of new fresh data without having to rescan the whole
				151	stream to recover state.
				152
				153	"""
				154	pass
				155
				156	def __getattr__(self,name,
				157
				158	getattr=getattr):
				159
				160	""" Inherit all other methods from the underlying stream.
				161	"""
				162	return getattr(self.stream,name)
				163
				164	###
				165
				166	class StreamReader(Codec):
				167
				168	def __init__(self,stream,errors='strict'):
				169
				170	""" Creates a StreamReader instance.
				171
				172	stream must be a file-like object open for reading
				173	(binary) data.
				174
				175	The StreamReader may implement different error handling
				176	schemes by providing the errors keyword argument. These
				177	parameters are defined:
				178
				179	'strict' - raise a ValueError (or a subclass)
				180	'ignore' - ignore the character and continue with the next
				181	'replace'- replace with a suitable replacement character;
				182
				183	"""
				184	self.stream = stream
				185	self.errors = errors
				186
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	187	def read(self, size=-1):
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	188
				189	""" Decodes data from the stream self.stream and returns the
				190	resulting object.
				191
				192	size indicates the approximate maximum number of bytes to
				193	read from the stream for decoding purposes. The decoder
				194	can modify this setting as appropriate. The default value
				195	-1 indicates to read and decode as much as possible. size
				196	is intended to prevent having to decode huge files in one
				197	step.
				198
				199	The method should use a greedy read strategy meaning that
				200	it should read as much data as is allowed within the
				201	definition of the encoding and the given size, e.g. if
				202	optional encoding endings or state markers are available
				203	on the stream, these should be read too.
				204
				205	"""
				206	# Unsliced reading:
				207	if size < 0:
				208	return self.decode(self.stream.read())[0]
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	209
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	210	# Sliced reading:
				211	read = self.stream.read
				212	decode = self.decode
				213	data = read(size)
				214	i = 0
				215	while 1:
				216	try:
				217	object, decodedbytes = decode(data)
				218	except ValueError,why:
				219	# This method is slow but should work under pretty much
				220	# all conditions; at most 10 tries are made
				221	i = i + 1
				222	newdata = read(1)
				223	if not newdata or i > 10:
				224	raise
				225	data = data + newdata
				226	else:
				227	return object
				228
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	229	def readline(self, size=None):
				230
				231	""" Read one line from the input stream and return the
				232	decoded data.
				233
Fred Drake	49fd107	2000-04-13 14:11:21 +0000	[diff] [blame]	234	Note: Unlike the .readlines() method, this method inherits
				235	the line breaking knowledge from the underlying stream's
				236	.readline() method -- there is currently no support for
				237	line breaking using the codec decoder due to lack of line
				238	buffering. Sublcasses should however, if possible, try to
				239	implement this method using their own knowledge of line
				240	breaking.
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	241
				242	size, if given, is passed as size argument to the stream's
				243	.readline() method.
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	244
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	245	"""
				246	if size is None:
				247	line = self.stream.readline()
				248	else:
				249	line = self.stream.readline(size)
				250	return self.decode(line)[0]
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	251
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	252
				253	def readlines(self, sizehint=0):
				254
				255	""" Read all lines available on the input stream
				256	and return them as list of lines.
				257
				258	Line breaks are implemented using the codec's decoder
				259	method and are included in the list entries.
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	260
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	261	sizehint, if given, is passed as size argument to the
				262	stream's .read() method.
				263
				264	"""
				265	if sizehint is None:
				266	data = self.stream.read()
				267	else:
				268	data = self.stream.read(sizehint)
				269	return self.decode(data)[0].splitlines(1)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	270
				271	def reset(self):
				272
				273	""" Resets the codec buffers used for keeping state.
				274
				275	Note that no stream repositioning should take place.
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	276	This method is primarily intended to be able to recover
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	277	from decoding errors.
				278
				279	"""
				280	pass
				281
				282	def __getattr__(self,name,
				283
				284	getattr=getattr):
				285
				286	""" Inherit all other methods from the underlying stream.
				287	"""
				288	return getattr(self.stream,name)
				289
				290	###
				291
				292	class StreamReaderWriter:
				293
Fred Drake	49fd107	2000-04-13 14:11:21 +0000	[diff] [blame]	294	""" StreamReaderWriter instances allow wrapping streams which
				295	work in both read and write modes.
				296
				297	The design is such that one can use the factory functions
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	298	returned by the codec.lookup() function to construct the
Fred Drake	49fd107	2000-04-13 14:11:21 +0000	[diff] [blame]	299	instance.
				300
				301	"""
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	302	# Optional attributes set by the file wrappers below
				303	encoding = 'unknown'
				304
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	305	def __init__(self,stream,Reader,Writer,errors='strict'):
				306
				307	""" Creates a StreamReaderWriter instance.
				308
				309	stream must be a Stream-like object.
				310
				311	Reader, Writer must be factory functions or classes
				312	providing the StreamReader, StreamWriter interface resp.
				313
				314	Error handling is done in the same way as defined for the
				315	StreamWriter/Readers.
				316
				317	"""
				318	self.stream = stream
				319	self.reader = Reader(stream, errors)
				320	self.writer = Writer(stream, errors)
				321	self.errors = errors
				322
				323	def read(self,size=-1):
				324
				325	return self.reader.read(size)
				326
Guido van Rossum	d58c26f	2000-05-01 16:17:32 +0000	[diff] [blame]	327	def readline(self, size=None):
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	328
				329	return self.reader.readline(size)
				330
Guido van Rossum	d58c26f	2000-05-01 16:17:32 +0000	[diff] [blame]	331	def readlines(self, sizehint=None):
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	332
				333	return self.reader.readlines(sizehint)
				334
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	335	def write(self,data):
				336
				337	return self.writer.write(data)
				338
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	339	def writelines(self,list):
				340
				341	return self.writer.writelines(list)
				342
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	343	def reset(self):
				344
				345	self.reader.reset()
				346	self.writer.reset()
				347
				348	def __getattr__(self,name,
				349
				350	getattr=getattr):
				351
				352	""" Inherit all other methods from the underlying stream.
				353	"""
				354	return getattr(self.stream,name)
				355
				356	###
				357
				358	class StreamRecoder:
				359
Fred Drake	49fd107	2000-04-13 14:11:21 +0000	[diff] [blame]	360	""" StreamRecoder instances provide a frontend - backend
				361	view of encoding data.
				362
				363	They use the complete set of APIs returned by the
				364	codecs.lookup() function to implement their task.
				365
				366	Data written to the stream is first decoded into an
				367	intermediate format (which is dependent on the given codec
				368	combination) and then written to the stream using an instance
				369	of the provided Writer class.
				370
				371	In the other direction, data is read from the stream using a
				372	Reader instance and then return encoded data to the caller.
				373
				374	"""
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	375	# Optional attributes set by the file wrappers below
				376	data_encoding = 'unknown'
				377	file_encoding = 'unknown'
				378
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	379	def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
				380
				381	""" Creates a StreamRecoder instance which implements a two-way
				382	conversion: encode and decode work on the frontend (the
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	383	input to .read() and output of .write()) while
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	384	Reader and Writer work on the backend (reading and
Fred Drake	908670c	2000-03-17 15:42:11 +0000	[diff] [blame]	385	writing to the stream).
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	386
				387	You can use these objects to do transparent direct
				388	recodings from e.g. latin-1 to utf-8 and back.
				389
				390	stream must be a file-like object.
				391
				392	encode, decode must adhere to the Codec interface, Reader,
				393	Writer must be factory functions or classes providing the
				394	StreamReader, StreamWriter interface resp.
				395
				396	encode and decode are needed for the frontend translation,
				397	Reader and Writer for the backend translation. Unicode is
				398	used as intermediate encoding.
				399
				400	Error handling is done in the same way as defined for the
				401	StreamWriter/Readers.
				402
				403	"""
				404	self.stream = stream
				405	self.encode = encode
				406	self.decode = decode
				407	self.reader = Reader(stream, errors)
				408	self.writer = Writer(stream, errors)
				409	self.errors = errors
				410
				411	def read(self,size=-1):
				412
				413	data = self.reader.read(size)
				414	data, bytesencoded = self.encode(data, self.errors)
				415	return data
				416
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	417	def readline(self,size=None):
				418
				419	if size is None:
				420	data = self.reader.readline()
				421	else:
				422	data = self.reader.readline(size)
				423	data, bytesencoded = self.encode(data, self.errors)
				424	return data
				425
				426	def readlines(self,sizehint=None):
				427
				428	if sizehint is None:
				429	data = self.reader.read()
				430	else:
				431	data = self.reader.read(sizehint)
				432	data, bytesencoded = self.encode(data, self.errors)
				433	return data.splitlines(1)
				434
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	435	def write(self,data):
				436
				437	data, bytesdecoded = self.decode(data, self.errors)
				438	return self.writer.write(data)
				439
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	440	def writelines(self,list):
				441
				442	data = ''.join(list)
				443	data, bytesdecoded = self.decode(data, self.errors)
				444	return self.writer.write(data)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	445
				446	def reset(self):
				447
				448	self.reader.reset()
				449	self.writer.reset()
				450
				451	def __getattr__(self,name,
				452
				453	getattr=getattr):
				454
				455	""" Inherit all other methods from the underlying stream.
				456	"""
				457	return getattr(self.stream,name)
				458
				459	### Shortcuts
				460
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file is opened with the builtin
        open() using the mode as given and the plain file object is
        returned.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        The codec is looked up before the file is opened, so an
        unknown encoding raises the lookup error without creating,
        truncating or opening the file.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)
    # Resolve the codec first: a failing lookup must not leave an open
    # file handle behind or clobber an existing file opened for writing
    Reader, Writer = lookup(encoding)[2:]
    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    srw = StreamReaderWriter(file, Reader, Writer, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	504
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which provides transparent
        encoding translation and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given
        (or given as None).

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: the stateless encode/decode pair of the data encoding
    frontend_encode, frontend_decode = lookup(data_encoding)[:2]
    # Backend: the stream reader/writer factories of the file encoding
    backend_reader, backend_writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file,
                            frontend_encode, frontend_decode,
                            backend_reader, backend_writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	541
				542	### Tests
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	543
if __name__ == '__main__':

    import sys

    # Wrap stdout: strings written to it are interpreted as Latin-1
    # and reach the real stdout encoded as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Wrap stdin: data is read from the real stdin as Latin-1 and
    # handed to the caller encoded as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
Guido van Rossum	1c89b0e	2000-04-11 15:41:38 +0000	[diff] [blame]	550
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame]	551	# Have stdin translate Latin-1 input into UTF-8 input
				552	sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')