Blame - Lib/codecs.py - platform/external/python/cpython3

blob: 6a61e1aa60434038246a72303e2612cd1b621655 [file] [log] [blame]

Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	1	""" codecs -- Python Codec Registry, API and helpers.
				2
				3
				4	Written by Marc-Andre Lemburg (mal@lemburg.com).
				5
				6	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				7
				8	"""#"
				9
				10	import struct,types,__builtin__
				11
				12	### Registry and builtin stateless codec functions
				13
Guido van Rossum	b95de4f	2000-03-31 17:25:23 +0000	[diff] [blame]	14	try:
				15	from _codecs import *
				16	except ImportError,why:
				17	raise SystemError,\
				18	'Failed to load the builtin codecs: %s' % why
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	19
				20	### Constants
				21
				22	#
				23	# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
				24	#
				25	BOM = struct.pack('=H',0xFEFF)
				26	#
				27	BOM_BE = BOM32_BE = '\376\377'
				28	# corresponds to Unicode U+FEFF in UTF-16 on big endian
				29	# platforms == ZERO WIDTH NO-BREAK SPACE
				30	BOM_LE = BOM32_LE = '\377\376'
				31	# corresponds to Unicode U+FFFE in UTF-16 on little endian
				32	# platforms == defined as being an illegal Unicode character
				33
				34	#
				35	# 64-bit Byte Order Marks
				36	#
				37	BOM64_BE = '\000\000\376\377'
				38	# corresponds to Unicode U+0000FEFF in UCS-4
				39	BOM64_LE = '\377\376\000\000'
				40	# corresponds to Unicode U+0000FFFE in UCS-4
				41
				42
				43	### Codec base classes (defining the API)
				44
				45	class Codec:
				46
				47	""" Defines the interface for stateless encoders/decoders.
				48
				49	The .encode()/.decode() methods may implement different error
				50	handling schemes by providing the errors argument. These
				51	string values are defined:
				52
Guido van Rossum	d8855fd	2000-03-24 22:14:19 +0000	[diff] [blame]	53	'strict' - raise a ValueError error (or a subclass)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	54	'ignore' - ignore the character and continue with the next
				55	'replace' - replace with a suitable replacement character;
				56	Python will use the official U+FFFD REPLACEMENT
				57	CHARACTER for the builtin Unicode codecs.
				58
				59	"""
				60	def encode(self,input,errors='strict'):
				61
Fred Drake	3e74c0d	2000-03-17 15:40:35 +0000	[diff] [blame]	62	""" Encodes the object input and returns a tuple (output
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	63	object, length consumed).
				64
				65	errors defines the error handling to apply. It defaults to
				66	'strict' handling.
				67
				68	The method may not store state in the Codec instance. Use
				69	StreamCodec for codecs which have to keep state in order to
				70	make encoding/decoding efficient.
				71
				72	The encoder must be able to handle zero length input and
				73	return an empty object of the output object type in this
				74	situation.
				75
				76	"""
				77	raise NotImplementedError
				78
				79	def decode(self,input,errors='strict'):
				80
				81	""" Decodes the object input and returns a tuple (output
				82	object, length consumed).
				83
				84	input must be an object which provides the bf_getreadbuf
				85	buffer slot. Python strings, buffer objects and memory
				86	mapped files are examples of objects providing this slot.
				87
				88	errors defines the error handling to apply. It defaults to
				89	'strict' handling.
				90
				91	The method may not store state in the Codec instance. Use
				92	StreamCodec for codecs which have to keep state in order to
				93	make encoding/decoding efficient.
				94
				95	The decoder must be able to handle zero length input and
				96	return an empty object of the output object type in this
				97	situation.
				98
				99	"""
				100	raise NotImplementedError
				101
				102	#
				103	# The StreamWriter and StreamReader classes provide generic working
				104	# interfaces which can be used to implement new encoding submodules
				105	# very easily. See encodings/utf_8.py for an example of how this is
				106	# done.
				107	#
				108
				109	class StreamWriter(Codec):
				110
				111	def __init__(self,stream,errors='strict'):
				112
				113	""" Creates a StreamWriter instance.
				114
				115	stream must be a file-like object open for writing
				116	(binary) data.
				117
				118	The StreamWriter may implement different error handling
				119	schemes by providing the errors keyword argument. These
				120	parameters are defined:
				121
				122	'strict' - raise a ValueError (or a subclass)
				123	'ignore' - ignore the character and continue with the next
				124	'replace'- replace with a suitable replacement character
				125
				126	"""
				127	self.stream = stream
				128	self.errors = errors
				129
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	130	def write(self, object):
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	131
				132	""" Writes the object's contents encoded to self.stream.
				133	"""
				134	data, consumed = self.encode(object,self.errors)
				135	self.stream.write(data)
				136
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	137	def writelines(self, list):
				138
				139	""" Writes the concatenated list of strings to the stream
				140	using .write().
				141	"""
				142	self.write(''.join(list))
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	143
				144	def reset(self):
				145
				146	""" Flushes and resets the codec buffers used for keeping state.
				147
				148	Calling this method should ensure that the data on the
				149	output is put into a clean state, that allows appending
				150	of new fresh data without having to rescan the whole
				151	stream to recover state.
				152
				153	"""
				154	pass
				155
				156	def __getattr__(self,name,
				157
				158	getattr=getattr):
				159
				160	""" Inherit all other methods from the underlying stream.
				161	"""
				162	return getattr(self.stream,name)
				163
				164	###
				165
				166	class StreamReader(Codec):
				167
				168	def __init__(self,stream,errors='strict'):
				169
				170	""" Creates a StreamReader instance.
				171
				172	stream must be a file-like object open for reading
				173	(binary) data.
				174
				175	The StreamReader may implement different error handling
				176	schemes by providing the errors keyword argument. These
				177	parameters are defined:
				178
				179	'strict' - raise a ValueError (or a subclass)
				180	'ignore' - ignore the character and continue with the next
				181	'replace'- replace with a suitable replacement character;
				182
				183	"""
				184	self.stream = stream
				185	self.errors = errors
				186
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	187	def read(self, size=-1):
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	188
				189	""" Decodes data from the stream self.stream and returns the
				190	resulting object.
				191
				192	size indicates the approximate maximum number of bytes to
				193	read from the stream for decoding purposes. The decoder
				194	can modify this setting as appropriate. The default value
				195	-1 indicates to read and decode as much as possible. size
				196	is intended to prevent having to decode huge files in one
				197	step.
				198
				199	The method should use a greedy read strategy meaning that
				200	it should read as much data as is allowed within the
				201	definition of the encoding and the given size, e.g. if
				202	optional encoding endings or state markers are available
				203	on the stream, these should be read too.
				204
				205	"""
				206	# Unsliced reading:
				207	if size < 0:
				208	return self.decode(self.stream.read())[0]
				209
				210	# Sliced reading:
				211	read = self.stream.read
				212	decode = self.decode
				213	data = read(size)
				214	i = 0
				215	while 1:
				216	try:
				217	object, decodedbytes = decode(data)
				218	except ValueError,why:
				219	# This method is slow but should work under pretty much
				220	# all conditions; at most 10 tries are made
				221	i = i + 1
				222	newdata = read(1)
				223	if not newdata or i > 10:
				224	raise
				225	data = data + newdata
				226	else:
				227	return object
				228
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	229	def readline(self, size=None):
				230
				231	""" Read one line from the input stream and return the
				232	decoded data.
				233
				234	Note: Unlike the .readlines() method, line breaking must
				235	be implemented by the underlying stream's .readline()
				236	method -- there is currently no support for line breaking
				237	using the codec decoder due to lack of line buffering.
				238
				239	size, if given, is passed as size argument to the stream's
				240	.readline() method.
				241
				242	"""
				243	if size is None:
				244	line = self.stream.readline()
				245	else:
				246	line = self.stream.readline(size)
				247	return self.decode(line)[0]
				248
				249
				250	def readlines(self, sizehint=0):
				251
				252	""" Read all lines available on the input stream
				253	and return them as list of lines.
				254
				255	Line breaks are implemented using the codec's decoder
				256	method and are included in the list entries.
				257
				258	sizehint, if given, is passed as size argument to the
				259	stream's .read() method.
				260
				261	"""
				262	if sizehint is None:
				263	data = self.stream.read()
				264	else:
				265	data = self.stream.read(sizehint)
				266	return self.decode(data)[0].splitlines(1)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	267
				268	def reset(self):
				269
				270	""" Resets the codec buffers used for keeping state.
				271
				272	Note that no stream repositioning should take place.
				273	This method is primarely intended to be able to recover
				274	from decoding errors.
				275
				276	"""
				277	pass
				278
				279	def __getattr__(self,name,
				280
				281	getattr=getattr):
				282
				283	""" Inherit all other methods from the underlying stream.
				284	"""
				285	return getattr(self.stream,name)
				286
				287	###
				288
				289	class StreamReaderWriter:
				290
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	291	# Optional attributes set by the file wrappers below
				292	encoding = 'unknown'
				293
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	294	def __init__(self,stream,Reader,Writer,errors='strict'):
				295
				296	""" Creates a StreamReaderWriter instance.
				297
				298	stream must be a Stream-like object.
				299
				300	Reader, Writer must be factory functions or classes
				301	providing the StreamReader, StreamWriter interface resp.
				302
				303	Error handling is done in the same way as defined for the
				304	StreamWriter/Readers.
				305
				306	"""
				307	self.stream = stream
				308	self.reader = Reader(stream, errors)
				309	self.writer = Writer(stream, errors)
				310	self.errors = errors
				311
				312	def read(self,size=-1):
				313
				314	return self.reader.read(size)
				315
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	316	def readline(size=None):
				317
				318	return self.reader.readline(size)
				319
				320	def readlines(sizehint=None):
				321
				322	return self.reader.readlines(sizehint)
				323
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	324	def write(self,data):
				325
				326	return self.writer.write(data)
				327
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	328	def writelines(self,list):
				329
				330	return self.writer.writelines(list)
				331
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	332	def reset(self):
				333
				334	self.reader.reset()
				335	self.writer.reset()
				336
				337	def __getattr__(self,name,
				338
				339	getattr=getattr):
				340
				341	""" Inherit all other methods from the underlying stream.
				342	"""
				343	return getattr(self.stream,name)
				344
				345	###
				346
				347	class StreamRecoder:
				348
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	349	# Optional attributes set by the file wrappers below
				350	data_encoding = 'unknown'
				351	file_encoding = 'unknown'
				352
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	353	def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
				354
				355	""" Creates a StreamRecoder instance which implements a two-way
				356	conversion: encode and decode work on the frontend (the
				357	input to .read() and output of .write()) while
				358	Reader and Writer work on the backend (reading and
Fred Drake	908670c	2000-03-17 15:42:11 +0000	[diff] [blame]	359	writing to the stream).
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	360
				361	You can use these objects to do transparent direct
				362	recodings from e.g. latin-1 to utf-8 and back.
				363
				364	stream must be a file-like object.
				365
				366	encode, decode must adhere to the Codec interface, Reader,
				367	Writer must be factory functions or classes providing the
				368	StreamReader, StreamWriter interface resp.
				369
				370	encode and decode are needed for the frontend translation,
				371	Reader and Writer for the backend translation. Unicode is
				372	used as intermediate encoding.
				373
				374	Error handling is done in the same way as defined for the
				375	StreamWriter/Readers.
				376
				377	"""
				378	self.stream = stream
				379	self.encode = encode
				380	self.decode = decode
				381	self.reader = Reader(stream, errors)
				382	self.writer = Writer(stream, errors)
				383	self.errors = errors
				384
				385	def read(self,size=-1):
				386
				387	data = self.reader.read(size)
				388	data, bytesencoded = self.encode(data, self.errors)
				389	return data
				390
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	391	def readline(self,size=None):
				392
				393	if size is None:
				394	data = self.reader.readline()
				395	else:
				396	data = self.reader.readline(size)
				397	data, bytesencoded = self.encode(data, self.errors)
				398	return data
				399
				400	def readlines(self,sizehint=None):
				401
				402	if sizehint is None:
				403	data = self.reader.read()
				404	else:
				405	data = self.reader.read(sizehint)
				406	data, bytesencoded = self.encode(data, self.errors)
				407	return data.splitlines(1)
				408
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	409	def write(self,data):
				410
				411	data, bytesdecoded = self.decode(data, self.errors)
				412	return self.writer.write(data)
				413
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	414	def writelines(self,list):
				415
				416	data = ''.join(list)
				417	data, bytesdecoded = self.decode(data, self.errors)
				418	return self.writer.write(data)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	419
				420	def reset(self):
				421
				422	self.reader.reset()
				423	self.writer.reset()
				424
				425	def __getattr__(self,name,
				426
				427	getattr=getattr):
				428
				429	""" Inherit all other methods from the underlying stream.
				430	"""
				431	return getattr(self.stream,name)
				432
				433	### Shortcuts
				434
				435	def open(filename, mode, encoding=None, errors='strict', buffering=1):
				436
				437	""" Open an encoded file using the given mode and return
				438	a wrapped version providing transparent encoding/decoding.
				439
				440	Note: The wrapped version will only accept the object format
				441	defined by the codecs, i.e. Unicode objects for most builtin
				442	codecs. Output is also codec dependent and will usually by
				443	Unicode as well.
				444
				445	encoding specifies the encoding which is to be used for the
				446	the file.
				447
				448	errors may be given to define the error handling. It defaults
				449	to 'strict' which causes ValueErrors to be raised in case an
				450	encoding error occurs.
				451
				452	buffering has the same meaning as for the builtin open() API.
				453	It defaults to line buffered.
				454
				455	"""
				456	if encoding is not None and \
				457	'b' not in mode:
				458	# Force opening of the file in binary mode
				459	mode = mode + 'b'
				460	file = __builtin__.open(filename, mode, buffering)
				461	if encoding is None:
				462	return file
				463	(e,d,sr,sw) = lookup(encoding)
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	464	srw = StreamReaderWriter(file, sr, sw, errors)
				465	# Add attributes to simplify introspection
				466	srw.encoding = encoding
				467	return srw
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	468
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which provides transparent
        encoding translation and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to data_encoding and then written to the original file as
        strings using file_encoding.  The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and are
        passed back to the caller as strings using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The two encoding names are stored on the returned object as
        attributes .data_encoding and .file_encoding resp.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The frontend uses the stateless functions of the data codec,
    # the backend the stream classes of the file codec.
    data_codec = lookup(data_encoding)
    encode, decode = data_codec[:2]
    file_codec = lookup(file_encoding)
    Reader, Writer = file_codec[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	503
				504	### Tests
				505
				506	if __name__ == '__main__':
				507
				508	import sys
				509
Guido van Rossum	a327713	2000-04-11 15:37:43 +0000	[diff] [blame^]	510	# Make stdout translate Latin-1 output into UTF-8 output
				511	sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
				512
				513	# Have stdin translate Latin-1 input into UTF-8 input
				514	sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')