Blame - Lib/codecs.py - platform/external/python/cpython3

blob: 5c669c07a5d472cf0b20ccc987120d769c3bf825 [file] [log] [blame]

Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	1	""" codecs -- Python Codec Registry, API and helpers.
				2
				3
				4	Written by Marc-Andre Lemburg (mal@lemburg.com).
				5
				6	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				7
				8	"""#"
				9
				10	import struct,types,__builtin__
				11
				12	### Registry and builtin stateless codec functions
				13
Guido van Rossum	b95de4f	2000-03-31 17:25:23 +0000	[diff] [blame]	14	try:
				15	from _codecs import *
				16	except ImportError,why:
				17	raise SystemError,\
				18	'Failed to load the builtin codecs: %s' % why
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	19
				20	### Constants
				21
#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is U+FEFF packed as an unsigned 16-bit integer in the byte order of
# the machine importing this module ('=' selects native byte order with
# standard size), so it compares equal to either BOM_BE or BOM_LE.
BOM = struct.pack('=H',0xFEFF)
#
# NOTE: BOM32_BE/BOM32_LE are aliases of the two-byte (16-bit) marks; the
# "32" in the name does not describe the length of the value.
BOM_BE = BOM32_BE = '\376\377'
#       corresponds to Unicode U+FEFF in UTF-16 on big endian
#       platforms == ZERO WIDTH NO-BREAK SPACE
BOM_LE = BOM32_LE = '\377\376'
#       corresponds to Unicode U+FFFE in UTF-16 on little endian
#       platforms == defined as being an illegal Unicode character

#
# 64-bit Byte Order Marks
#
# NOTE: despite the BOM64_* names, both values are four bytes (32 bits)
# long, matching the UCS-4 code unit size mentioned below.
BOM64_BE = '\000\000\376\377'
#       corresponds to Unicode U+0000FEFF in UCS-4
BOM64_LE = '\377\376\000\000'
#       corresponds to Unicode U+0000FFFE in UCS-4
				41
				42
				43	### Codec base classes (defining the API)
				44
				45	class Codec:
				46
				47	""" Defines the interface for stateless encoders/decoders.
				48
				49	The .encode()/.decode() methods may implement different error
				50	handling schemes by providing the errors argument. These
				51	string values are defined:
				52
Guido van Rossum	d8855fd	2000-03-24 22:14:19 +0000	[diff] [blame]	53	'strict' - raise a ValueError error (or a subclass)
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	54	'ignore' - ignore the character and continue with the next
				55	'replace' - replace with a suitable replacement character;
				56	Python will use the official U+FFFD REPLACEMENT
				57	CHARACTER for the builtin Unicode codecs.
				58
				59	"""
				60	def encode(self,input,errors='strict'):
				61
Fred Drake	3e74c0d	2000-03-17 15:40:35 +0000	[diff] [blame]	62	""" Encodes the object input and returns a tuple (output
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	63	object, length consumed).
				64
				65	errors defines the error handling to apply. It defaults to
				66	'strict' handling.
				67
				68	The method may not store state in the Codec instance. Use
				69	StreamCodec for codecs which have to keep state in order to
				70	make encoding/decoding efficient.
				71
				72	The encoder must be able to handle zero length input and
				73	return an empty object of the output object type in this
				74	situation.
				75
				76	"""
				77	raise NotImplementedError
				78
				79	def decode(self,input,errors='strict'):
				80
				81	""" Decodes the object input and returns a tuple (output
				82	object, length consumed).
				83
				84	input must be an object which provides the bf_getreadbuf
				85	buffer slot. Python strings, buffer objects and memory
				86	mapped files are examples of objects providing this slot.
				87
				88	errors defines the error handling to apply. It defaults to
				89	'strict' handling.
				90
				91	The method may not store state in the Codec instance. Use
				92	StreamCodec for codecs which have to keep state in order to
				93	make encoding/decoding efficient.
				94
				95	The decoder must be able to handle zero length input and
				96	return an empty object of the output object type in this
				97	situation.
				98
				99	"""
				100	raise NotImplementedError
				101
				102	#
				103	# The StreamWriter and StreamReader classes provide generic working
				104	# interfaces which can be used to implement new encoding submodules
				105	# very easily. See encodings/utf_8.py for an example of how this is
				106	# done.
				107	#
				108
				109	class StreamWriter(Codec):
				110
				111	def __init__(self,stream,errors='strict'):
				112
				113	""" Creates a StreamWriter instance.
				114
				115	stream must be a file-like object open for writing
				116	(binary) data.
				117
				118	The StreamWriter may implement different error handling
				119	schemes by providing the errors keyword argument. These
				120	parameters are defined:
				121
				122	'strict' - raise a ValueError (or a subclass)
				123	'ignore' - ignore the character and continue with the next
				124	'replace'- replace with a suitable replacement character
				125
				126	"""
				127	self.stream = stream
				128	self.errors = errors
				129
				130	def write(self,object):
				131
				132	""" Writes the object's contents encoded to self.stream.
				133	"""
				134	data, consumed = self.encode(object,self.errors)
				135	self.stream.write(data)
				136
				137	# XXX .writelines() ?
				138
				139	def reset(self):
				140
				141	""" Flushes and resets the codec buffers used for keeping state.
				142
				143	Calling this method should ensure that the data on the
				144	output is put into a clean state, that allows appending
				145	of new fresh data without having to rescan the whole
				146	stream to recover state.
				147
				148	"""
				149	pass
				150
				151	def __getattr__(self,name,
				152
				153	getattr=getattr):
				154
				155	""" Inherit all other methods from the underlying stream.
				156	"""
				157	return getattr(self.stream,name)
				158
				159	###
				160
				161	class StreamReader(Codec):
				162
				163	def __init__(self,stream,errors='strict'):
				164
				165	""" Creates a StreamReader instance.
				166
				167	stream must be a file-like object open for reading
				168	(binary) data.
				169
				170	The StreamReader may implement different error handling
				171	schemes by providing the errors keyword argument. These
				172	parameters are defined:
				173
				174	'strict' - raise a ValueError (or a subclass)
				175	'ignore' - ignore the character and continue with the next
				176	'replace'- replace with a suitable replacement character;
				177
				178	"""
				179	self.stream = stream
				180	self.errors = errors
				181
				182	def read(self,size=-1):
				183
				184	""" Decodes data from the stream self.stream and returns the
				185	resulting object.
				186
				187	size indicates the approximate maximum number of bytes to
				188	read from the stream for decoding purposes. The decoder
				189	can modify this setting as appropriate. The default value
				190	-1 indicates to read and decode as much as possible. size
				191	is intended to prevent having to decode huge files in one
				192	step.
				193
				194	The method should use a greedy read strategy meaning that
				195	it should read as much data as is allowed within the
				196	definition of the encoding and the given size, e.g. if
				197	optional encoding endings or state markers are available
				198	on the stream, these should be read too.
				199
				200	"""
				201	# Unsliced reading:
				202	if size < 0:
				203	return self.decode(self.stream.read())[0]
				204
				205	# Sliced reading:
				206	read = self.stream.read
				207	decode = self.decode
				208	data = read(size)
				209	i = 0
				210	while 1:
				211	try:
				212	object, decodedbytes = decode(data)
				213	except ValueError,why:
				214	# This method is slow but should work under pretty much
				215	# all conditions; at most 10 tries are made
				216	i = i + 1
				217	newdata = read(1)
				218	if not newdata or i > 10:
				219	raise
				220	data = data + newdata
				221	else:
				222	return object
				223
				224	# XXX .readline() and .readlines() (these are hard to implement
				225	# without using buffers for keeping read-ahead data)
				226
				227	def reset(self):
				228
				229	""" Resets the codec buffers used for keeping state.
				230
				231	Note that no stream repositioning should take place.
				232	This method is primarely intended to be able to recover
				233	from decoding errors.
				234
				235	"""
				236	pass
				237
				238	def __getattr__(self,name,
				239
				240	getattr=getattr):
				241
				242	""" Inherit all other methods from the underlying stream.
				243	"""
				244	return getattr(self.stream,name)
				245
				246	###
				247
				248	class StreamReaderWriter:
				249
				250	def __init__(self,stream,Reader,Writer,errors='strict'):
				251
				252	""" Creates a StreamReaderWriter instance.
				253
				254	stream must be a Stream-like object.
				255
				256	Reader, Writer must be factory functions or classes
				257	providing the StreamReader, StreamWriter interface resp.
				258
				259	Error handling is done in the same way as defined for the
				260	StreamWriter/Readers.
				261
				262	"""
				263	self.stream = stream
				264	self.reader = Reader(stream, errors)
				265	self.writer = Writer(stream, errors)
				266	self.errors = errors
				267
				268	def read(self,size=-1):
				269
				270	return self.reader.read(size)
				271
				272	def write(self,data):
				273
				274	return self.writer.write(data)
				275
				276	def reset(self):
				277
				278	self.reader.reset()
				279	self.writer.reset()
				280
				281	def __getattr__(self,name,
				282
				283	getattr=getattr):
				284
				285	""" Inherit all other methods from the underlying stream.
				286	"""
				287	return getattr(self.stream,name)
				288
				289	###
				290
				291	class StreamRecoder:
				292
				293	def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
				294
				295	""" Creates a StreamRecoder instance which implements a two-way
				296	conversion: encode and decode work on the frontend (the
				297	input to .read() and output of .write()) while
				298	Reader and Writer work on the backend (reading and
Fred Drake	908670c	2000-03-17 15:42:11 +0000	[diff] [blame]	299	writing to the stream).
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	300
				301	You can use these objects to do transparent direct
				302	recodings from e.g. latin-1 to utf-8 and back.
				303
				304	stream must be a file-like object.
				305
				306	encode, decode must adhere to the Codec interface, Reader,
				307	Writer must be factory functions or classes providing the
				308	StreamReader, StreamWriter interface resp.
				309
				310	encode and decode are needed for the frontend translation,
				311	Reader and Writer for the backend translation. Unicode is
				312	used as intermediate encoding.
				313
				314	Error handling is done in the same way as defined for the
				315	StreamWriter/Readers.
				316
				317	"""
				318	self.stream = stream
				319	self.encode = encode
				320	self.decode = decode
				321	self.reader = Reader(stream, errors)
				322	self.writer = Writer(stream, errors)
				323	self.errors = errors
				324
				325	def read(self,size=-1):
				326
				327	data = self.reader.read(size)
				328	data, bytesencoded = self.encode(data, self.errors)
				329	return data
				330
				331	def write(self,data):
				332
				333	data, bytesdecoded = self.decode(data, self.errors)
				334	return self.writer.write(data)
				335
				336	# .writelines(), .readline() and .readlines() ... see notes
				337	# above.
				338
				339	def reset(self):
				340
				341	self.reader.reset()
				342	self.writer.reset()
				343
				344	def __getattr__(self,name,
				345
				346	getattr=getattr):
				347
				348	""" Inherit all other methods from the underlying stream.
				349	"""
				350	return getattr(self.stream,name)
				351
				352	### Shortcuts
				353
				354	def open(filename, mode, encoding=None, errors='strict', buffering=1):
				355
				356	""" Open an encoded file using the given mode and return
				357	a wrapped version providing transparent encoding/decoding.
				358
				359	Note: The wrapped version will only accept the object format
				360	defined by the codecs, i.e. Unicode objects for most builtin
				361	codecs. Output is also codec dependent and will usually by
				362	Unicode as well.
				363
				364	encoding specifies the encoding which is to be used for the
				365	the file.
				366
				367	errors may be given to define the error handling. It defaults
				368	to 'strict' which causes ValueErrors to be raised in case an
				369	encoding error occurs.
				370
				371	buffering has the same meaning as for the builtin open() API.
				372	It defaults to line buffered.
				373
				374	"""
				375	if encoding is not None and \
				376	'b' not in mode:
				377	# Force opening of the file in binary mode
				378	mode = mode + 'b'
				379	file = __builtin__.open(filename, mode, buffering)
				380	if encoding is None:
				381	return file
				382	(e,d,sr,sw) = lookup(encoding)
				383	return StreamReaderWriter(file, sr, sw, errors)
				384
				385	def EncodedFile(file, input, output=None, errors='strict'):
				386
				387	""" Return a wrapped version of file which provides transparent
				388	encoding translation.
				389
				390	Strings written to the wrapped file are interpreted according
				391	to the given input encoding and then written to the original
				392	file as string using the output encoding. The intermediate
				393	encoding will usually be Unicode but depends on the specified
				394	codecs.
				395
				396	If output is not given, it defaults to input.
				397
				398	errors may be given to define the error handling. It defaults
				399	to 'strict' which causes ValueErrors to be raised in case an
				400	encoding error occurs.
				401
				402	"""
				403	if output is None:
				404	output = input
				405	encode, decode = lookup(input)[:2]
				406	Reader, Writer = lookup(output)[2:]
				407	return StreamRecoder(file,
				408	encode,decode,Reader,Writer,
				409	errors)
				410
				411	### Tests
				412
if __name__ == '__main__':

    # Minimal smoke test; it only runs when the module is executed as a
    # script, not when it is imported.

    import sys

    # Make stdout translate Latin-1 into Unicode-Escape
    # (sys.stdout is rebound to the object returned by EncodedFile, so
    # everything printed afterwards passes through that wrapper).
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')