Blame - Lib/codecs.py - platform/external/python/cpython2

blob: 7f478d7191d2c113c653ece304b57821ec6761d1 [file] [log] [blame]

Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	1	""" codecs -- Python Codec Registry, API and helpers.
				2
				3
				4	Written by Marc-Andre Lemburg (mal@lemburg.com).
				5
				6	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				7
				8	"""#"
				9
				10	import struct,types,__builtin__
				11
				12	### Registry and builtin stateless codec functions
				13
				14	from _codecs import *
				15
				16	### Constants
				17
				18	#
				19	# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
				20	#
				21	BOM = struct.pack('=H',0xFEFF)
				22	#
				23	BOM_BE = BOM32_BE = '\376\377'
				24	# corresponds to Unicode U+FEFF in UTF-16 on big endian
				25	# platforms == ZERO WIDTH NO-BREAK SPACE
				26	BOM_LE = BOM32_LE = '\377\376'
				27	# corresponds to Unicode U+FFFE in UTF-16 on little endian
				28	# platforms == defined as being an illegal Unicode character
				29
				30	#
				31	# 64-bit Byte Order Marks
				32	#
				33	BOM64_BE = '\000\000\376\377'
				34	# corresponds to Unicode U+0000FEFF in UCS-4
				35	BOM64_LE = '\377\376\000\000'
				36	# corresponds to Unicode U+0000FFFE in UCS-4
				37
				38
				39	### Codec base classes (defining the API)
				40
				41	class Codec:
				42
				43	""" Defines the interface for stateless encoders/decoders.
				44
				45	The .encode()/.decode() methods may implement different error
				46	handling schemes by providing the errors argument. These
				47	string values are defined:
				48
				49	'strict' - raise an error (or a subclass)
				50	'ignore' - ignore the character and continue with the next
				51	'replace' - replace with a suitable replacement character;
				52	Python will use the official U+FFFD REPLACEMENT
				53	CHARACTER for the builtin Unicode codecs.
				54
				55	"""
				56	def encode(self,input,errors='strict'):
				57
Fred Drake	3e74c0d	2000-03-17 15:40:35 +0000	[diff] [blame]	58	""" Encodes the object input and returns a tuple (output
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	59	object, length consumed).
				60
				61	errors defines the error handling to apply. It defaults to
				62	'strict' handling.
				63
				64	The method may not store state in the Codec instance. Use
				65	StreamCodec for codecs which have to keep state in order to
				66	make encoding/decoding efficient.
				67
				68	The encoder must be able to handle zero length input and
				69	return an empty object of the output object type in this
				70	situation.
				71
				72	"""
				73	raise NotImplementedError
				74
				75	def decode(self,input,errors='strict'):
				76
				77	""" Decodes the object input and returns a tuple (output
				78	object, length consumed).
				79
				80	input must be an object which provides the bf_getreadbuf
				81	buffer slot. Python strings, buffer objects and memory
				82	mapped files are examples of objects providing this slot.
				83
				84	errors defines the error handling to apply. It defaults to
				85	'strict' handling.
				86
				87	The method may not store state in the Codec instance. Use
				88	StreamCodec for codecs which have to keep state in order to
				89	make encoding/decoding efficient.
				90
				91	The decoder must be able to handle zero length input and
				92	return an empty object of the output object type in this
				93	situation.
				94
				95	"""
				96	raise NotImplementedError
				97
				98	#
				99	# The StreamWriter and StreamReader classes provide generic working
				100	# interfaces which can be used to implement new encoding submodules
				101	# very easily. See encodings/utf_8.py for an example of how this is
				102	# done.
				103	#
				104
				105	class StreamWriter(Codec):
				106
				107	def __init__(self,stream,errors='strict'):
				108
				109	""" Creates a StreamWriter instance.
				110
				111	stream must be a file-like object open for writing
				112	(binary) data.
				113
				114	The StreamWriter may implement different error handling
				115	schemes by providing the errors keyword argument. These
				116	parameters are defined:
				117
				118	'strict' - raise a ValueError (or a subclass)
				119	'ignore' - ignore the character and continue with the next
				120	'replace'- replace with a suitable replacement character
				121
				122	"""
				123	self.stream = stream
				124	self.errors = errors
				125
				126	def write(self,object):
				127
				128	""" Writes the object's contents encoded to self.stream.
				129	"""
				130	data, consumed = self.encode(object,self.errors)
				131	self.stream.write(data)
				132
				133	# XXX .writelines() ?
				134
				135	def reset(self):
				136
				137	""" Flushes and resets the codec buffers used for keeping state.
				138
				139	Calling this method should ensure that the data on the
				140	output is put into a clean state, that allows appending
				141	of new fresh data without having to rescan the whole
				142	stream to recover state.
				143
				144	"""
				145	pass
				146
				147	def __getattr__(self,name,
				148
				149	getattr=getattr):
				150
				151	""" Inherit all other methods from the underlying stream.
				152	"""
				153	return getattr(self.stream,name)
				154
				155	###
				156
				157	class StreamReader(Codec):
				158
				159	def __init__(self,stream,errors='strict'):
				160
				161	""" Creates a StreamReader instance.
				162
				163	stream must be a file-like object open for reading
				164	(binary) data.
				165
				166	The StreamReader may implement different error handling
				167	schemes by providing the errors keyword argument. These
				168	parameters are defined:
				169
				170	'strict' - raise a ValueError (or a subclass)
				171	'ignore' - ignore the character and continue with the next
				172	'replace'- replace with a suitable replacement character;
				173
				174	"""
				175	self.stream = stream
				176	self.errors = errors
				177
				178	def read(self,size=-1):
				179
				180	""" Decodes data from the stream self.stream and returns the
				181	resulting object.
				182
				183	size indicates the approximate maximum number of bytes to
				184	read from the stream for decoding purposes. The decoder
				185	can modify this setting as appropriate. The default value
				186	-1 indicates to read and decode as much as possible. size
				187	is intended to prevent having to decode huge files in one
				188	step.
				189
				190	The method should use a greedy read strategy meaning that
				191	it should read as much data as is allowed within the
				192	definition of the encoding and the given size, e.g. if
				193	optional encoding endings or state markers are available
				194	on the stream, these should be read too.
				195
				196	"""
				197	# Unsliced reading:
				198	if size < 0:
				199	return self.decode(self.stream.read())[0]
				200
				201	# Sliced reading:
				202	read = self.stream.read
				203	decode = self.decode
				204	data = read(size)
				205	i = 0
				206	while 1:
				207	try:
				208	object, decodedbytes = decode(data)
				209	except ValueError,why:
				210	# This method is slow but should work under pretty much
				211	# all conditions; at most 10 tries are made
				212	i = i + 1
				213	newdata = read(1)
				214	if not newdata or i > 10:
				215	raise
				216	data = data + newdata
				217	else:
				218	return object
				219
				220	# XXX .readline() and .readlines() (these are hard to implement
				221	# without using buffers for keeping read-ahead data)
				222
				223	def reset(self):
				224
				225	""" Resets the codec buffers used for keeping state.
				226
				227	Note that no stream repositioning should take place.
				228	This method is primarely intended to be able to recover
				229	from decoding errors.
				230
				231	"""
				232	pass
				233
				234	def __getattr__(self,name,
				235
				236	getattr=getattr):
				237
				238	""" Inherit all other methods from the underlying stream.
				239	"""
				240	return getattr(self.stream,name)
				241
				242	###
				243
				244	class StreamReaderWriter:
				245
				246	def __init__(self,stream,Reader,Writer,errors='strict'):
				247
				248	""" Creates a StreamReaderWriter instance.
				249
				250	stream must be a Stream-like object.
				251
				252	Reader, Writer must be factory functions or classes
				253	providing the StreamReader, StreamWriter interface resp.
				254
				255	Error handling is done in the same way as defined for the
				256	StreamWriter/Readers.
				257
				258	"""
				259	self.stream = stream
				260	self.reader = Reader(stream, errors)
				261	self.writer = Writer(stream, errors)
				262	self.errors = errors
				263
				264	def read(self,size=-1):
				265
				266	return self.reader.read(size)
				267
				268	def write(self,data):
				269
				270	return self.writer.write(data)
				271
				272	def reset(self):
				273
				274	self.reader.reset()
				275	self.writer.reset()
				276
				277	def __getattr__(self,name,
				278
				279	getattr=getattr):
				280
				281	""" Inherit all other methods from the underlying stream.
				282	"""
				283	return getattr(self.stream,name)
				284
				285	###
				286
				287	class StreamRecoder:
				288
				289	def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
				290
				291	""" Creates a StreamRecoder instance which implements a two-way
				292	conversion: encode and decode work on the frontend (the
				293	input to .read() and output of .write()) while
				294	Reader and Writer work on the backend (reading and
Fred Drake	908670c	2000-03-17 15:42:11 +0000	[diff] [blame^]	295	writing to the stream).
Guido van Rossum	0612d84	2000-03-10 23:20:43 +0000	[diff] [blame]	296
				297	You can use these objects to do transparent direct
				298	recodings from e.g. latin-1 to utf-8 and back.
				299
				300	stream must be a file-like object.
				301
				302	encode, decode must adhere to the Codec interface, Reader,
				303	Writer must be factory functions or classes providing the
				304	StreamReader, StreamWriter interface resp.
				305
				306	encode and decode are needed for the frontend translation,
				307	Reader and Writer for the backend translation. Unicode is
				308	used as intermediate encoding.
				309
				310	Error handling is done in the same way as defined for the
				311	StreamWriter/Readers.
				312
				313	"""
				314	self.stream = stream
				315	self.encode = encode
				316	self.decode = decode
				317	self.reader = Reader(stream, errors)
				318	self.writer = Writer(stream, errors)
				319	self.errors = errors
				320
				321	def read(self,size=-1):
				322
				323	data = self.reader.read(size)
				324	data, bytesencoded = self.encode(data, self.errors)
				325	return data
				326
				327	def write(self,data):
				328
				329	data, bytesdecoded = self.decode(data, self.errors)
				330	return self.writer.write(data)
				331
				332	# .writelines(), .readline() and .readlines() ... see notes
				333	# above.
				334
				335	def reset(self):
				336
				337	self.reader.reset()
				338	self.writer.reset()
				339
				340	def __getattr__(self,name,
				341
				342	getattr=getattr):
				343
				344	""" Inherit all other methods from the underlying stream.
				345	"""
				346	return getattr(self.stream,name)
				347
				348	### Shortcuts
				349
				350	def open(filename, mode, encoding=None, errors='strict', buffering=1):
				351
				352	""" Open an encoded file using the given mode and return
				353	a wrapped version providing transparent encoding/decoding.
				354
				355	Note: The wrapped version will only accept the object format
				356	defined by the codecs, i.e. Unicode objects for most builtin
				357	codecs. Output is also codec dependent and will usually by
				358	Unicode as well.
				359
				360	encoding specifies the encoding which is to be used for the
				361	the file.
				362
				363	errors may be given to define the error handling. It defaults
				364	to 'strict' which causes ValueErrors to be raised in case an
				365	encoding error occurs.
				366
				367	buffering has the same meaning as for the builtin open() API.
				368	It defaults to line buffered.
				369
				370	"""
				371	if encoding is not None and \
				372	'b' not in mode:
				373	# Force opening of the file in binary mode
				374	mode = mode + 'b'
				375	file = __builtin__.open(filename, mode, buffering)
				376	if encoding is None:
				377	return file
				378	(e,d,sr,sw) = lookup(encoding)
				379	return StreamReaderWriter(file, sr, sw, errors)
				380
				381	def EncodedFile(file, input, output=None, errors='strict'):
				382
				383	""" Return a wrapped version of file which provides transparent
				384	encoding translation.
				385
				386	Strings written to the wrapped file are interpreted according
				387	to the given input encoding and then written to the original
				388	file as string using the output encoding. The intermediate
				389	encoding will usually be Unicode but depends on the specified
				390	codecs.
				391
				392	If output is not given, it defaults to input.
				393
				394	errors may be given to define the error handling. It defaults
				395	to 'strict' which causes ValueErrors to be raised in case an
				396	encoding error occurs.
				397
				398	"""
				399	if output is None:
				400	output = input
				401	encode, decode = lookup(input)[:2]
				402	Reader, Writer = lookup(output)[2:]
				403	return StreamRecoder(file,
				404	encode,decode,Reader,Writer,
				405	errors)
				406
				407	### Tests
				408
				409	if __name__ == '__main__':
				410
				411	import sys
				412
				413	# Make stdout translate Latin-1 into Unicode-Escape
				414	sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')