| """ codecs -- Python Codec Registry, API and helpers. |
| |
| |
| Written by Marc-Andre Lemburg (mal@lemburg.com). |
| |
| (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
| |
| """#" |
| |
| import struct,types,__builtin__ |
| |
| ### Registry and builtin stateless codec functions |
| |
| try: |
| from _codecs import * |
| except ImportError,why: |
| raise SystemError,\ |
| 'Failed to load the builtin codecs: %s' % why |
| |
| ### Constants |
| |
#
# Byte Order Mark (BOM): the value 0xFEFF packed as an unsigned 16-bit
# integer in the byte order of the machine running this code
#
BOM = struct.pack('=H', 0xFEFF)

#
# The same mark with an explicit byte order (BOM_BE, BOM_LE)
#

# Big endian: bytes FE FF, i.e. Unicode U+FEFF in UTF-16
# == ZERO WIDTH NO-BREAK SPACE
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Little endian: bytes FF FE, i.e. U+FEFF stored low byte first.
# Read with the wrong (big endian) byte order these bytes give U+FFFE,
# which is defined as being an illegal Unicode character.
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# 64-bit Byte Order Marks
#
# NOTE: despite their names these are the four byte (UCS-4) spellings
# of U+0000FEFF.
#

# Big endian: bytes 00 00 FE FF
BOM64_BE = '\000\000\376\377'

# Little endian: bytes FF FE 00 00
BOM64_LE = '\377\376\000\000'
| |
| |
| ### Codec base classes (defining the API) |
| |
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme to apply. The following
        string values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state on the Codec
            instance. Codecs which need state to encode efficiently
            should keep it in a StreamWriter instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation only defines the interface and
            raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decodes the object input.

            Returns a tuple (output object, length consumed).

            input must be an object which provides the
            bf_getreadbuf buffer slot; Python strings, buffer
            objects and memory mapped files are examples of such
            objects.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state on the Codec
            instance. Codecs which need state to decode efficiently
            should keep it in a StreamReader instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation only defines the interface and
            raises NotImplementedError.

        """
        raise NotImplementedError()
| |
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
| |
| class StreamWriter(Codec): |
| |
| def __init__(self,stream,errors='strict'): |
| |
| """ Creates a StreamWriter instance. |
| |
| stream must be a file-like object open for writing |
| (binary) data. |
| |
| The StreamWriter may implement different error handling |
| schemes by providing the errors keyword argument. These |
| parameters are defined: |
| |
| 'strict' - raise a ValueError (or a subclass) |
| 'ignore' - ignore the character and continue with the next |
| 'replace'- replace with a suitable replacement character |
| |
| """ |
| self.stream = stream |
| self.errors = errors |
| |
| def write(self,object): |
| |
| """ Writes the object's contents encoded to self.stream. |
| """ |
| data, consumed = self.encode(object,self.errors) |
| self.stream.write(data) |
| |
| # XXX .writelines() ? |
| |
| def reset(self): |
| |
| """ Flushes and resets the codec buffers used for keeping state. |
| |
| Calling this method should ensure that the data on the |
| output is put into a clean state, that allows appending |
| of new fresh data without having to rescan the whole |
| stream to recover state. |
| |
| """ |
| pass |
| |
| def __getattr__(self,name, |
| |
| getattr=getattr): |
| |
| """ Inherit all other methods from the underlying stream. |
| """ |
| return getattr(self.stream,name) |
| |
| ### |
| |
| class StreamReader(Codec): |
| |
| def __init__(self,stream,errors='strict'): |
| |
| """ Creates a StreamReader instance. |
| |
| stream must be a file-like object open for reading |
| (binary) data. |
| |
| The StreamReader may implement different error handling |
| schemes by providing the errors keyword argument. These |
| parameters are defined: |
| |
| 'strict' - raise a ValueError (or a subclass) |
| 'ignore' - ignore the character and continue with the next |
| 'replace'- replace with a suitable replacement character; |
| |
| """ |
| self.stream = stream |
| self.errors = errors |
| |
| def read(self,size=-1): |
| |
| """ Decodes data from the stream self.stream and returns the |
| resulting object. |
| |
| size indicates the approximate maximum number of bytes to |
| read from the stream for decoding purposes. The decoder |
| can modify this setting as appropriate. The default value |
| -1 indicates to read and decode as much as possible. size |
| is intended to prevent having to decode huge files in one |
| step. |
| |
| The method should use a greedy read strategy meaning that |
| it should read as much data as is allowed within the |
| definition of the encoding and the given size, e.g. if |
| optional encoding endings or state markers are available |
| on the stream, these should be read too. |
| |
| """ |
| # Unsliced reading: |
| if size < 0: |
| return self.decode(self.stream.read())[0] |
| |
| # Sliced reading: |
| read = self.stream.read |
| decode = self.decode |
| data = read(size) |
| i = 0 |
| while 1: |
| try: |
| object, decodedbytes = decode(data) |
| except ValueError,why: |
| # This method is slow but should work under pretty much |
| # all conditions; at most 10 tries are made |
| i = i + 1 |
| newdata = read(1) |
| if not newdata or i > 10: |
| raise |
| data = data + newdata |
| else: |
| return object |
| |
| # XXX .readline() and .readlines() (these are hard to implement |
| # without using buffers for keeping read-ahead data) |
| |
| def reset(self): |
| |
| """ Resets the codec buffers used for keeping state. |
| |
| Note that no stream repositioning should take place. |
| This method is primarely intended to be able to recover |
| from decoding errors. |
| |
| """ |
| pass |
| |
| def __getattr__(self,name, |
| |
| getattr=getattr): |
| |
| """ Inherit all other methods from the underlying stream. |
| """ |
| return getattr(self.stream,name) |
| |
| ### |
| |
class StreamReaderWriter:

    """ Combines a StreamReader and a StreamWriter working on the same
        stream into a single object usable for reading and writing.

    """
    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Both are called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Reads via the reader; see StreamReader.read() for size.
        """
        return self.reader.read(size)

    def write(self, data):

        """ Writes data via the writer and returns its result.
        """
        return self.writer.write(data)

    def writelines(self, lines):

        """ Writes every object in the sequence lines via the writer.

            Without this method the call would be forwarded by
            __getattr__ to self.stream.writelines() and the data
            would reach the stream without being encoded.

        """
        write = self.writer.write
        for line in lines:
            write(line)

    def reset(self):

        """ Resets both the reader and the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # getattr is bound as a default argument so that the lookup
        # uses a local name.
        return getattr(self.stream, name)
| |
| ### |
| |
class StreamRecoder:

    """ Translates between a frontend encoding seen by the caller and
        a backend encoding used on the wrapped stream.

    """
    def __init__(self, stream, encode, decode, Reader, Writer, errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Reads via the backend reader and returns the data encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def write(self, data):

        """ Decodes data with the frontend decode function and writes
            the result via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, lines):

        """ Writes every object in the sequence lines via .write().

            Without this method the call would be forwarded by
            __getattr__ to self.stream.writelines() and the data
            would reach the stream without being recoded.

        """
        for line in lines:
            self.write(line)

    # .readline() and .readlines() are not implemented; they are
    # forwarded to the stream by __getattr__ and return data which
    # has not been recoded.

    def reset(self):

        """ Resets both the backend reader and the backend writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # getattr is bound as a default argument so that the lookup
        # uses a local name.
        return getattr(self.stream, name)
| |
| ### Shortcuts |
| |
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file is opened with the given mode
        as is and the plain file object is returned. Otherwise the
        file is always opened in binary mode.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The codec is looked up before the file is opened, so an
        unknown encoding is reported without creating or truncating
        the file.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first: a failing lookup must not leave an
    # opened (and, for write modes, already truncated) file behind.
    Reader, Writer = lookup(encoding)[2:]

    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    try:
        return StreamReaderWriter(stream, Reader, Writer, errors)
    except:
        # Cleanup only: whatever went wrong while wrapping the file,
        # close it and pass the error on unchanged.
        stream.close()
        raise
| |
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file so that encoding translation happens transparently.

        Strings written to the wrapper are taken to be in the input
        encoding and end up in the original file as strings in the
        output encoding. The intermediate encoding will usually be
        Unicode, but that depends on the codecs involved.

        output defaults to input if it is not given.

        errors selects the error handling; the default 'strict'
        causes ValueErrors to be raised when an encoding error
        occurs.

        Returns a StreamRecoder instance.

    """
    backend_name = output
    if backend_name is None:
        backend_name = input

    # Frontend: the stateless functions of the input codec;
    # backend: the stream classes of the output codec.
    frontend_codec = lookup(input)
    backend_codec = lookup(backend_name)
    encode, decode = frontend_codec[:2]
    Reader, Writer = backend_codec[2:]

    return StreamRecoder(file, encode, decode, Reader, Writer, errors)
| |
| ### Tests |
| |
if __name__ == '__main__':

    import sys

    # Replace stdout by a recoding wrapper: Latin-1 on the frontend,
    # Unicode-Escape on the stream itself
    recoded_stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')
    sys.stdout = recoded_stdout