| """ codecs -- Python Codec Registry, API and helpers. | 
 |  | 
 |  | 
 | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
 |  | 
 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
 |  | 
 | """#" | 
 |  | 
 | import struct,types,__builtin__ | 
 |  | 
 | ### Registry and builtin stateless codec functions | 
 |  | 
 | try: | 
 |     from _codecs import * | 
 | except ImportError,why: | 
 |     raise SystemError,\ | 
 |           'Failed to load the builtin codecs: %s' % why | 
 |  | 
 | __all__ = ["register","lookup","open","EncodedFile","BOM","BOM_BE", | 
 |            "BOM_LE","BOM32_BE","BOM32_LE","BOM64_BE","BOM64_LE"] | 
 |  | 
 | ### Constants | 
 |  | 
 | # | 
 | # Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE) | 
 | # | 
 | BOM = struct.pack('=H',0xFEFF) | 
 | # | 
 | BOM_BE = BOM32_BE = '\376\377' | 
 | #       corresponds to Unicode U+FEFF in UTF-16 on big endian | 
 | #       platforms == ZERO WIDTH NO-BREAK SPACE | 
 | BOM_LE = BOM32_LE = '\377\376' | 
 | #       corresponds to Unicode U+FFFE in UTF-16 on little endian | 
 | #       platforms == defined as being an illegal Unicode character | 
 |  | 
 | # | 
 | # 64-bit Byte Order Marks | 
 | # | 
 | BOM64_BE = '\000\000\376\377' | 
 | #       corresponds to Unicode U+0000FEFF in UCS-4 | 
 | BOM64_LE = '\377\376\000\000' | 
 | #       corresponds to Unicode U+0000FFFE in UCS-4 | 
 |  | 
 |  | 
 | ### Codec base classes (defining the API) | 
 |  | 
 | class Codec: | 
 |  | 
 |     """ Defines the interface for stateless encoders/decoders. | 
 |  | 
 |         The .encode()/.decode() methods may implement different error | 
 |         handling schemes by providing the errors argument. These | 
 |         string values are defined: | 
 |  | 
 |          'strict' - raise a ValueError error (or a subclass) | 
 |          'ignore' - ignore the character and continue with the next | 
 |          'replace' - replace with a suitable replacement character; | 
 |                     Python will use the official U+FFFD REPLACEMENT | 
 |                     CHARACTER for the builtin Unicode codecs. | 
 |  | 
 |     """ | 
 |     def encode(self,input,errors='strict'): | 
 |  | 
 |         """ Encodes the object input and returns a tuple (output | 
 |             object, length consumed). | 
 |  | 
 |             errors defines the error handling to apply. It defaults to | 
 |             'strict' handling. | 
 |  | 
 |             The method may not store state in the Codec instance. Use | 
 |             StreamCodec for codecs which have to keep state in order to | 
 |             make encoding/decoding efficient. | 
 |  | 
 |             The encoder must be able to handle zero length input and | 
 |             return an empty object of the output object type in this | 
 |             situation. | 
 |  | 
 |         """ | 
 |         raise NotImplementedError | 
 |  | 
 |     def decode(self,input,errors='strict'): | 
 |  | 
 |         """ Decodes the object input and returns a tuple (output | 
 |             object, length consumed). | 
 |  | 
 |             input must be an object which provides the bf_getreadbuf | 
 |             buffer slot. Python strings, buffer objects and memory | 
 |             mapped files are examples of objects providing this slot. | 
 |  | 
 |             errors defines the error handling to apply. It defaults to | 
 |             'strict' handling. | 
 |  | 
 |             The method may not store state in the Codec instance. Use | 
 |             StreamCodec for codecs which have to keep state in order to | 
 |             make encoding/decoding efficient. | 
 |  | 
 |             The decoder must be able to handle zero length input and | 
 |             return an empty object of the output object type in this | 
 |             situation. | 
 |  | 
 |         """ | 
 |         raise NotImplementedError | 
 |  | 
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
 |  | 
 | class StreamWriter(Codec): | 
 |  | 
 |     def __init__(self,stream,errors='strict'): | 
 |  | 
 |         """ Creates a StreamWriter instance. | 
 |  | 
 |             stream must be a file-like object open for writing | 
 |             (binary) data. | 
 |  | 
 |             The StreamWriter may implement different error handling | 
 |             schemes by providing the errors keyword argument. These | 
 |             parameters are defined: | 
 |  | 
 |              'strict' - raise a ValueError (or a subclass) | 
 |              'ignore' - ignore the character and continue with the next | 
 |              'replace'- replace with a suitable replacement character | 
 |  | 
 |         """ | 
 |         self.stream = stream | 
 |         self.errors = errors | 
 |  | 
 |     def write(self, object): | 
 |  | 
 |         """ Writes the object's contents encoded to self.stream. | 
 |         """ | 
 |         data, consumed = self.encode(object,self.errors) | 
 |         self.stream.write(data) | 
 |  | 
 |     def writelines(self, list): | 
 |  | 
 |         """ Writes the concatenated list of strings to the stream | 
 |             using .write(). | 
 |         """ | 
 |         self.write(''.join(list)) | 
 |  | 
 |     def reset(self): | 
 |  | 
 |         """ Flushes and resets the codec buffers used for keeping state. | 
 |  | 
 |             Calling this method should ensure that the data on the | 
 |             output is put into a clean state, that allows appending | 
 |             of new fresh data without having to rescan the whole | 
 |             stream to recover state. | 
 |  | 
 |         """ | 
 |         pass | 
 |  | 
 |     def __getattr__(self,name, | 
 |  | 
 |                     getattr=getattr): | 
 |  | 
 |         """ Inherit all other methods from the underlying stream. | 
 |         """ | 
 |         return getattr(self.stream,name) | 
 |  | 
 | ### | 
 |  | 
 | class StreamReader(Codec): | 
 |  | 
 |     def __init__(self,stream,errors='strict'): | 
 |  | 
 |         """ Creates a StreamReader instance. | 
 |  | 
 |             stream must be a file-like object open for reading | 
 |             (binary) data. | 
 |  | 
 |             The StreamReader may implement different error handling | 
 |             schemes by providing the errors keyword argument. These | 
 |             parameters are defined: | 
 |  | 
 |              'strict' - raise a ValueError (or a subclass) | 
 |              'ignore' - ignore the character and continue with the next | 
 |              'replace'- replace with a suitable replacement character; | 
 |  | 
 |         """ | 
 |         self.stream = stream | 
 |         self.errors = errors | 
 |  | 
 |     def read(self, size=-1): | 
 |  | 
 |         """ Decodes data from the stream self.stream and returns the | 
 |             resulting object. | 
 |  | 
 |             size indicates the approximate maximum number of bytes to | 
 |             read from the stream for decoding purposes. The decoder | 
 |             can modify this setting as appropriate. The default value | 
 |             -1 indicates to read and decode as much as possible.  size | 
 |             is intended to prevent having to decode huge files in one | 
 |             step. | 
 |  | 
 |             The method should use a greedy read strategy meaning that | 
 |             it should read as much data as is allowed within the | 
 |             definition of the encoding and the given size, e.g.  if | 
 |             optional encoding endings or state markers are available | 
 |             on the stream, these should be read too. | 
 |  | 
 |         """ | 
 |         # Unsliced reading: | 
 |         if size < 0: | 
 |             return self.decode(self.stream.read(), self.errors)[0] | 
 |  | 
 |         # Sliced reading: | 
 |         read = self.stream.read | 
 |         decode = self.decode | 
 |         data = read(size) | 
 |         i = 0 | 
 |         while 1: | 
 |             try: | 
 |                 object, decodedbytes = decode(data, self.errors) | 
 |             except ValueError,why: | 
 |                 # This method is slow but should work under pretty much | 
 |                 # all conditions; at most 10 tries are made | 
 |                 i = i + 1 | 
 |                 newdata = read(1) | 
 |                 if not newdata or i > 10: | 
 |                     raise | 
 |                 data = data + newdata | 
 |             else: | 
 |                 return object | 
 |  | 
 |     def readline(self, size=None): | 
 |  | 
 |         """ Read one line from the input stream and return the | 
 |             decoded data. | 
 |  | 
 |             Note: Unlike the .readlines() method, this method inherits | 
 |             the line breaking knowledge from the underlying stream's | 
 |             .readline() method -- there is currently no support for | 
 |             line breaking using the codec decoder due to lack of line | 
 |             buffering. Sublcasses should however, if possible, try to | 
 |             implement this method using their own knowledge of line | 
 |             breaking. | 
 |  | 
 |             size, if given, is passed as size argument to the stream's | 
 |             .readline() method. | 
 |  | 
 |         """ | 
 |         if size is None: | 
 |             line = self.stream.readline() | 
 |         else: | 
 |             line = self.stream.readline(size) | 
 |         return self.decode(line,self.errors)[0] | 
 |  | 
 |  | 
 |     def readlines(self, sizehint=0): | 
 |  | 
 |         """ Read all lines available on the input stream | 
 |             and return them as list of lines. | 
 |  | 
 |             Line breaks are implemented using the codec's decoder | 
 |             method and are included in the list entries. | 
 |  | 
 |             sizehint, if given, is passed as size argument to the | 
 |             stream's .read() method. | 
 |  | 
 |         """ | 
 |         if sizehint is None: | 
 |             data = self.stream.read() | 
 |         else: | 
 |             data = self.stream.read(sizehint) | 
 |         return self.decode(data,self.errors)[0].splitlines(1) | 
 |  | 
 |     def reset(self): | 
 |  | 
 |         """ Resets the codec buffers used for keeping state. | 
 |  | 
 |             Note that no stream repositioning should take place. | 
 |             This method is primarily intended to be able to recover | 
 |             from decoding errors. | 
 |  | 
 |         """ | 
 |         pass | 
 |  | 
 |     def __getattr__(self,name, | 
 |  | 
 |                     getattr=getattr): | 
 |  | 
 |         """ Inherit all other methods from the underlying stream. | 
 |         """ | 
 |         return getattr(self.stream,name) | 
 |  | 
 | ### | 
 |  | 
 | class StreamReaderWriter: | 
 |  | 
 |     """ StreamReaderWriter instances allow wrapping streams which | 
 |         work in both read and write modes. | 
 |  | 
 |         The design is such that one can use the factory functions | 
 |         returned by the codec.lookup() function to construct the | 
 |         instance. | 
 |  | 
 |     """ | 
 |     # Optional attributes set by the file wrappers below | 
 |     encoding = 'unknown' | 
 |  | 
 |     def __init__(self,stream,Reader,Writer,errors='strict'): | 
 |  | 
 |         """ Creates a StreamReaderWriter instance. | 
 |  | 
 |             stream must be a Stream-like object. | 
 |  | 
 |             Reader, Writer must be factory functions or classes | 
 |             providing the StreamReader, StreamWriter interface resp. | 
 |  | 
 |             Error handling is done in the same way as defined for the | 
 |             StreamWriter/Readers. | 
 |  | 
 |         """ | 
 |         self.stream = stream | 
 |         self.reader = Reader(stream, errors) | 
 |         self.writer = Writer(stream, errors) | 
 |         self.errors = errors | 
 |  | 
 |     def read(self,size=-1): | 
 |  | 
 |         return self.reader.read(size) | 
 |  | 
 |     def readline(self, size=None): | 
 |  | 
 |         return self.reader.readline(size) | 
 |  | 
 |     def readlines(self, sizehint=None): | 
 |  | 
 |         return self.reader.readlines(sizehint) | 
 |  | 
 |     def write(self,data): | 
 |  | 
 |         return self.writer.write(data) | 
 |  | 
 |     def writelines(self,list): | 
 |  | 
 |         return self.writer.writelines(list) | 
 |  | 
 |     def reset(self): | 
 |  | 
 |         self.reader.reset() | 
 |         self.writer.reset() | 
 |  | 
 |     def __getattr__(self,name, | 
 |  | 
 |                     getattr=getattr): | 
 |  | 
 |         """ Inherit all other methods from the underlying stream. | 
 |         """ | 
 |         return getattr(self.stream,name) | 
 |  | 
 | ### | 
 |  | 
 | class StreamRecoder: | 
 |  | 
 |     """ StreamRecoder instances provide a frontend - backend | 
 |         view of encoding data. | 
 |  | 
 |         They use the complete set of APIs returned by the | 
 |         codecs.lookup() function to implement their task. | 
 |  | 
 |         Data written to the stream is first decoded into an | 
 |         intermediate format (which is dependent on the given codec | 
 |         combination) and then written to the stream using an instance | 
 |         of the provided Writer class. | 
 |  | 
 |         In the other direction, data is read from the stream using a | 
 |         Reader instance and then return encoded data to the caller. | 
 |  | 
 |     """ | 
 |     # Optional attributes set by the file wrappers below | 
 |     data_encoding = 'unknown' | 
 |     file_encoding = 'unknown' | 
 |  | 
 |     def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'): | 
 |  | 
 |         """ Creates a StreamRecoder instance which implements a two-way | 
 |             conversion: encode and decode work on the frontend (the | 
 |             input to .read() and output of .write()) while | 
 |             Reader and Writer work on the backend (reading and | 
 |             writing to the stream). | 
 |  | 
 |             You can use these objects to do transparent direct | 
 |             recodings from e.g. latin-1 to utf-8 and back. | 
 |  | 
 |             stream must be a file-like object. | 
 |  | 
 |             encode, decode must adhere to the Codec interface, Reader, | 
 |             Writer must be factory functions or classes providing the | 
 |             StreamReader, StreamWriter interface resp. | 
 |  | 
 |             encode and decode are needed for the frontend translation, | 
 |             Reader and Writer for the backend translation. Unicode is | 
 |             used as intermediate encoding. | 
 |  | 
 |             Error handling is done in the same way as defined for the | 
 |             StreamWriter/Readers. | 
 |  | 
 |         """ | 
 |         self.stream = stream | 
 |         self.encode = encode | 
 |         self.decode = decode | 
 |         self.reader = Reader(stream, errors) | 
 |         self.writer = Writer(stream, errors) | 
 |         self.errors = errors | 
 |  | 
 |     def read(self,size=-1): | 
 |  | 
 |         data = self.reader.read(size) | 
 |         data, bytesencoded = self.encode(data, self.errors) | 
 |         return data | 
 |  | 
 |     def readline(self,size=None): | 
 |  | 
 |         if size is None: | 
 |             data = self.reader.readline() | 
 |         else: | 
 |             data = self.reader.readline(size) | 
 |         data, bytesencoded = self.encode(data, self.errors) | 
 |         return data | 
 |  | 
 |     def readlines(self,sizehint=None): | 
 |  | 
 |         if sizehint is None: | 
 |             data = self.reader.read() | 
 |         else: | 
 |             data = self.reader.read(sizehint) | 
 |         data, bytesencoded = self.encode(data, self.errors) | 
 |         return data.splitlines(1) | 
 |  | 
 |     def write(self,data): | 
 |  | 
 |         data, bytesdecoded = self.decode(data, self.errors) | 
 |         return self.writer.write(data) | 
 |  | 
 |     def writelines(self,list): | 
 |  | 
 |         data = ''.join(list) | 
 |         data, bytesdecoded = self.decode(data, self.errors) | 
 |         return self.writer.write(data) | 
 |  | 
 |     def reset(self): | 
 |  | 
 |         self.reader.reset() | 
 |         self.writer.reset() | 
 |  | 
 |     def __getattr__(self,name, | 
 |  | 
 |                     getattr=getattr): | 
 |  | 
 |         """ Inherit all other methods from the underlying stream. | 
 |         """ | 
 |         return getattr(self.stream,name) | 
 |  | 
 | ### Shortcuts | 
 |  | 
 | def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): | 
 |  | 
 |     """ Open an encoded file using the given mode and return | 
 |         a wrapped version providing transparent encoding/decoding. | 
 |  | 
 |         Note: The wrapped version will only accept the object format | 
 |         defined by the codecs, i.e. Unicode objects for most builtin | 
 |         codecs. Output is also codec dependent and will usually by | 
 |         Unicode as well. | 
 |  | 
 |         Files are always opened in binary mode, even if no binary mode | 
 |         was specified. Thisis done to avoid data loss due to encodings | 
 |         using 8-bit values. The default file mode is 'rb' meaning to | 
 |         open the file in binary read mode. | 
 |  | 
 |         encoding specifies the encoding which is to be used for the | 
 |         the file. | 
 |  | 
 |         errors may be given to define the error handling. It defaults | 
 |         to 'strict' which causes ValueErrors to be raised in case an | 
 |         encoding error occurs. | 
 |  | 
 |         buffering has the same meaning as for the builtin open() API. | 
 |         It defaults to line buffered. | 
 |  | 
 |         The returned wrapped file object provides an extra attribute | 
 |         .encoding which allows querying the used encoding. This | 
 |         attribute is only available if an encoding was specified as | 
 |         parameter. | 
 |  | 
 |     """ | 
 |     if encoding is not None and \ | 
 |        'b' not in mode: | 
 |         # Force opening of the file in binary mode | 
 |         mode = mode + 'b' | 
 |     file = __builtin__.open(filename, mode, buffering) | 
 |     if encoding is None: | 
 |         return file | 
 |     (e,d,sr,sw) = lookup(encoding) | 
 |     srw = StreamReaderWriter(file, sr, sw, errors) | 
 |     # Add attributes to simplify introspection | 
 |     srw.encoding = encoding | 
 |     return srw | 
 |  | 
 | def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): | 
 |  | 
 |     """ Return a wrapped version of file which provides transparent | 
 |         encoding translation. | 
 |  | 
 |         Strings written to the wrapped file are interpreted according | 
 |         to the given data_encoding and then written to the original | 
 |         file as string using file_encoding. The intermediate encoding | 
 |         will usually be Unicode but depends on the specified codecs. | 
 |  | 
 |         Strings are read from the file using file_encoding and then | 
 |         passed back to the caller as string using data_encoding. | 
 |  | 
 |         If file_encoding is not given, it defaults to data_encoding. | 
 |  | 
 |         errors may be given to define the error handling. It defaults | 
 |         to 'strict' which causes ValueErrors to be raised in case an | 
 |         encoding error occurs. | 
 |  | 
 |         The returned wrapped file object provides two extra attributes | 
 |         .data_encoding and .file_encoding which reflect the given | 
 |         parameters of the same name. The attributes can be used for | 
 |         introspection by Python programs. | 
 |  | 
 |     """ | 
 |     if file_encoding is None: | 
 |         file_encoding = data_encoding | 
 |     encode, decode = lookup(data_encoding)[:2] | 
 |     Reader, Writer = lookup(file_encoding)[2:] | 
 |     sr = StreamRecoder(file, | 
 |                        encode,decode,Reader,Writer, | 
 |                        errors) | 
 |     # Add attributes to simplify introspection | 
 |     sr.data_encoding = data_encoding | 
 |     sr.file_encoding = file_encoding | 
 |     return sr | 
 |  | 
 | ### Helpers for charmap-based codecs | 
 |  | 
 | def make_identity_dict(rng): | 
 |  | 
 |     """ make_identity_dict(rng) -> dict | 
 |  | 
 |         Return a dictionary where elements of the rng sequence are | 
 |         mapped to themselves. | 
 |  | 
 |     """ | 
 |     res = {} | 
 |     for i in rng: | 
 |         res[i]=i | 
 |     return res | 
 |  | 
 | ### Tests | 
 |  | 
 | if __name__ == '__main__': | 
 |  | 
 |     import sys | 
 |  | 
 |     # Make stdout translate Latin-1 output into UTF-8 output | 
 |     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') | 
 |  | 
 |     # Have stdin translate Latin-1 input into UTF-8 input | 
 |     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') |