| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 1 | """ codecs -- Python Codec Registry, API and helpers. | 
 | 2 |  | 
 | 3 |  | 
 | 4 | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
 | 5 |  | 
 | 6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
 | 7 |  | 
| Victor Stinner | 272d888 | 2017-06-16 08:59:01 +0200 | [diff] [blame] | 8 | """ | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 9 |  | 
| Victor Stinner | 272d888 | 2017-06-16 08:59:01 +0200 | [diff] [blame] | 10 | import builtins | 
 | 11 | import sys | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 12 |  | 
 | 13 | ### Registry and builtin stateless codec functions | 
 | 14 |  | 
| Guido van Rossum | b95de4f | 2000-03-31 17:25:23 +0000 | [diff] [blame] | 15 | try: | 
 | 16 |     from _codecs import * | 
| Guido van Rossum | b940e11 | 2007-01-10 16:19:56 +0000 | [diff] [blame] | 17 | except ImportError as why: | 
| Thomas Wouters | 49fd7fa | 2006-04-21 10:40:58 +0000 | [diff] [blame] | 18 |     raise SystemError('Failed to load the builtin codecs: %s' % why) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 19 |  | 
# Public names exported by "from codecs import *".
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    # byte order mark constants
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec API classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # lookup helpers
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    "encode", "decode", "iterencode", "iterdecode",
    # error handling
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "backslashreplace_errors", "namereplace_errors",
    "register_error", "lookup_error",
]
| Skip Montanaro | e99d5ea | 2001-01-20 19:54:20 +0000 | [diff] [blame] | 34 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 35 | ### Constants | 
 | 36 |  | 
 | 37 | # | 
| Walter Dörwald | 474458d | 2002-06-04 15:16:29 +0000 | [diff] [blame] | 38 | # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) | 
 | 39 | # and its possible byte string values | 
 | 40 | # for UTF8/UTF16/UTF32 output and little/big endian machines | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 41 | # | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 42 |  | 
# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little and big endian (BOM_LE / BOM_BE are aliases)
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native endianness variants, selected from sys.byteorder at import time
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names these are the UTF-16 and UTF-32 byte order marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 79 |  | 
 | 80 |  | 
 | 81 | ### Codec base classes (defining the API) | 
 | 82 |  | 
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The object is a 4-tuple (encode, decode, streamreader, streamwriter);
    every constructor argument is additionally available as an attribute
    of the same name.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # class-level default: assume a text encoding

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only the first four entries are stored in the tuple itself; the
        # incremental codecs and the name exist solely as attributes.
        members = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, members)
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Leave the class-level default in place unless the caller
        # passed an explicit value.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at %#x>" % (
            klass.__module__, klass.__qualname__, self.name, id(self))
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 113 |  | 
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string values
        are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec instance.
            Codecs which have to keep state in order to make encoding
            efficient should use StreamWriter instead.

            Zero length input must be handled; in that case an empty
            object of the output object type is returned.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec instance.
            Codecs which have to keep state in order to make decoding
            efficient should use StreamReader instead.

            Zero length input must be handled; in that case an empty
            object of the output object type is returned.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
 | 179 |  | 
class IncrementalEncoder:
    """
    Base class for encoders that process their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored as
        self.errors; the Codec class docstring lists the predefined
        values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously returned by getstate().

        The base implementation ignores state.
        """
        return None
 | 219 |  | 
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always encode
    everything they are given and therefore have to carry the leftover
    input over to the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever the previous call could not encode.
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # Any falsy state (such as the 0 from getstate()) means "empty".
        self.buffer = state if state else ""
 | 253 |  | 
class IncrementalDecoder:
    """
    Base class for decoders that process their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored as
        self.errors; the Codec class docstring lists the predefined
        values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The result must be a tuple (buffered_input,
        additional_state_info):

        * buffered_input is a bytes object holding the bytes that were
          passed to decode() but have not been converted yet;
        * additional_state_info is a non-negative integer describing
          the decoder state WITHOUT the contents of buffered_input
          having been processed.

        In the initial state and after reset() the result must be
        (b"", 0), which is what this base implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously returned by getstate().

        setstate((b"", 0)) must have the same effect as reset().
        The base implementation ignores state.
        """
        return None
 | 302 |  | 
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: undecoded input is kept and retried on
    the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever the previous call could not decode.
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The buffer is the only state; the additional info is always 0.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered input matters; state[1] is ignored.
        self.buffer = state[0]
 | 338 |  | 
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 345 |  | 
 | 346 | class StreamWriter(Codec): | 
 | 347 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 348 |     def __init__(self, stream, errors='strict'): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 349 |  | 
 | 350 |         """ Creates a StreamWriter instance. | 
 | 351 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 352 |             stream must be a file-like object open for writing. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 353 |  | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 354 |             The StreamWriter may use different error handling | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 355 |             schemes by providing the errors keyword argument. These | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 356 |             parameters are predefined: | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 357 |  | 
 | 358 |              'strict' - raise a ValueError (or a subclass) | 
 | 359 |              'ignore' - ignore the character and continue with the next | 
 | 360 |              'replace'- replace with a suitable replacement character | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 361 |              'xmlcharrefreplace' - Replace with the appropriate XML | 
 | 362 |                                    character reference. | 
 | 363 |              'backslashreplace'  - Replace with backslashed escape | 
| Serhiy Storchaka | 07985ef | 2015-01-25 22:56:57 +0200 | [diff] [blame] | 364 |                                    sequences. | 
 | 365 |              'namereplace'       - Replace with \\N{...} escape sequences. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 366 |  | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 367 |             The set of allowed parameter values can be extended via | 
 | 368 |             register_error. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 369 |         """ | 
 | 370 |         self.stream = stream | 
 | 371 |         self.errors = errors | 
 | 372 |  | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 373 |     def write(self, object): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 374 |  | 
 | 375 |         """ Writes the object's contents encoded to self.stream. | 
 | 376 |         """ | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 377 |         data, consumed = self.encode(object, self.errors) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 378 |         self.stream.write(data) | 
 | 379 |  | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 380 |     def writelines(self, list): | 
 | 381 |  | 
 | 382 |         """ Writes the concatenated list of strings to the stream | 
 | 383 |             using .write(). | 
 | 384 |         """ | 
 | 385 |         self.write(''.join(list)) | 
| Guido van Rossum | 1c89b0e | 2000-04-11 15:41:38 +0000 | [diff] [blame] | 386 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 387 |     def reset(self): | 
 | 388 |  | 
 | 389 |         """ Flushes and resets the codec buffers used for keeping state. | 
 | 390 |  | 
 | 391 |             Calling this method should ensure that the data on the | 
 | 392 |             output is put into a clean state, that allows appending | 
 | 393 |             of new fresh data without having to rescan the whole | 
 | 394 |             stream to recover state. | 
 | 395 |  | 
 | 396 |         """ | 
 | 397 |         pass | 
 | 398 |  | 
| Victor Stinner | a92ad7e | 2010-05-22 16:59:09 +0000 | [diff] [blame] | 399 |     def seek(self, offset, whence=0): | 
 | 400 |         self.stream.seek(offset, whence) | 
 | 401 |         if whence == 0 and offset == 0: | 
 | 402 |             self.reset() | 
 | 403 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 404 |     def __getattr__(self, name, | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 405 |                     getattr=getattr): | 
 | 406 |  | 
 | 407 |         """ Inherit all other methods from the underlying stream. | 
 | 408 |         """ | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 409 |         return getattr(self.stream, name) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 410 |  | 
| Thomas Wouters | 89f507f | 2006-12-13 04:49:30 +0000 | [diff] [blame] | 411 |     def __enter__(self): | 
 | 412 |         return self | 
 | 413 |  | 
 | 414 |     def __exit__(self, type, value, tb): | 
 | 415 |         self.stream.close() | 
 | 416 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 417 | ### | 
 | 418 |  | 
 | 419 | class StreamReader(Codec): | 
 | 420 |  | 
| Georg Brandl | 0252462 | 2010-12-02 18:06:51 +0000 | [diff] [blame] | 421 |     charbuffertype = str | 
 | 422 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 423 |     def __init__(self, stream, errors='strict'): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 424 |  | 
 | 425 |         """ Creates a StreamReader instance. | 
 | 426 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 427 |             stream must be a file-like object open for reading. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 428 |  | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 429 |             The StreamReader may use different error handling | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 430 |             schemes by providing the errors keyword argument. These | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 431 |             parameters are predefined: | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 432 |  | 
 | 433 |              'strict' - raise a ValueError (or a subclass) | 
 | 434 |              'ignore' - ignore the character and continue with the next | 
| Serhiy Storchaka | 07985ef | 2015-01-25 22:56:57 +0200 | [diff] [blame] | 435 |              'replace'- replace with a suitable replacement character | 
 | 436 |              'backslashreplace' - Replace with backslashed escape sequences; | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 437 |  | 
| Walter Dörwald | 7f82f79 | 2002-11-19 21:42:53 +0000 | [diff] [blame] | 438 |             The set of allowed parameter values can be extended via | 
 | 439 |             register_error. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 440 |         """ | 
 | 441 |         self.stream = stream | 
 | 442 |         self.errors = errors | 
| Walter Dörwald | ca8a8d0 | 2007-05-04 13:05:09 +0000 | [diff] [blame] | 443 |         self.bytebuffer = b"" | 
| Georg Brandl | 0252462 | 2010-12-02 18:06:51 +0000 | [diff] [blame] | 444 |         self._empty_charbuffer = self.charbuffertype() | 
 | 445 |         self.charbuffer = self._empty_charbuffer | 
| Martin v. Löwis | 4ed6738 | 2005-09-18 08:34:39 +0000 | [diff] [blame] | 446 |         self.linebuffer = None | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 447 |  | 
| Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 448 |     def decode(self, input, errors='strict'): | 
 | 449 |         raise NotImplementedError | 
 | 450 |  | 
    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.
            If chars is negative, the value of size is used in its place.

            size indicates the approximate maximum number of encoded
            bytes or code points to read from the stream for decoding
            (it is passed to self.stream.read()). The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line(s) decoded before the error are kept; the rest of the
            input will be kept until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached (by readline()), first merge them back
        # into the character buffer so no decoded data is lost
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                # nothing buffered and nothing new: give up
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes in front of the error; the
                    # failure is tolerated only if that prefix decodes to
                    # more than one line, otherwise it is re-raised.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # the stream returned no new data, so stop looping
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters, keep the rest buffered
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 530 |  | 
    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method, and read() is then called only once.
            Without size, reading starts with 72 and the amount is
            doubled on every further attempt while it is below 8000.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # A size of None (or 0) falls back to the initial read size of 72
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines; whatever read() left in
                        # charbuffer belongs to the end of the last one
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                # Exactly one (possibly unterminated) line so far: compare it
                # with and without its line ending to see whether it is complete
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # grow the read size for the next attempt
            if readsize < 8000:
                readsize *= 2
        return line
| Guido van Rossum | 1c89b0e | 2000-04-11 15:41:38 +0000 | [diff] [blame] | 605 |  | 
| Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 606 |     def readlines(self, sizehint=None, keepends=True): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 607 |  | 
 | 608 |         """ Read all lines available on the input stream | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 609 |             and return them as a list. | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 610 |  | 
 | 611 |             Line breaks are implemented using the codec's decoder | 
 | 612 |             method and are included in the list entries. | 
| Guido van Rossum | 1c89b0e | 2000-04-11 15:41:38 +0000 | [diff] [blame] | 613 |  | 
| Marc-André Lemburg | d594849 | 2004-02-26 15:22:17 +0000 | [diff] [blame] | 614 |             sizehint, if given, is ignored since there is no efficient | 
 | 615 |             way to finding the true end-of-line. | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 616 |  | 
 | 617 |         """ | 
| Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 618 |         data = self.read() | 
| Hye-Shik Chang | af5c7cf | 2004-10-17 23:51:21 +0000 | [diff] [blame] | 619 |         return data.splitlines(keepends) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 620 |  | 
 | 621 |     def reset(self): | 
 | 622 |  | 
 | 623 |         """ Resets the codec buffers used for keeping state. | 
 | 624 |  | 
 | 625 |             Note that no stream repositioning should take place. | 
| Thomas Wouters | 7e47402 | 2000-07-16 12:04:32 +0000 | [diff] [blame] | 626 |             This method is primarily intended to be able to recover | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 627 |             from decoding errors. | 
 | 628 |  | 
 | 629 |         """ | 
| Walter Dörwald | ca8a8d0 | 2007-05-04 13:05:09 +0000 | [diff] [blame] | 630 |         self.bytebuffer = b"" | 
| Georg Brandl | 0252462 | 2010-12-02 18:06:51 +0000 | [diff] [blame] | 631 |         self.charbuffer = self._empty_charbuffer | 
| Martin v. Löwis | 4ed6738 | 2005-09-18 08:34:39 +0000 | [diff] [blame] | 632 |         self.linebuffer = None | 
| Walter Dörwald | 729c31f | 2005-03-14 19:06:30 +0000 | [diff] [blame] | 633 |  | 
| Walter Dörwald | 71fd90d | 2005-03-14 19:25:41 +0000 | [diff] [blame] | 634 |     def seek(self, offset, whence=0): | 
| Walter Dörwald | 729c31f | 2005-03-14 19:06:30 +0000 | [diff] [blame] | 635 |         """ Set the input stream's current position. | 
 | 636 |  | 
 | 637 |             Resets the codec buffers used for keeping state. | 
 | 638 |         """ | 
| Walter Dörwald | 729c31f | 2005-03-14 19:06:30 +0000 | [diff] [blame] | 639 |         self.stream.seek(offset, whence) | 
| Victor Stinner | a92ad7e | 2010-05-22 16:59:09 +0000 | [diff] [blame] | 640 |         self.reset() | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 641 |  | 
| Georg Brandl | a18af4e | 2007-04-21 15:47:16 +0000 | [diff] [blame] | 642 |     def __next__(self): | 
| Walter Dörwald | 4dbf192 | 2002-11-06 16:53:44 +0000 | [diff] [blame] | 643 |  | 
 | 644 |         """ Return the next decoded line from the input stream.""" | 
 | 645 |         line = self.readline() | 
 | 646 |         if line: | 
 | 647 |             return line | 
 | 648 |         raise StopIteration | 
 | 649 |  | 
 | 650 |     def __iter__(self): | 
 | 651 |         return self | 
 | 652 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 653 |     def __getattr__(self, name, | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 654 |                     getattr=getattr): | 
 | 655 |  | 
 | 656 |         """ Inherit all other methods from the underlying stream. | 
 | 657 |         """ | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 658 |         return getattr(self.stream, name) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 659 |  | 
| Thomas Wouters | 89f507f | 2006-12-13 04:49:30 +0000 | [diff] [blame] | 660 |     def __enter__(self): | 
 | 661 |         return self | 
 | 662 |  | 
 | 663 |     def __exit__(self, type, value, tb): | 
 | 664 |         self.stream.close() | 
 | 665 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 666 | ### | 
 | 667 |  | 
 | 668 | class StreamReaderWriter: | 
 | 669 |  | 
| Fred Drake | 49fd107 | 2000-04-13 14:11:21 +0000 | [diff] [blame] | 670 |     """ StreamReaderWriter instances allow wrapping streams which | 
 | 671 |         work in both read and write modes. | 
 | 672 |  | 
 | 673 |         The design is such that one can use the factory functions | 
| Thomas Wouters | 7e47402 | 2000-07-16 12:04:32 +0000 | [diff] [blame] | 674 |         returned by the codec.lookup() function to construct the | 
| Fred Drake | 49fd107 | 2000-04-13 14:11:21 +0000 | [diff] [blame] | 675 |         instance. | 
 | 676 |  | 
 | 677 |     """ | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 678 |     # Optional attributes set by the file wrappers below | 
 | 679 |     encoding = 'unknown' | 
 | 680 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 681 |     def __init__(self, stream, Reader, Writer, errors='strict'): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 682 |  | 
 | 683 |         """ Creates a StreamReaderWriter instance. | 
 | 684 |  | 
 | 685 |             stream must be a Stream-like object. | 
 | 686 |  | 
 | 687 |             Reader, Writer must be factory functions or classes | 
 | 688 |             providing the StreamReader, StreamWriter interface resp. | 
 | 689 |  | 
 | 690 |             Error handling is done in the same way as defined for the | 
 | 691 |             StreamWriter/Readers. | 
 | 692 |  | 
 | 693 |         """ | 
 | 694 |         self.stream = stream | 
 | 695 |         self.reader = Reader(stream, errors) | 
 | 696 |         self.writer = Writer(stream, errors) | 
 | 697 |         self.errors = errors | 
 | 698 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 699 |     def read(self, size=-1): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 700 |  | 
 | 701 |         return self.reader.read(size) | 
 | 702 |  | 
| Guido van Rossum | d58c26f | 2000-05-01 16:17:32 +0000 | [diff] [blame] | 703 |     def readline(self, size=None): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 704 |  | 
 | 705 |         return self.reader.readline(size) | 
 | 706 |  | 
| Guido van Rossum | d58c26f | 2000-05-01 16:17:32 +0000 | [diff] [blame] | 707 |     def readlines(self, sizehint=None): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 708 |  | 
 | 709 |         return self.reader.readlines(sizehint) | 
 | 710 |  | 
| Georg Brandl | a18af4e | 2007-04-21 15:47:16 +0000 | [diff] [blame] | 711 |     def __next__(self): | 
| Walter Dörwald | 4dbf192 | 2002-11-06 16:53:44 +0000 | [diff] [blame] | 712 |  | 
 | 713 |         """ Return the next decoded line from the input stream.""" | 
| Georg Brandl | a18af4e | 2007-04-21 15:47:16 +0000 | [diff] [blame] | 714 |         return next(self.reader) | 
| Walter Dörwald | 4dbf192 | 2002-11-06 16:53:44 +0000 | [diff] [blame] | 715 |  | 
 | 716 |     def __iter__(self): | 
 | 717 |         return self | 
 | 718 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 719 |     def write(self, data): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 720 |  | 
 | 721 |         return self.writer.write(data) | 
 | 722 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 723 |     def writelines(self, list): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 724 |  | 
 | 725 |         return self.writer.writelines(list) | 
 | 726 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 727 |     def reset(self): | 
 | 728 |  | 
 | 729 |         self.reader.reset() | 
 | 730 |         self.writer.reset() | 
 | 731 |  | 
| Victor Stinner | 3fed087 | 2010-05-22 02:16:27 +0000 | [diff] [blame] | 732 |     def seek(self, offset, whence=0): | 
| Victor Stinner | a92ad7e | 2010-05-22 16:59:09 +0000 | [diff] [blame] | 733 |         self.stream.seek(offset, whence) | 
 | 734 |         self.reader.reset() | 
 | 735 |         if whence == 0 and offset == 0: | 
 | 736 |             self.writer.reset() | 
| Victor Stinner | 3fed087 | 2010-05-22 02:16:27 +0000 | [diff] [blame] | 737 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 738 |     def __getattr__(self, name, | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 739 |                     getattr=getattr): | 
 | 740 |  | 
 | 741 |         """ Inherit all other methods from the underlying stream. | 
 | 742 |         """ | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 743 |         return getattr(self.stream, name) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 744 |  | 
| Victor Stinner | 272d888 | 2017-06-16 08:59:01 +0200 | [diff] [blame] | 745 |     # these are needed to make "with StreamReaderWriter(...)" work properly | 
| Thomas Wouters | 89f507f | 2006-12-13 04:49:30 +0000 | [diff] [blame] | 746 |  | 
 | 747 |     def __enter__(self): | 
 | 748 |         return self | 
 | 749 |  | 
 | 750 |     def __exit__(self, type, value, tb): | 
 | 751 |         self.stream.close() | 
 | 752 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 753 | ### | 
 | 754 |  | 
 | 755 | class StreamRecoder: | 
 | 756 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 757 |     """ StreamRecoder instances translate data from one encoding to another. | 
| Fred Drake | 49fd107 | 2000-04-13 14:11:21 +0000 | [diff] [blame] | 758 |  | 
 | 759 |         They use the complete set of APIs returned by the | 
 | 760 |         codecs.lookup() function to implement their task. | 
 | 761 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 762 |         Data written to the StreamRecoder is first decoded into an | 
 | 763 |         intermediate format (depending on the "decode" codec) and then | 
 | 764 |         written to the underlying stream using an instance of the provided | 
 | 765 |         Writer class. | 
| Fred Drake | 49fd107 | 2000-04-13 14:11:21 +0000 | [diff] [blame] | 766 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 767 |         In the other direction, data is read from the underlying stream using | 
 | 768 |         a Reader instance and then encoded and returned to the caller. | 
| Fred Drake | 49fd107 | 2000-04-13 14:11:21 +0000 | [diff] [blame] | 769 |  | 
 | 770 |     """ | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 771 |     # Optional attributes set by the file wrappers below | 
 | 772 |     data_encoding = 'unknown' | 
 | 773 |     file_encoding = 'unknown' | 
 | 774 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 775 |     def __init__(self, stream, encode, decode, Reader, Writer, | 
 | 776 |                  errors='strict'): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 777 |  | 
 | 778 |         """ Creates a StreamRecoder instance which implements a two-way | 
 | 779 |             conversion: encode and decode work on the frontend (the | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 780 |             data visible to .read() and .write()) while Reader and Writer | 
 | 781 |             work on the backend (the data in stream). | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 782 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 783 |             You can use these objects to do transparent | 
 | 784 |             transcodings from e.g. latin-1 to utf-8 and back. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 785 |  | 
 | 786 |             stream must be a file-like object. | 
 | 787 |  | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 788 |             encode and decode must adhere to the Codec interface; Reader and | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 789 |             Writer must be factory functions or classes providing the | 
| Nick Coghlan | b9fdb7a | 2015-01-07 00:22:00 +1000 | [diff] [blame] | 790 |             StreamReader and StreamWriter interfaces resp. | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 791 |  | 
 | 792 |             Error handling is done in the same way as defined for the | 
 | 793 |             StreamWriter/Readers. | 
 | 794 |  | 
 | 795 |         """ | 
 | 796 |         self.stream = stream | 
 | 797 |         self.encode = encode | 
 | 798 |         self.decode = decode | 
 | 799 |         self.reader = Reader(stream, errors) | 
 | 800 |         self.writer = Writer(stream, errors) | 
 | 801 |         self.errors = errors | 
 | 802 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 803 |     def read(self, size=-1): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 804 |  | 
 | 805 |         data = self.reader.read(size) | 
 | 806 |         data, bytesencoded = self.encode(data, self.errors) | 
 | 807 |         return data | 
 | 808 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 809 |     def readline(self, size=None): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 810 |  | 
 | 811 |         if size is None: | 
 | 812 |             data = self.reader.readline() | 
 | 813 |         else: | 
 | 814 |             data = self.reader.readline(size) | 
 | 815 |         data, bytesencoded = self.encode(data, self.errors) | 
 | 816 |         return data | 
 | 817 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 818 |     def readlines(self, sizehint=None): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 819 |  | 
| Marc-André Lemburg | d594849 | 2004-02-26 15:22:17 +0000 | [diff] [blame] | 820 |         data = self.reader.read() | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 821 |         data, bytesencoded = self.encode(data, self.errors) | 
| Ezio Melotti | d8b509b | 2011-09-28 17:37:55 +0300 | [diff] [blame] | 822 |         return data.splitlines(keepends=True) | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 823 |  | 
| Georg Brandl | a18af4e | 2007-04-21 15:47:16 +0000 | [diff] [blame] | 824 |     def __next__(self): | 
| Walter Dörwald | 4dbf192 | 2002-11-06 16:53:44 +0000 | [diff] [blame] | 825 |  | 
 | 826 |         """ Return the next decoded line from the input stream.""" | 
| Georg Brandl | a18af4e | 2007-04-21 15:47:16 +0000 | [diff] [blame] | 827 |         data = next(self.reader) | 
| Walter Dörwald | c5238b8 | 2005-09-01 11:56:53 +0000 | [diff] [blame] | 828 |         data, bytesencoded = self.encode(data, self.errors) | 
 | 829 |         return data | 
| Walter Dörwald | 4dbf192 | 2002-11-06 16:53:44 +0000 | [diff] [blame] | 830 |  | 
 | 831 |     def __iter__(self): | 
 | 832 |         return self | 
 | 833 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 834 |     def write(self, data): | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 835 |  | 
 | 836 |         data, bytesdecoded = self.decode(data, self.errors) | 
 | 837 |         return self.writer.write(data) | 
 | 838 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 839 |     def writelines(self, list): | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 840 |  | 
| Jelle Zijlstra | b3be407 | 2019-05-22 08:18:26 -0700 | [diff] [blame] | 841 |         data = b''.join(list) | 
| Guido van Rossum | a327713 | 2000-04-11 15:37:43 +0000 | [diff] [blame] | 842 |         data, bytesdecoded = self.decode(data, self.errors) | 
 | 843 |         return self.writer.write(data) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 844 |  | 
 | 845 |     def reset(self): | 
 | 846 |  | 
 | 847 |         self.reader.reset() | 
 | 848 |         self.writer.reset() | 
 | 849 |  | 
| Ammar Askar | a6ec1ce | 2019-05-31 12:44:01 -0700 | [diff] [blame] | 850 |     def seek(self, offset, whence=0): | 
 | 851 |         # Seeks must be propagated to both the readers and writers | 
 | 852 |         # as they might need to reset their internal buffers. | 
 | 853 |         self.reader.seek(offset, whence) | 
 | 854 |         self.writer.seek(offset, whence) | 
 | 855 |  | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 856 |     def __getattr__(self, name, | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 857 |                     getattr=getattr): | 
 | 858 |  | 
 | 859 |         """ Inherit all other methods from the underlying stream. | 
 | 860 |         """ | 
| Tim Peters | 30324a7 | 2001-05-15 17:19:16 +0000 | [diff] [blame] | 861 |         return getattr(self.stream, name) | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 862 |  | 
| Thomas Wouters | 89f507f | 2006-12-13 04:49:30 +0000 | [diff] [blame] | 863 |     def __enter__(self): | 
 | 864 |         return self | 
 | 865 |  | 
 | 866 |     def __exit__(self, type, value, tb): | 
 | 867 |         self.stream.close() | 
 | 868 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 869 | ### Shortcuts | 
 | 870 |  | 
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the object returned by the builtin open()
        is handed back unwrapped and mode is used as given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file that was just opened is closed again before the
        exception is re-raised, so no file handle is leaked.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # The caller never receives the file object on this path, so
        # this is the only place where it can be closed.
        file.close()
        raise
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 913 |  | 
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that encoding translation happens transparently.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name (file_encoding after the default
        has been applied). The attributes can be used for
        introspection by Python programs.

    """
    # An omitted file_encoding means the file side uses data_encoding too.
    file_encoding = data_encoding if file_encoding is None else file_encoding
    data_codec = lookup(data_encoding)
    file_codec = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        data_codec.encode, data_codec.decode,
        file_codec.streamreader, file_codec.streamwriter,
        errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 949 |  | 
| Marc-André Lemburg | aa32c5a | 2001-09-19 11:24:48 +0000 | [diff] [blame] | 950 | ### Helpers for codec lookup | 
 | 951 |  | 
 | 952 | def getencoder(encoding): | 
 | 953 |  | 
 | 954 |     """ Lookup up the codec for the given encoding and return | 
 | 955 |         its encoder function. | 
 | 956 |  | 
 | 957 |         Raises a LookupError in case the encoding cannot be found. | 
 | 958 |  | 
 | 959 |     """ | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 960 |     return lookup(encoding).encode | 
| Marc-André Lemburg | aa32c5a | 2001-09-19 11:24:48 +0000 | [diff] [blame] | 961 |  | 
 | 962 | def getdecoder(encoding): | 
 | 963 |  | 
 | 964 |     """ Lookup up the codec for the given encoding and return | 
 | 965 |         its decoder function. | 
 | 966 |  | 
 | 967 |         Raises a LookupError in case the encoding cannot be found. | 
 | 968 |  | 
 | 969 |     """ | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 970 |     return lookup(encoding).decode | 
 | 971 |  | 
 | 972 | def getincrementalencoder(encoding): | 
 | 973 |  | 
 | 974 |     """ Lookup up the codec for the given encoding and return | 
 | 975 |         its IncrementalEncoder class or factory function. | 
 | 976 |  | 
 | 977 |         Raises a LookupError in case the encoding cannot be found | 
 | 978 |         or the codecs doesn't provide an incremental encoder. | 
 | 979 |  | 
 | 980 |     """ | 
 | 981 |     encoder = lookup(encoding).incrementalencoder | 
 | 982 |     if encoder is None: | 
 | 983 |         raise LookupError(encoding) | 
 | 984 |     return encoder | 
 | 985 |  | 
 | 986 | def getincrementaldecoder(encoding): | 
 | 987 |  | 
 | 988 |     """ Lookup up the codec for the given encoding and return | 
 | 989 |         its IncrementalDecoder class or factory function. | 
 | 990 |  | 
 | 991 |         Raises a LookupError in case the encoding cannot be found | 
 | 992 |         or the codecs doesn't provide an incremental decoder. | 
 | 993 |  | 
 | 994 |     """ | 
 | 995 |     decoder = lookup(encoding).incrementaldecoder | 
 | 996 |     if decoder is None: | 
 | 997 |         raise LookupError(encoding) | 
 | 998 |     return decoder | 
| Marc-André Lemburg | aa32c5a | 2001-09-19 11:24:48 +0000 | [diff] [blame] | 999 |  | 
 | 1000 | def getreader(encoding): | 
 | 1001 |  | 
 | 1002 |     """ Lookup up the codec for the given encoding and return | 
 | 1003 |         its StreamReader class or factory function. | 
 | 1004 |  | 
 | 1005 |         Raises a LookupError in case the encoding cannot be found. | 
 | 1006 |  | 
 | 1007 |     """ | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 1008 |     return lookup(encoding).streamreader | 
| Marc-André Lemburg | aa32c5a | 2001-09-19 11:24:48 +0000 | [diff] [blame] | 1009 |  | 
 | 1010 | def getwriter(encoding): | 
 | 1011 |  | 
 | 1012 |     """ Lookup up the codec for the given encoding and return | 
 | 1013 |         its StreamWriter class or factory function. | 
 | 1014 |  | 
 | 1015 |         Raises a LookupError in case the encoding cannot be found. | 
 | 1016 |  | 
 | 1017 |     """ | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 1018 |     return lookup(encoding).streamwriter | 
 | 1019 |  | 
 | 1020 | def iterencode(iterator, encoding, errors='strict', **kwargs): | 
 | 1021 |     """ | 
 | 1022 |     Encoding iterator. | 
 | 1023 |  | 
| Martin Panter | 7462b649 | 2015-11-02 03:37:02 +0000 | [diff] [blame] | 1024 |     Encodes the input strings from the iterator using an IncrementalEncoder. | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 1025 |  | 
 | 1026 |     errors and kwargs are passed through to the IncrementalEncoder | 
 | 1027 |     constructor. | 
 | 1028 |     """ | 
 | 1029 |     encoder = getincrementalencoder(encoding)(errors, **kwargs) | 
 | 1030 |     for input in iterator: | 
 | 1031 |         output = encoder.encode(input) | 
 | 1032 |         if output: | 
 | 1033 |             yield output | 
 | 1034 |     output = encoder.encode("", True) | 
 | 1035 |     if output: | 
 | 1036 |         yield output | 
 | 1037 |  | 
 | 1038 | def iterdecode(iterator, encoding, errors='strict', **kwargs): | 
 | 1039 |     """ | 
 | 1040 |     Decoding iterator. | 
 | 1041 |  | 
| Martin Panter | 7462b649 | 2015-11-02 03:37:02 +0000 | [diff] [blame] | 1042 |     Decodes the input strings from the iterator using an IncrementalDecoder. | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 1043 |  | 
 | 1044 |     errors and kwargs are passed through to the IncrementalDecoder | 
 | 1045 |     constructor. | 
 | 1046 |     """ | 
 | 1047 |     decoder = getincrementaldecoder(encoding)(errors, **kwargs) | 
 | 1048 |     for input in iterator: | 
 | 1049 |         output = decoder.decode(input) | 
 | 1050 |         if output: | 
 | 1051 |             yield output | 
| Walter Dörwald | ca8a8d0 | 2007-05-04 13:05:09 +0000 | [diff] [blame] | 1052 |     output = decoder.decode(b"", True) | 
| Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 1053 |     if output: | 
 | 1054 |         yield output | 
| Marc-André Lemburg | aa32c5a | 2001-09-19 11:24:48 +0000 | [diff] [blame] | 1055 |  | 
| Marc-André Lemburg | a866df8 | 2001-01-03 21:29:14 +0000 | [diff] [blame] | 1056 | ### Helpers for charmap-based codecs | 
 | 1057 |  | 
 | 1058 | def make_identity_dict(rng): | 
 | 1059 |  | 
 | 1060 |     """ make_identity_dict(rng) -> dict | 
 | 1061 |  | 
 | 1062 |         Return a dictionary where elements of the rng sequence are | 
 | 1063 |         mapped to themselves. | 
| Tim Peters | 88869f9 | 2001-01-14 23:36:06 +0000 | [diff] [blame] | 1064 |  | 
| Marc-André Lemburg | a866df8 | 2001-01-03 21:29:14 +0000 | [diff] [blame] | 1065 |     """ | 
| Antoine Pitrou | aaefac7 | 2012-06-16 22:48:21 +0200 | [diff] [blame] | 1066 |     return {i:i for i in rng} | 
| Marc-André Lemburg | a866df8 | 2001-01-03 21:29:14 +0000 | [diff] [blame] | 1067 |  | 
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map by swapping
        keys and values.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous, so its entry becomes None
        # and stays None for any further occurrence.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
| Tim Peters | 3a2ab1a | 2001-05-29 06:06:54 +0000 | [diff] [blame] | 1088 |  | 
| Walter Dörwald | 3aeb632 | 2002-09-02 13:14:32 +0000 | [diff] [blame] | 1089 | ### error handlers | 
 | 1090 |  | 
try:
    # Resolve every standard error handler in one step; nothing is
    # bound unless all six lookups succeed.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
        "namereplace",
    ))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
| Walter Dörwald | 3aeb632 | 2002-09-02 13:14:32 +0000 | [diff] [blame] | 1106 |  | 
# Tell modulefinder that using codecs probably needs the encodings
# package.  The guard below is always false, so the import never runs;
# it is here only so that the import statement appears in the source.
_false = 0
if _false:
    import encodings
 | 1112 |  | 
| Guido van Rossum | 0612d84 | 2000-03-10 23:20:43 +0000 | [diff] [blame] | 1113 | ### Tests | 
| Guido van Rossum | 1c89b0e | 2000-04-11 15:41:38 +0000 | [diff] [blame] | 1114 |  | 
if __name__ == '__main__':

    # Demo when run as a script: replace the standard streams with
    # recoding wrappers built by EncodedFile().
    # NOTE(review): EncodedFile() reads and writes bytes on the wrapped
    # file, while sys.stdout / sys.stdin are presumably text streams
    # here -- confirm whether the .buffer objects should be wrapped
    # instead.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')