Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.

2

3

4

Written by Marc-Andre Lemburg (mal@lemburg.com).

Victor Stinner

2017-06-16 08:59:01 +0200

[diff] [blame]

8

"""

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

9

Victor Stinner

272d888

2017-06-16 08:59:01 +0200

[diff] [blame]

10

import builtins

11

import sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

12

13

### Registry and builtin stateless codec functions

14

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

15

# The registry functions and the builtin stateless codec functions come
# from the _codecs module; without it this module is unusable, so an
# import failure is reported as a SystemError.
try:
    from _codecs import *
except ImportError as why:
    failure = 'Failed to load the builtin codecs: %s' % why
    raise SystemError(failure)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

19

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

20

# Public names of this module, i.e. what "from codecs import *" exports:
# registry access and file helpers, the byte order mark constants, the
# codec base classes, the lookup/convenience helpers and the error
# handler functions.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

34

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

35

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# spelled out as the byte strings it takes in UTF-8/UTF-16/UTF-32
# output, for both little and big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian
BOM_UTF16_BE = b'\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16 and UTF-32 in the native endianness of the running machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names are really the
# UTF-16 marks and the "64" names are really the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

79

80

81

### Codec base classes (defining the API)

82

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

83

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The instance is a 4-tuple (encode, decode, streamreader, streamwriter)
    that additionally exposes those entries, the incremental encoder and
    decoder factories and the codec name as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only these four entries make up the tuple value itself.
        entries = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, entries)
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Leave the class-level default in place unless the caller
        # explicitly said whether this is a text encoding.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        fields = (klass.__module__, klass.__qualname__, self.name, id(self))
        return "<%s.%s object for encoding %s at %#x>" % fields

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

113

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

114

class Codec:

    """ Interface definition for stateless encoders/decoders.

        The errors argument of .encode()/.decode() selects the error
        handling scheme. The following string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default is
            'strict' handling.

            No state may be stored in the Codec instance by this
            method. Codecs that have to keep state in order to make
            encoding efficient should use StreamWriter.

            Zero length input must be handled by the encoder; in that
            situation an empty object of the output object type is
            returned.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default is
            'strict' handling.

            No state may be stored in the Codec instance by this
            method. Codecs that have to keep state in order to make
            decoding efficient should use StreamReader.

            Zero length input must be handled by the decoder; in that
            situation an empty object of the output object type is
            returned.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

179

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

180

class IncrementalEncoder(object):
    """
    Encoder that processes its input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps the
    state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors names the error handling scheme the encoder uses; it is
        stored on the instance as self.errors. An empty string buffer is
        set up as self.buffer.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder to state, which must have
        been returned by getstate().

        The base implementation does nothing.
        """

219

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

220

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always encode
    everything they are given: input that _buffer_encode() does not
    consume is kept in self.buffer and prepended to the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever the previous call left behind
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # everything after the consumed prefix waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # a falsy state (such as 0) stands for an empty buffer
        self.buffer = state if state else ""

253

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

254

class IncrementalDecoder(object):
    """
    Decoder that processes its input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps the
    state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors names the error handling scheme the decoder uses; it is
        stored on the instance as self.errors.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state must be a (buffered_input, additional_state_info)
        tuple. buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the state of the decoder WITHOUT yet having processed the
        contents of buffered_input. In the initial state and after
        reset(), getstate() must return (b"", 0).

        The base implementation always returns (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation does nothing.
        """

302

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

303

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: bytes that _buffer_decode() does not
    consume are kept in self.buffer and prepended to the next decode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever the previous call left behind
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # everything after the consumed prefix waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state
        # info is ignored
        self.buffer = state[0]

338

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

339

#

340

# The StreamWriter and StreamReader classes provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

341

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

342

# very easily. See encodings/utf_8.py for an example on how this is

343

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

344

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

345

346

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using self.errors;
            only the encoded output is written, the consumed length
            reported by the encoder is not used.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            The codec is reset as well when seeking to the very start of
            the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the instance has no stream
            attribute yet.
        """
        # __getattr__ only runs after normal lookup has failed, so being
        # asked for 'stream' means it was never set (for instance on an
        # object created with __new__ alone). Reading self.stream below
        # would then call __getattr__ again without end.
        if name == 'stream':
            raise AttributeError(
                "%r object has no attribute %r"
                % (self.__class__.__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()

416

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

417

###

418

419

class StreamReader(Codec):

420

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

421

charbuffertype = str

422

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

423

def __init__(self, stream, errors='strict'):

    """ Create a StreamReader instance.

        stream must be a file-like object open for reading.

        The errors keyword argument selects the error handling
        scheme used by the StreamReader. These parameters are
        predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace'- replace with a suitable replacement character
         'backslashreplace' - Replace with backslashed escape sequences;

        The set of allowed parameter values can be extended via
        register_error.
    """
    self.stream = stream
    self.errors = errors
    # bytes read from the stream that have not been decoded yet
    self.bytebuffer = b""
    # an empty object of the decoded type, reused whenever the
    # character buffer has to be emptied
    empty = self.charbuffertype()
    self._empty_charbuffer = empty
    self.charbuffer = empty
    # no lines are cached initially
    self.linebuffer = None

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

447

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

448

def decode(self, input, errors='strict'):
    # Placeholder only: this implementation always raises.
    raise NotImplementedError()

450

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

451

def read(self, size=-1, chars=-1, firstline=False):

    """ Decodes data from the stream self.stream and returns the
        resulting object.

        chars indicates the number of decoded code points or bytes to
        return. read() will never return more data than requested,
        but it might return less, if there is not enough available.
        If chars is negative, size is used in its place.

        size indicates the approximate maximum number of encoded
        bytes or code points to read from the stream for decoding;
        it is the value passed to self.stream.read(). The default
        value -1 indicates to read and decode as much as possible.
        size is intended to prevent having to decode huge files in
        one step.

        If firstline is true, and a UnicodeDecodeError happens
        after the first line terminator in the input only the first line
        will be returned, the rest of the input will be kept until the
        next call to read().

        The method should use a greedy read strategy, meaning that
        it should read as much data as is allowed within the
        definition of the encoding and the given size, e.g.  if
        optional encoding endings or state markers are available
        on the stream, these should be read too.
    """
    # If we have lines cached, first merge them back into characters
    if self.linebuffer:
        self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
        self.linebuffer = None

    if chars < 0:
        # For compatibility with other read() methods that take a
        # single argument
        chars = size

    # read until we get the required number of characters (if available)
    while True:
        # can the request be satisfied from the character buffer?
        if chars >= 0:
            if len(self.charbuffer) >= chars:
                break
        # we need more data
        if size < 0:
            newdata = self.stream.read()
        else:
            newdata = self.stream.read(size)
        # decode bytes (those remaining from the last call included)
        data = self.bytebuffer + newdata
        if not data:
            # nothing buffered and nothing new: stop reading
            break
        try:
            newchars, decodedbytes = self.decode(data, self.errors)
        except UnicodeDecodeError as exc:
            if firstline:
                # retry with only the bytes in front of the error
                # position; the error is swallowed only if that prefix
                # decodes to more than one line
                newchars, decodedbytes = \
                    self.decode(data[:exc.start], self.errors)
                lines = newchars.splitlines(keepends=True)
                if len(lines)<=1:
                    raise
            else:
                raise
        # keep undecoded bytes until the next call
        self.bytebuffer = data[decodedbytes:]
        # put new characters in the character buffer
        self.charbuffer += newchars
        # there was no data available
        if not newdata:
            break
    if chars < 0:
        # Return everything we've got
        result = self.charbuffer
        self.charbuffer = self._empty_charbuffer
    else:
        # Return the first chars characters
        result = self.charbuffer[:chars]
        self.charbuffer = self.charbuffer[chars:]
    return result

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

530

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

531

def readline(self, size=None, keepends=True):

    """ Read one line from the input stream and return the
        decoded data.

        size, if given, is passed as size argument to the
        read() method, and read() is then called only once. Without
        size, reading starts with a size of 72 that is doubled for
        each further attempt while it is below 8000.

        If keepends is false, the line ending is stripped from the
        returned line.

    """
    # If we have lines cached from an earlier read, return
    # them unconditionally
    if self.linebuffer:
        line = self.linebuffer[0]
        del self.linebuffer[0]
        if len(self.linebuffer) == 1:
            # revert to charbuffer mode; we might need more data
            # next time
            self.charbuffer = self.linebuffer[0]
            self.linebuffer = None
        if not keepends:
            line = line.splitlines(keepends=False)[0]
        return line

    readsize = size or 72
    line = self._empty_charbuffer
    # If size is given, we call read() only once
    while True:
        data = self.read(readsize, firstline=True)
        if data:
            # If we're at a "\r" read one extra character (which might
            # be a "\n") to get a proper line ending. If the stream is
            # temporarily exhausted we return the wrong line ending.
            if (isinstance(data, str) and data.endswith("\r")) or \
               (isinstance(data, bytes) and data.endswith(b"\r")):
                data += self.read(size=1, chars=1)

        line += data
        lines = line.splitlines(keepends=True)
        if lines:
            if len(lines) > 1:
                # More than one line result; the first line is a full line
                # to return
                line = lines[0]
                del lines[0]
                if len(lines) > 1:
                    # cache the remaining lines; whatever read() left in
                    # the character buffer belongs to the last of them
                    lines[-1] += self.charbuffer
                    self.linebuffer = lines
                    self.charbuffer = None
                else:
                    # only one remaining line, put it back into charbuffer
                    self.charbuffer = lines[0] + self.charbuffer
                if not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            line0withend = lines[0]
            line0withoutend = lines[0].splitlines(keepends=False)[0]
            if line0withend != line0withoutend: # We really have a line end
                # Put the rest back together and keep it until the next call
                self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                  self.charbuffer
                if keepends:
                    line = line0withend
                else:
                    line = line0withoutend
                break
        # we didn't get anything or this was our only try
        if not data or size is not None:
            if line and not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        # no complete line yet: ask for more data on the next attempt
        if readsize < 8000:
            readsize *= 2
    return line

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

605

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

606

def readlines(self, sizehint=None, keepends=True):

    """ Read everything available on the input stream and return
        it as a list of lines.

        The complete input is fetched with a single .read() call and
        the result is split with its .splitlines() method.

        sizehint is accepted for interface compatibility only; this
        implementation never looks at it.

        keepends is handed to .splitlines() unchanged; when true (the
        default) the line ends stay attached to the list entries.

    """
    return self.read().splitlines(keepends)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

    """ Clear the codec buffers that carry state between reads.

        The pending line list is dropped and both the character and
        the byte buffer are set back to their empty values.

        Note that no stream repositioning should take place.
        This method is primarily intended to be able to recover
        from decoding errors.

    """
    # The three assignments are independent of each other.
    self.linebuffer = None
    self.charbuffer = self._empty_charbuffer
    self.bytebuffer = b""

Walter Dörwald

729c31f

2005-03-14 19:06:30 +0000

[diff] [blame]

633

Walter Dörwald

71fd90d

2005-03-14 19:25:41 +0000

[diff] [blame]

634

def seek(self, offset, whence=0):
    """ Move the underlying stream to a new position.

        offset and whence are forwarded to self.stream.seek().
        Afterwards the codec buffers used for keeping state are
        cleared through self.reset().
    """
    underlying = self.stream
    underlying.seek(offset, whence)
    self.reset()

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

641

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

642

def __next__(self):

    """ Return the next line obtained from self.readline().

        Raises StopIteration as soon as readline() hands back a
        false (empty) value.
    """
    current = self.readline()
    if not current:
        raise StopIteration
    return current

def __iter__(self):
    """ Return the object itself as its own iterator. """
    return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

653

def __getattr__(self, name,

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

654

getattr=getattr):

655

656

""" Inherit all other methods from the underlying stream.

657

"""

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

658

return getattr(self.stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

659

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

def __enter__(self):
    """ Enter a "with" block; the object itself is the managed value. """
    return self

def __exit__(self, type, value, tb):
    """ Leave a "with" block by closing the underlying stream.

        The exception details passed in are not inspected and nothing
        is returned, so a pending exception keeps propagating.
    """
    underlying = self.stream
    underlying.close()

665

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

666

###

667

668

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        Reading is delegated to a reader object and writing to a
        writer object, both created over the same stream.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # The reader is built before the writer.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of self.reader.read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return the result of self.reader.readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of self.reader.readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        """ Return the object itself as its own iterator. """
        return self

    def write(self, data):

        """ Return the result of self.writer.write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Return the result of self.writer.writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first and then the writer. """
        for codec_side in (self.reader, self.writer):
            codec_side.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream and reset the reader.

            The writer is reset only for an absolute seek to offset 0
            (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        back_at_start = whence == 0 and offset == 0
        if back_at_start:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        underlying = self.stream
        return getattr(underlying, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        """ Enter a "with" block; the object itself is the managed value. """
        return self

    def __exit__(self, type, value, tb):
        """ Leave a "with" block by closing the underlying stream. """
        underlying = self.stream
        underlying.close()

752

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.
            Each factory is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        # The reader is built before the writer.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the reader and return the result re-encoded
            with self.encode.
        """
        decoded = self.reader.read(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line through the reader and return it re-encoded
            with self.encode.

            size is forwarded to the reader only when it is not None.
        """
        reader = self.reader
        decoded = reader.readline() if size is None else reader.readline(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read everything through the reader, re-encode it with
            self.encode and return it split into lines (line ends kept).

            sizehint is not used.
        """
        decoded = self.reader.read()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the reader, re-encoded with
            self.encode.
        """
        decoded = next(self.reader)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        """ Return the object itself as its own iterator. """
        return self

    def write(self, data):

        """ Decode data with self.decode and pass the result to
            self.writer.write(), returning whatever that returns.
        """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the items of list into one bytes object, decode it
            with self.decode and write the result in a single
            self.writer.write() call.
        """
        joined = b''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the reader first and then the writer. """
        for codec_side in (self.reader, self.writer):
            codec_side.reset()

    def seek(self, offset, whence=0):
        """ Forward the seek to the reader and then to the writer. """
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        for codec_side in (self.reader, self.writer):
            codec_side.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        underlying = self.stream
        return getattr(underlying, name)

    def __enter__(self):
        """ Enter a "with" block; the object itself is the managed value. """
        return self

    def __exit__(self, type, value, tb):
        """ Leave a "with" block by closing the underlying stream. """
        underlying = self.stream
        underlying.close()

868

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

869

### Shortcuts

870

Alexey Izbyshev

a267056

2018-10-20 03:22:31 +0300

[diff] [blame]

871

def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file. When it is None, the file object returned by the builtin
        open() is handed back unwrapped and mode is left untouched.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file that was just opened is closed again before the
        exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Nobody else holds a reference to the file at this point, so
        # close it here instead of leaking the open handle.
        file.close()
        raise

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

913

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

914

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The data codec is looked up before the file codec.
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

949

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

950

### Helpers for codec lookup

951

952

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (the .encode attribute of the lookup
        result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

961

962

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (the .decode attribute of the lookup
        result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode

971

972

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder (its
        .incrementalencoder attribute is None).

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

985

986

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder (its
        .incrementaldecoder attribute is None).

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

999

1000

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (the
        .streamreader attribute of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1009

1010

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (the
        .streamwriter attribute of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter

1019

1020

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder
    and yields every non-empty piece of output. After the iterator is
    exhausted the encoder is called once more with an empty string and
    final=True, and a non-empty result of that call is yielded too.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call so the encoder can emit whatever it still holds back.
    tail = encoder.encode("", True)
    if tail:
        yield tail

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder
    and yields every non-empty piece of output. After the iterator is
    exhausted the decoder is called once more with empty bytes and
    final=True, and a non-empty result of that call is yielded too.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call so the decoder can deal with input it still holds back.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1055

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1056

### Helpers for charmap-based codecs

1057

1058

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1067

Marc-André Lemburg

2001-05-16 09:41:45 +0000

[diff] [blame]

1068

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps every value of decoding_map back to its key.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and gets mapped to None.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1088

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

1089

### error handlers

1090

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1091

# Bind the standard error handling callbacks to module-level names.
# If any of the lookups fails, every one of the names is set to None.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

1106

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1107

# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed because _false is 0.
_false = 0
if _false:
    import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1113

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1114

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1115

if __name__ == '__main__':

1116

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1117

# Make stdout translate Latin-1 output into UTF-8 output

1118

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1119

Guido van Rossum