Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.

2

3

4

Written by Marc-Andre Lemburg (mal@lemburg.com).

"""#"

Georg Brandl

2007-12-02 09:40:06 +0000

[diff] [blame]

10

import builtins, sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

11

12

### Registry and builtin stateless codec functions

13

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

14

try:

15

from _codecs import *

Guido van Rossum

b940e11

2007-01-10 16:19:56 +0000

[diff] [blame]

16

except ImportError as why:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

17

raise SystemError('Failed to load the builtin codecs: %s' % why)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

18

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

19

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

20

"BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",

21

"BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

22

"BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",

Serhiy Storchaka

de3ee5b

2014-12-20 17:42:38 +0200

[diff] [blame]

23

"CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",

24

"StreamReader", "StreamWriter",

25

"StreamReaderWriter", "StreamRecoder",

26

"getencoder", "getdecoder", "getincrementalencoder",

27

"getincrementaldecoder", "getreader", "getwriter",

28

"encode", "decode", "iterencode", "iterdecode",

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

29

"strict_errors", "ignore_errors", "replace_errors",

Serhiy Storchaka

de3ee5b

2014-12-20 17:42:38 +0200

[diff] [blame]

30

"xmlcharrefreplace_errors", "backslashreplace_errors",

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

31

"register_error", "lookup_error"]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

32

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

33

### Constants

34

35

#

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

36

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)

37

# and its possible byte string values

38

# for UTF8/UTF16/UTF32 output and little/big endian machines

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

39

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

40

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

41

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# The "native" aliases are bound once, at import time, from sys.byteorder.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
# NOTE: despite the "32"/"64" in their names these alias the UTF-16 and
# UTF-32 marks respectively; they are kept only so old code keeps working.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

77

78

79

### Codec base classes (defining the API)

80

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

81

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The instance is a 4-tuple (encode, decode, streamreader, streamwriter);
    the same objects, plus incrementalencoder, incrementaldecoder and name,
    are also available as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only these four items form the tuple value; everything else is
        # stored as instance attributes.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        # Leave the class-level default in place unless explicitly overridden.
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

111

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

112

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

176

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

177

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """

216

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

217

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered input plus input via _buffer_encode() and
        return the encoded result; whatever _buffer_encode() reports as
        not consumed is kept in self.buffer for the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the encoder and discard any buffered input."""
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """Return the buffered input, or 0 when the buffer is empty."""
        return self.buffer or 0

    def setstate(self, state):
        """Restore the buffer from a getstate() value (0 means empty)."""
        self.buffer = state or ""

250

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

251

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """

299

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

300

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered bytes plus input via _buffer_decode() and
        return the decoded result; whatever _buffer_decode() reports as
        not consumed is kept in self.buffer for the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the decoder and discard any buffered bytes."""
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """Return (buffered bytes, 0)."""
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """Restore the buffer from the first item of a getstate() tuple."""
        # ignore additional state info
        self.buffer = state[0]

335

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

336

#

337

# The StreamWriter and StreamReader classes provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

338

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

339

# very easily. See encodings/utf_8.py for an example on how this is

340

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

341

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

342

343

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # The consumed count reported by encode() is not used here.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the underlying stream's position.

            The codec is reset only when seeking to the very start of
            the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Return the writer itself for use in a with statement.
        """
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream when the with block ends.
        """
        self.stream.close()

412

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

413

###

414

415

class StreamReader(Codec):

416

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

417

charbuffertype = str

418

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

419

def __init__(self, stream, errors='strict'):

    """ Creates a StreamReader instance.

        stream must be a file-like object open for reading.

        The StreamReader may use different error handling
        schemes by providing the errors keyword argument. These
        parameters are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace'- replace with a suitable replacement character

        The set of allowed parameter values can be extended via
        register_error.
    """
    self.stream = stream
    self.errors = errors
    # undecoded bytes left over from the previous decode() call
    self.bytebuffer = b""
    # an empty instance of charbuffertype, used to clear the character
    # buffer and as the separator when joining buffered pieces
    self._empty_charbuffer = self.charbuffertype()
    # decoded data that has not been handed out yet
    self.charbuffer = self._empty_charbuffer
    # list of already split lines cached by readline(), or None
    self.linebuffer = None

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

442

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

443

def decode(self, input, errors='strict'):
    """ Decode input and return a tuple (output object, length consumed).

        This placeholder always raises NotImplementedError; read() calls
        self.decode(), so a concrete reader must supply a real
        implementation.
    """
    raise NotImplementedError

445

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

446

def read(self, size=-1, chars=-1, firstline=False):

    """ Decodes data from the stream self.stream and returns the
        resulting object.

        chars indicates the number of decoded code points or bytes to
        return. read() will never return more data than requested,
        but it might return less, if there is not enough available.

        size indicates the approximate maximum number of encoded
        bytes or code points to read for decoding. The decoder
        can modify this setting as appropriate. The default value
        -1 indicates to read and decode as much as possible.  size
        is intended to prevent having to decode huge files in one
        step.

        If firstline is true, and a UnicodeDecodeError happens
        after the first line terminator in the input only the first line
        will be returned, the rest of the input will be kept until the
        next call to read().

        The method should use a greedy read strategy, meaning that
        it should read as much data as is allowed within the
        definition of the encoding and the given size, e.g.  if
        optional encoding endings or state markers are available
        on the stream, these should be read too.
    """
    # If we have lines cached, first merge them back into characters
    if self.linebuffer:
        self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
        self.linebuffer = None

    # read until we get the required number of characters (if available)
    while True:
        # can the request be satisfied from the character buffer?
        if chars >= 0:
            if len(self.charbuffer) >= chars:
                break
        elif size >= 0:
            if len(self.charbuffer) >= size:
                break
        # we need more data
        if size < 0:
            newdata = self.stream.read()
        else:
            newdata = self.stream.read(size)
        # decode bytes (those remaining from the last call included)
        data = self.bytebuffer + newdata
        if not data:
            break
        try:
            newchars, decodedbytes = self.decode(data, self.errors)
        except UnicodeDecodeError as exc:
            if firstline:
                # Decode only the part before the error; the error is
                # suppressed only if that part holds more than one line.
                newchars, decodedbytes = \
                    self.decode(data[:exc.start], self.errors)
                lines = newchars.splitlines(keepends=True)
                if len(lines)<=1:
                    raise
            else:
                raise
        # keep undecoded bytes until the next call
        self.bytebuffer = data[decodedbytes:]
        # put new characters in the character buffer
        self.charbuffer += newchars
        # there was no data available
        if not newdata:
            break
    if chars < 0:
        # Return everything we've got
        result = self.charbuffer
        self.charbuffer = self._empty_charbuffer
    else:
        # Return the first chars characters
        result = self.charbuffer[:chars]
        self.charbuffer = self.charbuffer[chars:]
    return result

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

523

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

524

def readline(self, size=None, keepends=True):

    """ Read one line from the input stream and return the
        decoded data.

        size, if given, is passed as size argument to the
        read() method.

        If keepends is false, the line ending is stripped from the
        returned line.

    """
    # If we have lines cached from an earlier read, return
    # them unconditionally
    if self.linebuffer:
        line = self.linebuffer[0]
        del self.linebuffer[0]
        if len(self.linebuffer) == 1:
            # revert to charbuffer mode; we might need more data
            # next time
            self.charbuffer = self.linebuffer[0]
            self.linebuffer = None
        if not keepends:
            line = line.splitlines(keepends=False)[0]
        return line

    readsize = size or 72
    line = self._empty_charbuffer
    # If size is given, we call read() only once
    while True:
        data = self.read(readsize, firstline=True)
        if data:
            # If we're at a "\r" read one extra character (which might
            # be a "\n") to get a proper line ending. If the stream is
            # temporarily exhausted we return the wrong line ending.
            if (isinstance(data, str) and data.endswith("\r")) or \
               (isinstance(data, bytes) and data.endswith(b"\r")):
                data += self.read(size=1, chars=1)

        line += data
        lines = line.splitlines(keepends=True)
        if lines:
            if len(lines) > 1:
                # More than one line result; the first line is a full line
                # to return
                line = lines[0]
                del lines[0]
                if len(lines) > 1:
                    # cache the remaining lines
                    lines[-1] += self.charbuffer
                    self.linebuffer = lines
                    self.charbuffer = None
                else:
                    # only one remaining line, put it back into charbuffer
                    self.charbuffer = lines[0] + self.charbuffer
                if not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            line0withend = lines[0]
            line0withoutend = lines[0].splitlines(keepends=False)[0]
            if line0withend != line0withoutend: # We really have a line end
                # Put the rest back together and keep it until the next call
                self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                  self.charbuffer
                if keepends:
                    line = line0withend
                else:
                    line = line0withoutend
                break
        # we didn't get anything or this was our only try
        if not data or size is not None:
            if line and not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        # Grow the read size for the next attempt, but stop doubling once
        # it has reached 8000.
        if readsize < 8000:
            readsize *= 2
    return line

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

598

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

599

def readlines(self, sizehint=None, keepends=True):

    """ Decode everything that is left on the input stream and
        return it as a list of lines.

        The remaining input is fetched with a single read() call
        and the decoded result is split at line boundaries.  The
        line ends stay part of the list entries unless keepends
        is false.

        sizehint is accepted but ignored.

    """
    decoded = self.read()
    lines = decoded.splitlines(keepends)
    return lines

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

    """ Drop everything held in the codec buffers.

        The pending byte buffer, the decoded character buffer and
        the cached line list are all returned to their empty
        state.  The underlying stream is not touched, so no
        repositioning takes place; the main use is recovering
        after a decoding error.

    """
    self.linebuffer = None
    self.charbuffer = self._empty_charbuffer
    self.bytebuffer = b""

Walter Dörwald

729c31f

2005-03-14 19:06:30 +0000

[diff] [blame]

626

Walter Dörwald

71fd90d

2005-03-14 19:25:41 +0000

[diff] [blame]

627

def seek(self, offset, whence=0):

    """ Move the underlying stream to a new position.

        offset and whence are handed to the stream's own seek()
        unchanged.  Afterwards the codec buffers are reset, since
        whatever they held belongs to the old position.

    """
    underlying = self.stream
    underlying.seek(offset, whence)
    self.reset()

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

634

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

635

def __next__(self):

    """ Fetch one more decoded line via readline(); an empty
        result ends the iteration with StopIteration."""
    line = self.readline()
    if not line:
        raise StopIteration
    return line

def __iter__(self):
    """ The reader is its own iterator; lines come from __next__()."""
    return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

646

def __getattr__(self, name,

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

647

getattr=getattr):

648

649

""" Inherit all other methods from the underlying stream.

650

"""

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

651

return getattr(self.stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

652

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

def __enter__(self):
    """ Entering a with block yields the reader itself."""
    return self

def __exit__(self, type, value, tb):
    """ Leaving a with block closes the wrapped stream.

        The exception details are not inspected; nothing is
        returned, so a pending exception keeps propagating.
    """
    underlying = self.stream
    underlying.close()

658

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

659

###

660

661

class StreamReaderWriter:

    """ Combine a stream reader and a stream writer over one stream
        that is opened for both reading and writing.

        The Reader and Writer factories handed to the constructor
        can be the ones found on the object returned by
        codecs.lookup().

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Build the wrapper around stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.  Each one is called as
            factory(stream, errors).

            errors selects the error handling scheme and is stored
            on the instance as well.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # --- reading: everything is delegated to the reader ---

    def read(self, size=-1):

        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Hand out the next decoded line produced by the reader."""
        return next(self.reader)

    def __iter__(self):
        return self

    # --- writing: everything is delegated to the writer ---

    def write(self, data):

        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        # Reader first, then writer.
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Buffered reader state belongs to the old position.
        self.reader.reset()
        # The writer is only reset when seeking to the very start.
        at_start = (whence == 0 and offset == 0)
        if at_start:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Anything not defined here is looked up on the wrapped
            stream.
        """
        stream = self.stream
        return getattr(stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

745

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the result
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encode function.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read all remaining data, re-encode it and return it
            split into lines (line ends included).

            sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the chunks in list, decode the result with
            the frontend decode function and write it through the
            backend writer.
        """
        chunks = tuple(list)
        # The frontend normally carries bytes, so the chunks must be
        # joined with b''; joining with '' raises TypeError for bytes.
        # A str joiner is kept for codecs whose frontend is text.
        if chunks and isinstance(chunks[0], str):
            empty = ''
        else:
            empty = b''
        data = empty.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the state of both the backend reader and writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

855

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

856

### Shortcuts

857

Nick Coghlan

2015-01-07 00:22:00 +1000

[diff] [blame]

858

def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the file object returned by the builtin
        open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API
        and is passed through to it unchanged.  It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    # NOTE(review): buffering=1 is combined with a binary mode here
    # whenever an encoding is given -- confirm against the builtin
    # open() documentation that this is still the intended default.
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the file handle when the wrapper cannot be built
        # (e.g. unknown encoding).
        file.close()
        raise

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

899

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

900

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Writing: data handed to the wrapper is decoded with
        data_encoding and then written to the underlying file
        encoded with file_encoding.  The intermediate data type
        is usually Unicode but depends on the chosen codecs.

        Reading: bytes coming from the file are decoded with
        file_encoding and returned to the caller encoded with
        data_encoding.

        file_encoding defaults to data_encoding when it is not
        given.

        errors selects the error handling scheme; the default
        'strict' causes ValueErrors to be raised when an encoding
        error occurs.

        The wrapper carries two extra attributes, .data_encoding
        and .file_encoding, holding the encoding names that were
        used, for introspection by Python programs.

    """
    effective_file_encoding = (
        data_encoding if file_encoding is None else file_encoding)
    frontend = lookup(data_encoding)
    backend = lookup(effective_file_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = effective_file_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

935

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

936

### Helpers for codec lookup

937

938

def getencoder(encoding):

    """ Return the encoder function of the codec registered for
        the given encoding.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

947

948

def getdecoder(encoding):

    """ Return the decoder function of the codec registered for
        the given encoding.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode

957

958

def getincrementalencoder(encoding):

    """ Return the IncrementalEncoder class or factory function of
        the codec registered for the given encoding.

        A LookupError is raised when the encoding cannot be found
        or when the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

971

972

def getincrementaldecoder(encoding):

    """ Return the IncrementalDecoder class or factory function of
        the codec registered for the given encoding.

        A LookupError is raised when the encoding cannot be found
        or when the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

985

986

def getreader(encoding):

    """ Return the StreamReader class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

995

996

def getwriter(encoding):

    """ Return the StreamWriter class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter

1005

1006

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string taken from the iterator to an
    IncrementalEncoder for the given encoding and yields each
    non-empty chunk of output.  After the input is exhausted the
    encoder is flushed and any remaining output is yielded too.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call: flush whatever the encoder still holds.
    tail = encoder.encode("", True)
    if tail:
        yield tail

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item taken from the iterator to an
    IncrementalDecoder for the given encoding and yields each
    non-empty piece of output.  After the input is exhausted the
    decoder is flushed and any remaining output is yielded too.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call: flush whatever the decoder still holds.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1041

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1042

### Helpers for charmap-based codecs

1043

1044

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1053

Marc-André Lemburg

2001-05-16 09:41:45 +0000

[diff] [blame]

1054

def make_encoding_map(decoding_map):

    """ Invert a decoding map to obtain an encoding map.

        Every value of decoding_map becomes a key that maps back
        to its original key.  A value that appears more than once
        is ambiguous and is mapped to None (undefined mapping)
        instead, causing an exception when encountered by the
        charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # Seen before: the reverse mapping is ambiguous.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1074

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

1075

### error handlers

1076

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1077

try:
    # Look up the five standard error handlers by name.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

1090

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1091

# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the import below is never executed; it is
# only present so that tools scanning this module for import
# statements see the dependency.
_false = 0
if _false:
    import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1097

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1098

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1099

if __name__ == '__main__':

1100

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1101

# Make stdout translate Latin-1 output into UTF-8 output

1102

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1103

Guido van Rossum