Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

"""#"

Georg Brandl

2007-12-02 09:40:06 +0000

[diff] [blame]

10

import builtins, sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

11

12

### Registry and builtin stateless codec functions

13

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

14

try:

15

from _codecs import *

Guido van Rossum

b940e11

2007-01-10 16:19:56 +0000

[diff] [blame]

16

except ImportError as why:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

17

raise SystemError('Failed to load the builtin codecs: %s' % why)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

18

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

19

# Public names exported by ``from codecs import *``.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

26

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

27

### Constants

28

29

#

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

30

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)

31

# and its possible byte string values

32

# for UTF8/UTF16/UTF32 output and little/big endian machines

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

33

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

34

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

35

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# The "native" aliases follow the byte order of the running interpreter.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

71

72

73

### Codec base classes (defining the API)

74

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

75

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The instance is a 4-tuple ``(encode, decode, streamreader,
    streamwriter)`` that additionally exposes those entries, the
    incremental encoder/decoder factories and the codec name as
    attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Build the 4-tuple and attach the codec entry points as attributes.

        _is_text_encoding overrides the class-level default only when it
        is not None.
        """
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            # Shadow the class attribute on this instance only.
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

105

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

106

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

170

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

171

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

210

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

211

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0, the base class' initial state.
        return self.buffer or 0

    def setstate(self, state):
        # Accept the 0 produced by getstate() for an empty buffer.
        self.buffer = state or ""

244

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

245

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

293

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

294

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

329

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

330

#

331

# The StreamWriter and StreamReader classes provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

332

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

333

# very easily. See encodings/utf_8.py for an example on how this is

334

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

335

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

336

337

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        """ Repositions the underlying stream.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()

407

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

408

###

409

410

class StreamReader(Codec):

411

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

412

charbuffertype = str

413

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

414

def __init__(self, stream, errors='strict'):

    """ Creates a StreamReader instance.

        stream must be a file-like object open for reading
        (binary) data.

        The StreamReader may use different error handling
        schemes by providing the errors keyword argument. These
        parameters are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace'- replace with a suitable replacement character.

        The set of allowed parameter values can be extended via
        register_error.
    """
    self.stream = stream
    self.errors = errors
    # undecoded bytes kept between calls to read()
    self.bytebuffer = b""
    # empty value of the decoded type; also used as the join separator
    self._empty_charbuffer = self.charbuffertype()
    self.charbuffer = self._empty_charbuffer
    # list of already split lines cached by readline(), or None
    self.linebuffer = None

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

438

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

439

def decode(self, input, errors='strict'):
    """ Decodes input and returns a (decoded object, length consumed)
        tuple; read() unpacks the result that way.

        Must be overridden; this base implementation always raises
        NotImplementedError.
    """
    raise NotImplementedError

441

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

442

def read(self, size=-1, chars=-1, firstline=False):

    """ Decodes data from the stream self.stream and returns the
        resulting object.

        chars indicates the number of characters to read from the
        stream. read() will never return more than chars
        characters, but it might return less, if there are not enough
        characters available.

        size indicates the approximate maximum number of bytes to
        read from the stream for decoding purposes. The decoder
        can modify this setting as appropriate. The default value
        -1 indicates to read and decode as much as possible.  size
        is intended to prevent having to decode huge files in one
        step.

        If firstline is true, and a UnicodeDecodeError happens
        after the first line terminator in the input only the first line
        will be returned, the rest of the input will be kept until the
        next call to read().

        The method should use a greedy read strategy meaning that
        it should read as much data as is allowed within the
        definition of the encoding and the given size, e.g.  if
        optional encoding endings or state markers are available
        on the stream, these should be read too.
    """
    # If we have lines cached, first merge them back into characters
    if self.linebuffer:
        self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
        self.linebuffer = None

    # read until we get the required number of characters (if available)
    while True:
        # can the request be satisfied from the character buffer?
        if chars < 0:
            if size < 0:
                if self.charbuffer:
                    break
            elif len(self.charbuffer) >= size:
                break
        else:
            if len(self.charbuffer) >= chars:
                break
        # we need more data
        if size < 0:
            newdata = self.stream.read()
        else:
            newdata = self.stream.read(size)
        # decode bytes (those remaining from the last call included)
        data = self.bytebuffer + newdata
        try:
            newchars, decodedbytes = self.decode(data, self.errors)
        except UnicodeDecodeError as exc:
            if firstline:
                # Retry with only the bytes before the error; give up
                # unless that yields more than one line.
                newchars, decodedbytes = \
                    self.decode(data[:exc.start], self.errors)
                lines = newchars.splitlines(keepends=True)
                if len(lines)<=1:
                    raise
            else:
                raise
        # keep undecoded bytes until the next call
        self.bytebuffer = data[decodedbytes:]
        # put new characters in the character buffer
        self.charbuffer += newchars
        # there was no data available
        if not newdata:
            break
    if chars < 0:
        # Return everything we've got
        result = self.charbuffer
        self.charbuffer = self._empty_charbuffer
    else:
        # Return the first chars characters
        result = self.charbuffer[:chars]
        self.charbuffer = self.charbuffer[chars:]
    return result

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

521

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

522

def readline(self, size=None, keepends=True):

    """ Read one line from the input stream and return the
        decoded data.

        size, if given, is passed as size argument to the
        read() method.

        If keepends is false, the line terminator is stripped from
        the returned line.

    """
    # If we have lines cached from an earlier read, return
    # them unconditionally
    if self.linebuffer:
        line = self.linebuffer[0]
        del self.linebuffer[0]
        if len(self.linebuffer) == 1:
            # revert to charbuffer mode; we might need more data
            # next time
            self.charbuffer = self.linebuffer[0]
            self.linebuffer = None
        if not keepends:
            line = line.splitlines(keepends=False)[0]
        return line

    readsize = size or 72
    line = self._empty_charbuffer
    # If size is given, we call read() only once
    while True:
        data = self.read(readsize, firstline=True)
        if data:
            # If we're at a "\r" read one extra character (which might
            # be a "\n") to get a proper line ending. If the stream is
            # temporarily exhausted we return the wrong line ending.
            if (isinstance(data, str) and data.endswith("\r")) or \
               (isinstance(data, bytes) and data.endswith(b"\r")):
                data += self.read(size=1, chars=1)

        line += data
        lines = line.splitlines(keepends=True)
        if lines:
            if len(lines) > 1:
                # More than one line result; the first line is a full line
                # to return
                line = lines[0]
                del lines[0]
                if len(lines) > 1:
                    # cache the remaining lines
                    lines[-1] += self.charbuffer
                    self.linebuffer = lines
                    self.charbuffer = None
                else:
                    # only one remaining line, put it back into charbuffer
                    self.charbuffer = lines[0] + self.charbuffer
                if not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            line0withend = lines[0]
            line0withoutend = lines[0].splitlines(keepends=False)[0]
            if line0withend != line0withoutend: # We really have a line end
                # Put the rest back together and keep it until the next call
                self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                  self.charbuffer
                if keepends:
                    line = line0withend
                else:
                    line = line0withoutend
                break
        # we didn't get anything or this was our only try
        if not data or size is not None:
            if line and not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        # Grow the request size for the next attempt (stops doubling
        # once it has reached 8000).
        if readsize < 8000:
            readsize *= 2
    return line

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

596

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

597

def readlines(self, sizehint=None, keepends=True):

    """ Read everything available on the input stream and return
        it as a list of lines.

        The data is obtained with self.read() and split with
        splitlines(); line ends are kept in the list entries only
        if keepends is true.

        sizehint, if given, is ignored since there is no efficient
        way of finding the true end-of-line.

    """
    return self.read().splitlines(keepends)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

    """ Clear the codec buffers used for keeping state.

        The byte buffer, the character buffer and the cached line
        buffer are all emptied. The underlying stream is not
        repositioned.

        This method is primarily intended for recovering from
        decoding errors.

    """
    self.linebuffer = None
    self.charbuffer = self._empty_charbuffer
    self.bytebuffer = b""

Walter Dörwald

729c31f

2005-03-14 19:06:30 +0000

[diff] [blame]

624

Walter Dörwald

71fd90d

2005-03-14 19:25:41 +0000

[diff] [blame]

625

def seek(self, offset, whence=0):

    """ Move the underlying stream to a new position.

        offset and whence are handed to self.stream.seek()
        unchanged. Afterwards the codec buffers are reset, because
        data buffered before the seek no longer matches the new
        stream position.

    """
    stream = self.stream
    stream.seek(offset, whence)
    self.reset()

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

632

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

633

def __next__(self):

Walter Dörwald

2002-11-06 16:53:44 +0000

[diff] [blame]

634

635

""" Return the next decoded line from the input stream."""

636

line = self.readline()

if line:

return line

raise StopIteration

def __iter__(self):

return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

644

def __getattr__(self, name,

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

645

getattr=getattr):

646

647

""" Inherit all other methods from the underlying stream.

648

"""

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

649

return getattr(self.stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

650

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

def __enter__(self):

return self

def __exit__(self, type, value, tb):

655

self.stream.close()

656

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

657

###

658

659

class StreamReaderWriter:

    """ Wrapper for streams that are used in both read and write
        mode.

        Reading is delegated to a reader object and writing to a
        writer object; both are created around the same underlying
        stream. The factory functions returned by codecs.lookup()
        can be used to construct an instance.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively; each is called as factory(stream, errors).

            errors selects the error handling, in the same way as
            for StreamReader and StreamWriter.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: delegated to the reader ---

    def read(self, size=-1):
        """ Return the result of the reader's read(size)."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Return the result of the reader's readline(size)."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Return the result of the reader's readlines(sizehint)."""
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        """ Return self; the wrapper is its own line iterator."""
        return self

    # --- writing: delegated to the writer ---

    def write(self, data):
        """ Return the result of the writer's write(data)."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Return the result of the writer's writelines(list)."""
        writer = self.writer
        return writer.writelines(list)

    # --- state handling ---

    def reset(self):
        """ Reset the reader first, then the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream.

            The reader is always reset afterwards. The writer is
            reset only when seeking to the absolute start of the
            stream (offset 0 with whence 0).

        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all other attributes on the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Closes the stream whether or not an exception occurred.
        self.stream.close()

743

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and passed to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Read via the backend reader and return the data
            re-encoded with the frontend encode function."""
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):
        """ Read one line via the backend reader and return it
            re-encoded with the frontend encode function."""
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):
        """ Read everything via the backend reader, re-encode it and
            return it split into lines (line ends kept).

            sizehint is accepted for interface compatibility and
            ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):
        """ Decode data with the frontend decode function and pass
            the result to the backend writer."""
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):
        """ Concatenate the chunks in list, decode them with the
            frontend decode function and pass the result to the
            backend writer.
        """
        chunks = tuple(list)
        # Join with an empty object of the chunks' own type.  The
        # frontend normally handles bytes, so joining with '' (as this
        # method used to do) raised TypeError; chunks[0][:0] also keeps
        # str-based frontends working.
        if chunks:
            data = chunks[0][:0].join(chunks)
        else:
            data = b''
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):
        """ Reset the backend reader and then the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream and drop stale codec state.

            Without this method the call was forwarded by __getattr__
            straight to the stream, leaving previously buffered data
            in the reader.  The reset rules mirror
            StreamReaderWriter.seek(): the reader is always reset, the
            writer only when seeking to the absolute start.

            Returns whatever the stream's seek() returns.
        """
        result = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return result

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Closes the stream whether or not an exception occurred.
        self.stream.close()

859

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

860

### Shortcuts

861

Marc-André Lemburg

349a3d3

2000-06-21 21:21:04 +0000

[diff] [blame]

862

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        Its default value is 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises LookupError if the encoding is unknown; in that case
        the file opened here is closed again before the exception
        propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # The caller never receives the file object when the codec lookup
        # or wrapper construction fails, so close it here to avoid
        # leaking the handle.
        file.close()
        raise

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

905

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

906

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings transparently.

        Data written to the wrapped file is interpreted according
        to data_encoding and then written to the original file
        using file_encoding. The intermediate encoding will usually
        be Unicode but depends on the specified codecs.

        Data read from the file is decoded using file_encoding and
        passed back to the caller encoded with data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

941

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

942

### Helpers for codec lookup

943

944

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

953

954

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode

963

964

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

977

978

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

991

992

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1001

1002

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter

1011

1012

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call with an empty string and final=True flushes whatever
    # the encoder still holds.
    tail = encoder.encode("", True)
    if tail:
        yield tail

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input from the iterator using an IncrementalDecoder
    and yields every non-empty piece of output.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call with empty bytes and final=True flushes the decoder
    # (and lets it report incomplete trailing input).
    tail = decoder.decode(b"", True)
    if tail:
        yield tail

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1047

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1048

### Helpers for charmap-based codecs

1049

1050

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is mapped to itself.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1059

Marc-André Lemburg

716cf91

2001-05-16 09:41:45 +0000

[diff] [blame]

1060

def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        Each value of decoding_map becomes a key of the result,
        mapped to the key it came from. If a target occurs more
        than once in the decoding map, it is mapped to None
        (undefined mapping) instead, which causes an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous, so mark it undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1080

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1081

### error handlers

1082

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1083

try:
    # Bind the standard error handlers to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(_handler_name)
        for _handler_name in ("strict", "ignore", "replace",
                              "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # if any lookup fails, all five names are set to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1096

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1097

# Tell modulefinder that using codecs probably needs the encodings

# package

_false = 0

if _false:

import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1103

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1104

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1105

if __name__ == '__main__':

1106

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1107

# Make stdout translate Latin-1 output into UTF-8 output

1108

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1109

Guido van Rossum