Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.

2

3

4

Written by Marc-Andre Lemburg (mal@lemburg.com).

"""#"

Georg Brandl

2007-12-02 09:40:06 +0000

[diff] [blame]

10

import builtins, sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

11

12

### Registry and builtin stateless codec functions

13

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

14

try:
    # Pull the registry and builtin stateless codec functions into this
    # module's namespace.
    from _codecs import *
except ImportError as why:
    # Chain explicitly so the original ImportError is recorded as the
    # direct cause of the SystemError raised here.
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

18

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

19

# Names exported by "from codecs import *".
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks (including the old, misnamed BOM32_*/BOM64_* aliases)
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # error handling callbacks and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "register_error", "lookup_error",
]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

26

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

27

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian (BOM_LE is an alias for the same object)
BOM_UTF16_LE = b'\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian (BOM_BE is an alias for the same object)
BOM_UTF16_BE = b'\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM = BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE

# UTF-32, native endianness
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names these are the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

71

72

73

### Codec base classes (defining the API)

74

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

75

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The instance itself is the 4-tuple
    (encode, decode, streamreader, streamwriter).  The same four objects,
    the incremental encoder/decoder and the codec name are additionally
    stored as attributes of the instance.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    #
    # Class-level default: codecs are assumed to be text encodings unless
    # an instance overrides the flag.
    _is_text_encoding = True

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Build the 4-tuple and attach all codec entry points to it.

        _is_text_encoding is keyword-only; when it is left as None the
        class-level default stays in effect for the new instance.
        """
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        # stateless entry points
        info.encode = encode
        info.decode = decode
        # incremental entry points
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        # stream entry points
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        """Return '<module.qualname object for encoding NAME at 0xADDR>'."""
        klass = self.__class__
        fields = (klass.__module__, klass.__qualname__, self.name, id(self))
        return "<%s.%s object for encoding %s at %#x>" % fields

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

105

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

106

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods take an errors argument that
        selects the error handling scheme. These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling to apply; the default is
            'strict' handling.

            Implementations may not store state in the Codec instance.
            Codecs that have to keep state in order to make encoding
            efficient should use StreamWriter instead.

            An implementation must accept zero length input and return
            an empty object of the output object type for it.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default is
            'strict' handling.

            Implementations may not store state in the Codec instance.
            Codecs that have to keep state in order to make decoding
            efficient should use StreamReader instead.

            An implementation must accept zero length input and return
            an empty object of the output object type for it.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

170

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

171

class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.

    The input is handed to encode() piece by piece; the encoder keeps the
    state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the docstring of the Codec class for the predefined
        values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be implemented by subclasses; this base version always
        raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """

210

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

211

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder subclass for encoders that cannot always consume
    all of their input at once: whatever _buffer_encode() leaves
    unconsumed is kept in self.buffer and prepended to the input of the
    next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet; carried over to the next
        # encode() call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError()

    def encode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        outcome = self._buffer_encode(pending, self.errors, final)
        (encoded, used) = outcome
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        return self.buffer or 0

    def setstate(self, state):
        # a false state (such as 0) restores the empty buffer
        self.buffer = state or ""

244

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

245

class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.

    The input is handed to decode() piece by piece; the decoder keeps the
    state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the docstring of the Codec class for the predefined
        values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be implemented by subclasses; this base version always
        raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).

        The base implementation always returns (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """

293

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

294

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder subclass for decoders that must cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the input of the
    next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet; carried over to the next
        # decode() call
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError()

    def decode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        outcome = self._buffer_decode(pending, self.errors, final)
        (decoded, used) = outcome
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state info
        # is ignored
        self.buffer = state[0]

329

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

330

#

331

# The StreamWriter and StreamReader classes provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

332

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

333

# very easily. See encodings/utf_8.py for an example on how this is

334

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

335

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

336

337

class StreamWriter(Codec):
    """ Codec that encodes objects and writes the result to a stream.

        Attributes that are not found on the writer itself are looked
        up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that write()
            passes to encode(). These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encodes object with self.errors and writes the encoded
            data to self.stream.
        """
        encoded = self.encode(object, self.errors)[0]
        self.stream.write(encoded)

    def writelines(self, list):

        """ Joins the strings in list and writes the result to the
            stream with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """

    def seek(self, offset, whence=0):
        """ Repositions the underlying stream; the codec is reset only
            when seeking to the very start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        back_to_start = (whence == 0 and offset == 0)
        if back_to_start:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    def __enter__(self):
        # the writer itself is the context object
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()

407

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

408

###

409

410

class StreamReader(Codec):

411

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

412

charbuffertype = str

413

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

414

def __init__(self, stream, errors='strict'):

    """ Creates a StreamReader instance.

        stream must be a file-like object open for reading
        (binary) data.

        errors selects the error handling scheme and is stored on
        the instance. These values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace'- replace with a suitable replacement character;

        Further values can be made available via register_error.
    """
    self.errors = errors
    self.stream = stream
    # bytes read from the stream that have not been decoded yet
    self.bytebuffer = b""
    # empty value of the decoded type, used to reset and join buffers
    empty = self.charbuffertype()
    self._empty_charbuffer = empty
    # decoded data that has not been handed out yet
    self.charbuffer = empty
    # list of cached lines, or None when no lines are cached
    self.linebuffer = None

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

438

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

439

def decode(self, input, errors='strict'):
    # Placeholder: this version always raises NotImplementedError.
    raise NotImplementedError()

441

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

442

def read(self, size=-1, chars=-1, firstline=False):

    """ Decodes data from the stream self.stream and returns the
        resulting object.

        chars is the number of characters to return. read() never
        returns more than chars characters, but may return fewer if
        not enough are available. A negative value returns everything
        that has been decoded.

        size is the approximate maximum number of bytes to read from
        the stream per read call for decoding purposes. The default
        value -1 means to read and decode as much as possible. size
        is intended to prevent having to decode huge files in one
        step.

        If firstline is true and a UnicodeDecodeError happens after
        the first line terminator in the input, only the data in
        front of the error is decoded and kept; the rest of the input
        stays buffered until the next call to read(). Otherwise the
        error propagates.
    """
    # Lines cached earlier are folded back into the character buffer
    # before anything else happens.
    if self.linebuffer:
        self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
        self.linebuffer = None

    # keep reading until the request can be satisfied (if possible)
    while True:
        # chars takes precedence as the stop criterion; size is only
        # consulted when chars is negative.
        wanted = chars if chars >= 0 else size
        if wanted >= 0 and len(self.charbuffer) >= wanted:
            break
        # more input is needed: pull it from the stream
        chunk = self.stream.read() if size < 0 else self.stream.read(size)
        # bytes left undecoded by the previous round are decoded again
        # together with the new ones
        pending = self.bytebuffer + chunk
        if not pending:
            break
        try:
            decoded, consumed = self.decode(pending, self.errors)
        except UnicodeDecodeError as exc:
            if not firstline:
                raise
            # Retry with only the bytes in front of the error position
            # and accept the result only if it spans more than one line.
            decoded, consumed = self.decode(pending[:exc.start],
                                            self.errors)
            if len(decoded.splitlines(keepends=True)) <= 1:
                raise
        # undecoded bytes wait for the next round / the next call
        self.bytebuffer = pending[consumed:]
        # newly decoded characters go to the end of the character buffer
        self.charbuffer += decoded
        # the stream had nothing more to offer
        if not chunk:
            break

    if chars < 0:
        # hand out everything that has been decoded
        result, self.charbuffer = self.charbuffer, self._empty_charbuffer
    else:
        # hand out the first chars characters and keep the rest
        result = self.charbuffer[:chars]
        self.charbuffer = self.charbuffer[chars:]
    return result

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

520

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

521

def readline(self, size=None, keepends=True):

    """ Read one line from the input stream and return the
        decoded data.

        size, if given, is passed as size argument to the
        read() method and read() is then called only once. Without
        it, reading starts with 72 bytes and the amount is doubled
        for every further attempt while it is below 8000.

        If keepends is false the line ending is stripped from the
        returned line.
    """
    # Lines cached by an earlier call are returned unconditionally.
    if self.linebuffer:
        line = self.linebuffer.pop(0)
        if len(self.linebuffer) == 1:
            # Only one cached line left: go back to charbuffer mode,
            # more data might be needed next time.
            self.charbuffer = self.linebuffer[0]
            self.linebuffer = None
        if not keepends:
            line = line.splitlines(keepends=False)[0]
        return line

    readsize = size or 72
    line = self._empty_charbuffer
    while True:
        data = self.read(readsize, firstline=True)
        if data:
            # If we're at a "\r" read one extra character (which might
            # be a "\n") to get a proper line ending. If the stream is
            # temporarily exhausted we return the wrong line ending.
            if isinstance(data, str):
                ends_with_cr = data.endswith("\r")
            elif isinstance(data, bytes):
                ends_with_cr = data.endswith(b"\r")
            else:
                ends_with_cr = False
            if ends_with_cr:
                data += self.read(size=1, chars=1)

        line += data
        lines = line.splitlines(keepends=True)
        if len(lines) > 1:
            # Several lines came back; the first one is complete and is
            # the one to return.
            line = lines.pop(0)
            if len(lines) > 1:
                # More than one line remains: cache them, with the
                # unread characters attached to the last one.
                lines[-1] += self.charbuffer
                self.linebuffer = lines
                self.charbuffer = None
            else:
                # A single remaining line goes back in front of the
                # character buffer.
                self.charbuffer = lines[0] + self.charbuffer
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        elif lines:
            with_end = lines[0]
            without_end = lines[0].splitlines(keepends=False)[0]
            if with_end != without_end:
                # The only line really has a line end: keep whatever
                # follows it until the next call.
                self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                  self.charbuffer
                line = with_end if keepends else without_end
                break
        # Nothing came back, or size was given and this was the only try.
        if not data or size is not None:
            if line and not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        if readsize < 8000:
            readsize *= 2
    return line

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

595

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

596

def readlines(self, sizehint=None, keepends=True):

    """ Read everything available on the input stream and
        return it as a list of lines.

        The stream is consumed with a single self.read() call and
        the decoded result is split with splitlines().  keepends
        is handed to splitlines() and decides whether the line
        endings stay attached to the list entries.

        sizehint is accepted for interface compatibility but is
        not used.

    """
    decoded = self.read()
    lines = decoded.splitlines(keepends)
    return lines

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

    """ Drop all state buffered by the reader.

        The line buffer is cleared, the character buffer is set
        back to self._empty_charbuffer and the byte buffer to an
        empty bytes object.

        The underlying stream is not touched, so no repositioning
        takes place.  The main use is recovering after a decoding
        error.

    """
    self.linebuffer = None
    self.charbuffer = self._empty_charbuffer
    self.bytebuffer = b""

Walter Dörwald

729c31f

2005-03-14 19:06:30 +0000

[diff] [blame]

623

Walter Dörwald

71fd90d

2005-03-14 19:25:41 +0000

[diff] [blame]

624

def seek(self, offset, whence=0):
    """ Reposition the underlying input stream.

        offset and whence are passed unchanged to self.stream.seek().
        Afterwards self.reset() is called so that data buffered
        before the seek is discarded.
    """
    underlying = self.stream
    underlying.seek(offset, whence)
    self.reset()

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

631

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

632

def __next__(self):

    """ Return the next decoded line from the input stream.

        Raises StopIteration once self.readline() returns an
        empty result.
    """
    line = self.readline()
    if not line:
        raise StopIteration
    return line

def __iter__(self):
    """ Return the reader itself; it is its own iterator."""
    return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

643

def __getattr__(self, name,

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

644

getattr=getattr):

645

646

""" Inherit all other methods from the underlying stream.

647

"""

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

648

return getattr(self.stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

649

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

def __enter__(self):
    """ Enter a with block; the reader itself is the context value."""
    return self

def __exit__(self, type, value, tb):
    """ Leave a with block by closing the underlying stream.

        The exception details are ignored and nothing is returned,
        so an exception raised inside the block keeps propagating.
    """
    underlying = self.stream
    underlying.close()

655

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

656

###

657

658

class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        Reading is delegated to a reader object and writing to a
        writer object, both created around the same stream.  The
        factory functions returned by codecs.lookup() can be used
        as the Reader and Writer arguments.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Delegate to the reader's read()."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Delegate to the reader's readline()."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Delegate to the reader's readlines()."""
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        """ The wrapper is its own iterator."""
        return self

    def write(self, data):
        """ Delegate to the writer's write()."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Delegate to the writer's writelines()."""
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        """ Reset the reader first, then the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Seek on the underlying stream and reset the reader.

            The writer is reset as well, but only for an absolute
            seek to position 0.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Attributes not found on the wrapper are looked up on
            the underlying stream.
        """
        underlying = self.stream
        return getattr(underlying, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        underlying = self.stream
        underlying.close()

742

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Read via the backend reader and return the data
            re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, _consumed = self.encode(data, self.errors)
        return data

    def readline(self, size=None):
        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _consumed = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):
        """ Read everything via the backend reader, re-encode it and
            return it split into lines (line ends are kept).

            sizehint is accepted but not used.
        """
        data = self.reader.read()
        data, _consumed = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder.
        """
        data = next(self.reader)
        data, _consumed = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):
        """ Decode data with the frontend decoder and hand the result
            to the backend writer.
        """
        data, _consumed = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):
        """ Concatenate the given chunks, decode them with the
            frontend decoder and hand the result to the backend writer.
        """
        chunks = tuple(list)
        # The frontend normally receives bytes, so the chunks have to be
        # joined with a bytes separator; joining them with '' raised a
        # TypeError.  A str separator is still used when the chunks are
        # str, so text-to-text codecs keep working as before.
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, _consumed = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):
        """ Reset the backend reader first, then the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Seek on the underlying stream and discard buffered state.

            Without this method a seek() call would be forwarded by
            __getattr__ straight to the stream, leaving data buffered
            by the reader in place, so later reads would return stale
            data.  The reader is always reset; the writer is reset only
            for an absolute seek to position 0, mirroring
            StreamReaderWriter.seek().

            Returns whatever the stream's seek() returns.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

858

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

859

### Shortcuts

860

Marc-André Lemburg

349a3d3

2000-06-21 21:21:04 +0000

[diff] [blame]

861

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the object returned by the builtin
        open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed on to the builtin open() API.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or creating the wrapper fails, the
        already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    except BaseException:
        # Do not leak the open file handle when the encoding is unknown
        # or the wrapper cannot be constructed.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

904

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

905

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is interpreted according to
        data_encoding and stored in the original file using
        file_encoding.  Data read from the file is interpreted
        according to file_encoding and handed back to the caller
        using data_encoding.  The intermediate form will usually be
        Unicode but depends on the specified codecs.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the encoding
        names that were used, for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

940

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

941

### Helpers for codec lookup

942

943

def getencoder(encoding):

    """ Return the encoder function of the codec registered for
        the given encoding.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

952

953

def getdecoder(encoding):

    """ Return the decoder function of the codec registered for
        the given encoding.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode

962

963

def getincrementalencoder(encoding):

    """ Return the IncrementalEncoder class or factory function of
        the codec registered for the given encoding.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

976

977

def getincrementaldecoder(encoding):

    """ Return the IncrementalDecoder class or factory function of
        the codec registered for the given encoding.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

990

991

def getreader(encoding):

    """ Return the StreamReader class or factory function of the
        codec registered for the given encoding.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1000

1001

def getwriter(encoding):

    """ Return the StreamWriter class or factory function of the
        codec registered for the given encoding.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter

1010

1011

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from the iterator to one IncrementalEncoder and
    yields each non-empty piece of output.  After the input is
    exhausted the encoder is called one last time with an empty string
    and final=True, and that output is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    # filter(None, ...) drops the empty results, map() keeps evaluation lazy
    for encoded in filter(None, map(encoder.encode, iterator)):
        yield encoded
    trailer = encoder.encode("", True)
    if trailer:
        yield trailer

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every input chunk from the iterator to one IncrementalDecoder
    and yields each non-empty piece of output.  After the input is
    exhausted the decoder is called one last time with empty bytes and
    final=True, and that output is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    # filter(None, ...) drops the empty results, map() keeps evaluation lazy
    for decoded in filter(None, map(decoder.decode, iterator)):
        yield decoded
    trailer = decoder.decode(b"", True)
    if trailer:
        yield trailer

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1046

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1047

### Helpers for charmap-based codecs

1048

1049

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of rng is both
        a key and the value stored under that key.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1058

Marc-André Lemburg

716cf91

2001-05-16 09:41:45 +0000

[diff] [blame]

1059

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Every value of the decoding map becomes a key that maps
        back to the key it was stored under.  If a value occurs
        more than once in the decoding map, it is mapped to None
        (undefined mapping) instead, causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # a target seen before is ambiguous and becomes undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1079

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1080

### error handlers

1081

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1082

try:
    # Bind each standard error handler to a module-level name by looking
    # it up under its registered name.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # the names are still defined, but bound to None.
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1095

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1096

# Tell modulefinder that using codecs probably needs the encodings
# package
# _false is 0, so the import below is never executed at runtime; it is
# here only so that the import statement appears in the module source.
_false = 0
if _false:
    import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1102

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1103

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1104

if __name__ == '__main__':

1105

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1106

# Make stdout translate Latin-1 output into UTF-8 output

1107

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1108

Guido van Rossum