Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.

2

3

4

Written by Marc-Andre Lemburg (mal@lemburg.com).

"""#"

Georg Brandl

2007-12-02 09:40:06 +0000

[diff] [blame]

10

import builtins, sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

11

12

### Registry and builtin stateless codec functions

13

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

14

try:

15

from _codecs import *

Guido van Rossum

b940e11

2007-01-10 16:19:56 +0000

[diff] [blame]

16

except ImportError as why:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

17

raise SystemError('Failed to load the builtin codecs: %s' % why)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

18

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

19

# Names exported by a star-import of this module.
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    # byte order mark constants
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # lookup helpers
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

32

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

33

### Constants

34

35

#

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

36

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)

37

# and its possible byte string values

38

# for UTF8/UTF16/UTF32 output and little/big endian machines

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

39

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

40

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

41

# UTF-8

Walter Dörwald

2007-05-04 13:05:09 +0000

[diff] [blame]

42

BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16: little endian mark, then big endian mark
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short spellings of the two UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32: little endian mark, then big endian mark
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native endianness: pick the UTF-16 / UTF-32 pair matching sys.byteorder
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

77

78

79

### Codec base classes (defining the API)

80

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

81

class CodecInfo(tuple):
    """Codec details handed out by codec registry lookups.

    The tuple part holds (encode, decode, streamreader, streamwriter);
    the same objects, plus the incremental classes and the codec name,
    are also available as instance attributes.
    """

    # Private flag that lets Python 3.4 mark the known non-Unicode codecs
    # of the standard library.  A more general way to tell text encodings
    # apart from other codecs is hoped for in Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # codecs count as text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Build the 4-tuple and attach the codec parts as attributes.

        _is_text_encoding only overrides the class-level default when a
        value other than None is passed.
        """
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        """Return a description naming the class, the encoding and id()."""
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

111

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

112

class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict' handling.

            The method may not store state in the Codec instance.
            Codecs that need to keep state to encode efficiently
            should use StreamWriter instead.

            Zero length input must be supported: the encoder then
            returns an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict' handling.

            The method may not store state in the Codec instance.
            Codecs that need to keep state to decode efficiently
            should use StreamReader instead.

            Zero length input must be supported: the decoder then
            returns an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

176

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

177

class IncrementalEncoder(object):
    """
    Encoder that works on its input in several steps.

    Pieces of the input are handed to encode() one after the other; the
    encoder keeps the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalEncoder.

        errors selects the error handling scheme (stored as self.errors);
        the Codec class docstring lists the predefined values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base version raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state (no-op here).
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Restore an encoder state previously obtained from getstate()
        (no-op here).
        """
        return None

216

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

217

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that have to hold back
    part of their input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over to the next encode() call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever the previous call could not encode
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        return self.buffer or 0

    def setstate(self, state):
        # a falsy state (such as 0) means "nothing buffered"
        self.buffer = state or ""

250

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

251

class IncrementalDecoder(object):
    """
    Decoder that works on its input in several steps.

    Pieces of the input are handed to decode() one after the other; the
    decoder keeps the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalDecoder.

        errors selects the error handling scheme (stored as self.errors);
        the Codec class docstring lists the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base version raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state (no-op here).
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer that
        describes the decoder state WITHOUT the contents of
        buffered_input having been processed.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder (no-op here).

        state must come from an earlier getstate() call.
        setstate((b"", 0)) must have the same effect as reset().
        """
        return None

299

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

300

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences at the end of a chunk.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes not yet decoded, carried over to the next decode() call
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever the previous call could not decode
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the additional state info slot is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; state[1] is ignored
        self.buffer = state[0]

335

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

336

#

337

# The StreamWriter and StreamReader classes provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

338

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

339

# very easily. See encodings/utf_8.py for an example on how this is

340

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

341

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

342

343

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed through self.encode() using
            self.errors; only the encoded data is written, the
            consumed length reported by the encoder is not used.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            The codec state is reset as well, but only when seeking
            to the very beginning (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the instance has no stream
            attribute at all.
        """
        # __getattr__ only runs after normal lookup failed.  If 'stream'
        # itself is missing (instance created without running __init__),
        # evaluating self.stream below would re-enter this method without
        # bound and end in a RecursionError, so fail cleanly instead.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        # context manager support: the writer itself is the target
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()

413

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

414

###

415

416

class StreamReader(Codec):

417

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

418

charbuffertype = str

419

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

420

def __init__(self, stream, errors='strict'):

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

421

422

""" Creates a StreamReader instance.

423

424

stream must be a file-like object open for reading

425

(binary) data.

426

Walter Dörwald

2002-11-19 21:42:53 +0000

[diff] [blame]

427

The StreamReader may use different error handling

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

428

schemes by providing the errors keyword argument. These

Walter Dörwald

2002-11-19 21:42:53 +0000

[diff] [blame]

429

parameters are predefined:

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

430

431

'strict' - raise a ValueError (or a subclass)

432

'ignore' - ignore the character and continue with the next

433

'replace'- replace with a suitable replacement character;

434

Walter Dörwald

2002-11-19 21:42:53 +0000

[diff] [blame]

435

The set of allowed parameter values can be extended via

436

register_error.

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

437

"""

438

self.stream = stream

439

self.errors = errors

Walter Dörwald

2007-05-04 13:05:09 +0000

[diff] [blame]

440

self.bytebuffer = b""

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

441

self._empty_charbuffer = self.charbuffertype()

442

self.charbuffer = self._empty_charbuffer

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

443

self.linebuffer = None

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

444

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

445

def decode(self, input, errors='strict'):
    """ Placeholder decoder: always raises NotImplementedError.

        read() calls self.decode(data, self.errors) and expects a
        (decoded characters, number of bytes consumed) tuple back.
    """
    raise NotImplementedError

447

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

448

def read(self, size=-1, chars=-1, firstline=False):

    """ Decode data from the stream self.stream and return the
        resulting object.

        chars is the number of characters to read from the stream.
        read() never returns more than chars characters, but it may
        return fewer when not enough characters are available.

        size is the approximate maximum number of bytes to read
        from the stream for decoding purposes. The decoder can
        modify this setting as appropriate. The default value -1
        means: read and decode as much as possible. size is meant
        to avoid having to decode huge files in one step.

        If firstline is true and a UnicodeDecodeError happens after
        the first line terminator in the input, only the part in
        front of the error is decoded; the remaining input is kept
        until the next call to read().

        The method should use a greedy read strategy, i.e. read as
        much data as is allowed within the definition of the
        encoding and the given size; optional encoding endings or
        state markers available on the stream should be read too.
    """
    # Lines cached by readline() are merged back into characters first
    if self.linebuffer:
        self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
        self.linebuffer = None

    # Number of buffered characters that satisfies the request:
    # chars when given, otherwise size (negative means "no limit")
    limit = chars if chars >= 0 else size

    # read until the required number of characters is there (if available)
    while True:
        # can the character buffer already satisfy the request?
        if limit >= 0 and len(self.charbuffer) >= limit:
            break
        # no: fetch more bytes
        newdata = self.stream.read() if size < 0 else self.stream.read(size)
        # decode them together with the bytes left over from last time
        data = self.bytebuffer + newdata
        if not data:
            break
        try:
            newchars, decodedbytes = self.decode(data, self.errors)
        except UnicodeDecodeError as exc:
            if not firstline:
                raise
            # decode only the part in front of the offending bytes and
            # accept it if it holds more than one line
            newchars, decodedbytes = \
                self.decode(data[:exc.start], self.errors)
            if len(newchars.splitlines(keepends=True)) <= 1:
                raise
        # undecoded bytes wait for the next call
        self.bytebuffer = data[decodedbytes:]
        # decoded characters go into the character buffer
        self.charbuffer += newchars
        # the stream had nothing more to offer
        if not newdata:
            break

    buffered = self.charbuffer
    if chars < 0:
        # hand out everything that is buffered
        self.charbuffer = self._empty_charbuffer
        return buffered
    # hand out the first chars characters, keep the rest
    self.charbuffer = buffered[chars:]
    return buffered[:chars]

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

526

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

527

def readline(self, size=None, keepends=True):

    """ Read one line from the input stream and return the
        decoded data.

        size, if given, is passed as size argument to the
        read() method; read() is then called only once.

        With keepends false the line ending is removed from the
        returned line.
    """
    # Lines cached by an earlier call are handed out unconditionally
    if self.linebuffer:
        cached = self.linebuffer.pop(0)
        if len(self.linebuffer) == 1:
            # back to charbuffer mode; more data might be needed
            # next time
            self.charbuffer = self.linebuffer[0]
            self.linebuffer = None
        if keepends:
            return cached
        return cached.splitlines(keepends=False)[0]

    readsize = size or 72
    line = self._empty_charbuffer
    # With an explicit size the loop body runs only once
    while True:
        data = self.read(readsize, firstline=True)
        if data:
            # A trailing "\r" may be the first half of "\r\n": read one
            # extra character to get a proper line ending. If the stream
            # is temporarily exhausted the wrong line ending is returned.
            ends_in_cr = (
                (isinstance(data, str) and data.endswith("\r")) or
                (isinstance(data, bytes) and data.endswith(b"\r")))
            if ends_in_cr:
                data += self.read(size=1, chars=1)

        line += data
        lines = line.splitlines(keepends=True)
        if len(lines) > 1:
            # Several lines came back; the first one is complete and
            # is the result
            line = lines.pop(0)
            if len(lines) > 1:
                # more than one left over: cache them as lines
                lines[-1] += self.charbuffer
                self.linebuffer = lines
                self.charbuffer = None
            else:
                # exactly one left over: back into the charbuffer
                self.charbuffer = lines[0] + self.charbuffer
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        if lines:
            line0withend = lines[0]
            line0withoutend = line0withend.splitlines(keepends=False)[0]
            if line0withend != line0withoutend:
                # The single line really has a line end; whatever
                # follows it is kept for the next call
                self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                  self.charbuffer
                line = line0withend if keepends else line0withoutend
                break
        # nothing was read, or only one attempt is allowed
        if not data or size is not None:
            if line and not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        # no line end yet: try again with a bigger read size
        if readsize < 8000:
            readsize *= 2
    return line

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

601

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

602

def readlines(self, sizehint=None, keepends=True):

    """ Read everything available on the input stream and return
        it as a list of lines.

        The complete stream is decoded with a single self.read()
        call and the result is split with splitlines().  Line ends
        are kept in the list entries unless keepends is false.

        sizehint, if given, is ignored.

    """
    return self.read().splitlines(keepends)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

    """ Discard the codec buffers used for keeping state.

        The line buffer is dropped, the character buffer is set
        back to the empty character buffer and the byte buffer is
        emptied.  The underlying stream is not repositioned.

        This method is primarily intended to be able to recover
        from decoding errors.

    """
    self.linebuffer = None
    self.charbuffer = self._empty_charbuffer
    self.bytebuffer = b""

Walter Dörwald

729c31f

2005-03-14 19:06:30 +0000

[diff] [blame]

629

Walter Dörwald

71fd90d

2005-03-14 19:25:41 +0000

[diff] [blame]

630

def seek(self, offset, whence=0):

    """ Move the input stream to a new position.

        offset and whence are handed to the seek() method of the
        underlying stream; afterwards the codec buffers are
        cleared through self.reset().

    """
    stream = self.stream
    stream.seek(offset, whence)
    self.reset()

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

637

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

638

def __next__(self):

    """ Return the next decoded line from the input stream.

        Raises StopIteration once self.readline() returns an
        empty result.

    """
    line = self.readline()
    if not line:
        raise StopIteration
    return line

def __iter__(self):

    """ Return the object itself; it acts as its own iterator."""
    return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

649

def __getattr__(self, name,
                getattr=getattr):

    """ Look up any attribute that is not found on this object
        on the underlying stream instead.
    """
    stream = self.stream
    return getattr(stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

655

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

def __enter__(self):

    """ Enter a with-block; the object itself is the context value."""
    return self

def __exit__(self, type, value, tb):

    """ Leave a with-block by closing the underlying stream.

        The exception details are not inspected and nothing is
        returned, so an exception raised in the block propagates.

    """
    stream = self.stream
    stream.close()

661

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

662

###

663

664

class StreamReaderWriter:

    """ Bundle a reader and a writer around a single stream so that
        the stream can be used in both read and write modes.

        The reader and the writer are created from the factory
        functions handed to the constructor, for example the ones
        returned by the codecs.lookup() function.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Both are called with the stream and errors.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        # The reader is created before the writer, as in the
        # original construction order.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: delegated to the reader ---

    def read(self, size=-1):

        """ Return the result of the reader's read(size)."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's readline(size)."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's readlines(sizehint)."""
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):

        """ Return the object itself; it acts as its own iterator."""
        return self

    # --- writing: delegated to the writer ---

    def write(self, data):

        """ Pass data to the writer's write() and return its result."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Pass list to the writer's writelines() and return its
            result.
        """
        writer = self.writer
        return writer.writelines(list)

    # --- state handling ---

    def reset(self):

        """ Reset the reader first and then the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and reset the reader.

            The writer is reset as well, but only for a seek to the
            very beginning (whence == 0 and offset == 0).

        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        rewound = whence == 0 and offset == 0
        if rewound:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):

        """ Enter a with-block; the object itself is the context value."""
        return self

    def __exit__(self, type, value, tb):

        """ Leave a with-block by closing the underlying stream."""
        stream = self.stream
        stream.close()

748

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to
        another, providing a frontend - backend view of the data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the underlying stream using
        an instance of the provided Writer class.

        In the other direction, data is read from the underlying
        stream using a Reader instance and then encoded and returned
        to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and
            Writer work on the backend (the data in stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the backend reader and return the result
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line through the backend reader and return it
            re-encoded with the frontend encode function.

            The reader's readline() is called without an argument
            when size is None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything through the backend reader, re-encode it
            and return it as a list of lines with line ends kept.

            sizehint is ignored.
        """
        data = self.reader.read()
        data, _ = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = next(self.reader)
        data, _ = self.encode(data, self.errors)
        return data

    def __iter__(self):

        """ Return the object itself; it acts as its own iterator."""
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and pass
            the result to the backend writer.
        """
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the items of list, decode the result with the
            frontend decode function and pass it to the backend
            writer.

            The items are joined with an empty value of matching
            type: str items are joined as text, anything else (and
            an empty list) as bytes.  Joining with '' unconditionally
            raised TypeError for bytes items, although bytes is what
            write() hands to the same decode function.
        """
        # Materialize first: list may be a one-shot iterable and the
        # first item has to be inspected before joining.
        chunks = tuple(list)
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader first and then the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):

        """ Enter a with-block; the object itself is the context value."""
        return self

    def __exit__(self, type, value, tb):

        """ Leave a with-block by closing the underlying stream."""
        self.stream.close()

864

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

865

### Shortcuts

866

Marc-André Lemburg

349a3d3

2000-06-21 21:21:04 +0000

[diff] [blame]

867

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the object returned by the builtin
        open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed on to the builtin open() API. It defaults
        to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # The caller never receives the file object on this path, so
        # it has to be closed here to avoid leaking the descriptor.
        file.close()
        raise

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

910

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

911

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates transparently
        between two encodings and return the wrapper.

        Data written to the wrapped file is decoded with the codec
        looked up for data_encoding and then written to the original
        file through the stream writer of file_encoding.

        Data read from the original file goes through the stream
        reader of file_encoding and is handed back to the caller
        encoded with data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors is passed on to the StreamRecoder. It defaults to
        'strict'.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    file_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

946

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

947

### Helpers for codec lookup

948

949

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        the encode attribute of the lookup result.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

958

959

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        the decode attribute of the lookup result.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode

968

969

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

982

983

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

996

997

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1006

1007

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter

1016

1017

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every item of iterator to one IncrementalEncoder and yields
    each non-empty result.  After the last item the encoder is flushed
    by encoding an empty string with final=True; that result is
    yielded too unless it is empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    # filter(None, ...) drops the empty chunks; both map and filter
    # are lazy, so items are still encoded one at a time on demand.
    yield from filter(None, map(encoder.encode, iterator))
    tail = encoder.encode("", True)
    if tail:
        yield tail

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item of iterator to one IncrementalDecoder and yields
    each non-empty result.  After the last item the decoder is flushed
    by decoding empty bytes with final=True; that result is yielded
    too unless it is empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    # filter(None, ...) drops the empty chunks; both map and filter
    # are lazy, so items are still decoded one at a time on demand.
    yield from filter(None, map(decoder.decode, iterator))
    tail = decoder.decode(b"", True)
    if tail:
        yield tail

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1052

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1053

### Helpers for charmap-based codecs

1054

1055

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element
        produced by iterating over rng is both key and value.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1064

Marc-André Lemburg

716cf91

2001-05-16 09:41:45 +0000

[diff] [blame]

1065

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map by swapping
        keys and values.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to the same code point (U+001A).

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and gets marked as
        # undefined; it stays None for any further occurrence.
        encoding_map[target] = source if target not in encoding_map else None
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1085

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

1086

### error handlers

1087

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1088

# Publish the error handlers registered under the standard names as
# module-level callables.
try:
    strict_errors, ignore_errors, replace_errors = (
        lookup_error("strict"),
        lookup_error("ignore"),
        lookup_error("replace"),
    )
    xmlcharrefreplace_errors, backslashreplace_errors = (
        lookup_error("xmlcharrefreplace"),
        lookup_error("backslashreplace"),
    )
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None

Walter Dörwald

2002-09-02 13:14:32 +0000

[diff] [blame]

1101

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1102

# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is always 0, so the import below is never executed
# at runtime; the statement only has to be present in the module.
_false = 0
if _false:
    import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1108

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1109

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1110

if __name__ == '__main__':

1111

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1112

# Make stdout translate Latin-1 output into UTF-8 output

1113

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1114

Guido van Rossum