Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.

2

3

4

Written by Marc-Andre Lemburg (mal@lemburg.com).

"""#"

Georg Brandl

2007-12-02 09:40:06 +0000

[diff] [blame]

10

import builtins, sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

11

12

### Registry and builtin stateless codec functions

13

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

14

try:

15

from _codecs import *

Guido van Rossum

b940e11

2007-01-10 16:19:56 +0000

[diff] [blame]

16

except ImportError as why:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

17

raise SystemError('Failed to load the builtin codecs: %s' % why)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

18

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

19

# Names exported by ``from codecs import *``.
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks (including the old, deprecated spellings)
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # error handlers
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "register_error", "lookup_error",
]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

26

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

27

### Constants

28

29

#

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

30

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)

31

# and its possible byte string values

32

# for UTF8/UTF16/UTF32 output and little/big endian machines

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

33

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

34

Walter Dörwald

2002-06-04 15:16:29 +0000

[diff] [blame]

35

# UTF-8

Walter Dörwald

2007-05-04 13:05:09 +0000

[diff] [blame]

36

BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native endianness: pick the UTF-16 and UTF-32 marks that match the
# byte order of the machine we are running on.
if sys.byteorder != 'little':
    BOM = BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM = BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# Old broken names (don't use in new code): the "32" names are really the
# UTF-16 marks and the "64" names are really the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

71

72

73

### Codec base classes (defining the API)

74

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

75

class CodecInfo(tuple):
    """Bundle of the callables that make up one codec.

    The tuple itself holds (encode, decode, streamreader, streamwriter).
    The same four objects, plus incrementalencoder, incrementaldecoder
    and name, are also available as attributes of the instance.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        # Attribute access to everything, including the entries that are
        # not part of the 4-tuple.
        info.name = name
        info.encode, info.decode = encode, decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

93

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

94

class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  The following string values
        are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; return a tuple (output object,
            length consumed).

            errors selects the error handling to apply and defaults to
            'strict' handling.

            Implementations may not store state in the Codec instance;
            codecs that need to keep state in order to encode or decode
            efficiently should use a stream codec instead.

            Zero length input must be supported: the encoder then
            returns an empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; return a tuple (output object,
            length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults to
            'strict' handling.

            Implementations may not store state in the Codec instance;
            codecs that need to keep state in order to encode or decode
            efficiently should use a stream codec instead.

            Zero length input must be supported: the decoder then
            returns an empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

158

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

159

class IncrementalEncoder:
    """
    Base class for encoders that work on their input in several steps.

    Pieces of the input are handed to encode() one after another; the
    encoder keeps the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme the encoder uses. See
        the module docstring for a list of possible values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        The base class provides no implementation and raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (a no-op in the base class).
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 in the base
        class).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate(). The base class ignores it.
        """

198

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

199

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    everything they are given: input that was not encoded by one call to
    encode() is kept in a buffer and retried on the next call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever the previous call left over.
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # Anything past the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        return self.buffer if self.buffer else 0

    def setstate(self, state):
        # Any false state (such as 0) means "nothing buffered".
        self.buffer = state if state else ""

232

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

233

class IncrementalDecoder:
    """
    Base class for decoders that work on their input in several steps.

    Pieces of the input are handed to decode() one after another; the
    decoder keeps the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme the decoder uses. See
        the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        The base class provides no implementation and raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (a no-op in the base class).
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().  The base
        class ignores state.
        """

281

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

282

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must cope with
    incomplete byte sequences: input that was not decoded by one call to
    decode() is kept in a buffer and retried on the next call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever the previous call left over.
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # Anything past the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The buffered input is the only state; the additional state
        # info slot is always 0.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered input is restored; the additional state info
        # in state[1] is ignored.
        buffered = state[0]
        self.buffer = buffered

317

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

318

#

319

# The StreamWriter and StreamReader class provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

320

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

321

# very easily. See encodings/utf_8.py for an example on how this is

322

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

323

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

324

325

class StreamWriter(Codec):
    """ Codec that encodes objects and writes the result to a stream.

        Subclasses supply encode(); everything not defined here is
        looked up on the underlying stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using self.errors;
            the consumed length reported by encode() is not used.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation keeps no state and does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Repositions the underlying stream.

            The codec state is reset only when seeking to the absolute
            start of the stream (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError for 'stream' itself: __getattr__ is
            only reached for that name when the instance has no stream
            attribute (e.g. __init__ never ran), and delegating would
            then re-enter __getattr__ until RecursionError.
        """
        if name == "stream":
            raise AttributeError(
                "%r object has no attribute %r"
                % (type(self).__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()

395

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

396

###

397

398

class StreamReader(Codec):

399

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

400

charbuffertype = str

401

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

402

def __init__(self, stream, errors='strict'):

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

403

404

""" Creates a StreamReader instance.

405

406

stream must be a file-like object open for reading

407

(binary) data.

408

Walter Dörwald

2002-11-19 21:42:53 +0000

[diff] [blame]

409

The StreamReader may use different error handling

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

410

schemes by providing the errors keyword argument. These

Walter Dörwald

2002-11-19 21:42:53 +0000

[diff] [blame]

411

parameters are predefined:

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

412

413

'strict' - raise a ValueError (or a subclass)

414

'ignore' - ignore the character and continue with the next

415

'replace'- replace with a suitable replacement character;

416

Walter Dörwald

2002-11-19 21:42:53 +0000

[diff] [blame]

417

The set of allowed parameter values can be extended via

418

register_error.

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

419

"""

420

self.stream = stream

421

self.errors = errors

Walter Dörwald

2007-05-04 13:05:09 +0000

[diff] [blame]

422

self.bytebuffer = b""

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

423

self._empty_charbuffer = self.charbuffertype()

424

self.charbuffer = self._empty_charbuffer

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

425

self.linebuffer = None

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

426

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

427

def decode(self, input, errors='strict'):

428

raise NotImplementedError

429

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

430

def read(self, size=-1, chars=-1, firstline=False):

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

431

432

""" Decodes data from the stream self.stream and returns the

433

resulting object.

434

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

435

chars indicates the number of characters to read from the

436

stream. read() will never return more than chars

437

characters, but it might return less, if there are not enough

438

characters available.

439

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

440

size indicates the approximate maximum number of bytes to

441

read from the stream for decoding purposes. The decoder

442

can modify this setting as appropriate. The default value

443

-1 indicates to read and decode as much as possible. size

444

is intended to prevent having to decode huge files in one

445

step.

446

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

447

If firstline is true, and a UnicodeDecodeError happens

448

after the first line terminator in the input only the first line

449

will be returned, the rest of the input will be kept until the

450

next call to read().

451

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

452

The method should use a greedy read strategy meaning that

453

it should read as much data as is allowed within the

454

definition of the encoding and the given size, e.g. if

455

optional encoding endings or state markers are available

456

on the stream, these should be read too.

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

457

"""

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

458

# If we have lines cached, first merge them back into characters

459

if self.linebuffer:

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

460

self.charbuffer = self._empty_charbuffer.join(self.linebuffer)

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

461

self.linebuffer = None

Tim Peters

536cf99

2005-12-25 23:18:31 +0000

[diff] [blame]

462

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

463

# read until we get the required number of characters (if available)

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

464

while True:

Tim Golden

621302c

2012-10-01 16:40:40 +0100

[diff] [blame]

465

# can the request be satisfied from the character buffer?

Serhiy Storchaka

8003850

2014-01-26 19:21:00 +0200

[diff] [blame]

466

if chars >= 0:

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

467

if len(self.charbuffer) >= chars:

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

468

break

Serhiy Storchaka

8003850

2014-01-26 19:21:00 +0200

[diff] [blame]

469

elif size >= 0:

470

if len(self.charbuffer) >= size:

471

break

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

472

# we need more data

473

if size < 0:

474

newdata = self.stream.read()

475

else:

476

newdata = self.stream.read(size)

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

477

# decode bytes (those remaining from the last call included)

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

478

data = self.bytebuffer + newdata

Serhiy Storchaka

8003850

2014-01-26 19:21:00 +0200

[diff] [blame]

479

if not data:

480

break

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

481

try:

482

newchars, decodedbytes = self.decode(data, self.errors)

Guido van Rossum

b940e11

2007-01-10 16:19:56 +0000

[diff] [blame]

483

except UnicodeDecodeError as exc:

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

484

if firstline:

Walter Dörwald

2007-04-16 22:10:50 +0000

[diff] [blame]

485

newchars, decodedbytes = \

486

self.decode(data[:exc.start], self.errors)

Ezio Melotti

2011-09-28 17:37:55 +0300

[diff] [blame]

487

lines = newchars.splitlines(keepends=True)

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

if len(lines)<=1:

raise

else:

raise

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

492

# keep undecoded bytes until the next call

493

self.bytebuffer = data[decodedbytes:]

494

# put new characters in the character buffer

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

495

self.charbuffer += newchars

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

496

# there was no data available

497

if not newdata:

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

498

break

499

if chars < 0:

500

# Return everything we've got

501

result = self.charbuffer

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

502

self.charbuffer = self._empty_charbuffer

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

503

else:

504

# Return the first chars characters

505

result = self.charbuffer[:chars]

506

self.charbuffer = self.charbuffer[chars:]

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

507

return result

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

508

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

509

def readline(self, size=None, keepends=True):

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

510

511

""" Read one line from the input stream and return the

512

decoded data.

513

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

514

size, if given, is passed as size argument to the

515

read() method.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

516

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

517

"""

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

518

# If we have lines cached from an earlier read, return

519

# them unconditionally

520

if self.linebuffer:

521

line = self.linebuffer[0]

522

del self.linebuffer[0]

523

if len(self.linebuffer) == 1:

524

# revert to charbuffer mode; we might need more data

525

# next time

526

self.charbuffer = self.linebuffer[0]

527

self.linebuffer = None

528

if not keepends:

Ezio Melotti

2011-09-28 17:37:55 +0300

[diff] [blame]

529

line = line.splitlines(keepends=False)[0]

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

530

return line

Tim Peters

536cf99

2005-12-25 23:18:31 +0000

[diff] [blame]

531

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

532

readsize = size or 72

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

533

line = self._empty_charbuffer

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

534

# If size is given, we call read() only once

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

535

while True:

Martin v. Löwis

2005-08-24 07:38:12 +0000

[diff] [blame]

536

data = self.read(readsize, firstline=True)

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

537

if data:

Walter Dörwald

a4eb2d5

2005-04-21 21:42:35 +0000

[diff] [blame]

538

# If we're at a "\r" read one extra character (which might

539

# be a "\n") to get a proper line ending. If the stream is

Walter Dörwald

bc8e642

2005-04-21 21:32:03 +0000

[diff] [blame]

540

# temporarily exhausted we return the wrong line ending.

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

541

if (isinstance(data, str) and data.endswith("\r")) or \

542

(isinstance(data, bytes) and data.endswith(b"\r")):

Walter Dörwald

7a6dc13

2005-04-04 21:38:47 +0000

[diff] [blame]

543

data += self.read(size=1, chars=1)

Walter Dörwald

7a6dc13

2005-04-04 21:38:47 +0000

[diff] [blame]

544

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

545

line += data

Ezio Melotti

2011-09-28 17:37:55 +0300

[diff] [blame]

546

lines = line.splitlines(keepends=True)

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

547

if lines:

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

548

if len(lines) > 1:

549

# More than one line result; the first line is a full line

# to return

line = lines[0]

del lines[0]

if len(lines) > 1:

# cache the remaining lines

555

lines[-1] += self.charbuffer

556

self.linebuffer = lines

557

self.charbuffer = None

558

else:

559

# only one remaining line, put it back into charbuffer

560

self.charbuffer = lines[0] + self.charbuffer

561

if not keepends:

Ezio Melotti

2011-09-28 17:37:55 +0300

[diff] [blame]

562

line = line.splitlines(keepends=False)[0]

Martin v. Löwis

2005-09-18 08:34:39 +0000

[diff] [blame]

563

break

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

564

line0withend = lines[0]

Ezio Melotti

2011-09-28 17:37:55 +0300

[diff] [blame]

565

line0withoutend = lines[0].splitlines(keepends=False)[0]

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

566

if line0withend != line0withoutend: # We really have a line end

567

# Put the rest back together and keep it until the next call

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

568

self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \

569

self.charbuffer

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

if keepends:

line = line0withend

else:

line = line0withoutend

Walter Dörwald

9fa0946

2005-01-10 12:01:39 +0000

[diff] [blame]

574

break

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

575

# we didn't get anything or this was our only try

Walter Dörwald

9fa0946

2005-01-10 12:01:39 +0000

[diff] [blame]

576

if not data or size is not None:

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

577

if line and not keepends:

Ezio Melotti

2011-09-28 17:37:55 +0300

[diff] [blame]

578

line = line.splitlines(keepends=False)[0]

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

579

break

Georg Brandl

2010-12-02 18:06:51 +0000

[diff] [blame]

580

if readsize < 8000:

Walter Dörwald

2004-12-21 22:24:00 +0000

[diff] [blame]

581

readsize *= 2

582

return line

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

583

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

584

def readlines(self, sizehint=None, keepends=True):

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

585

586

""" Read all lines available on the input stream

587

and return them as list of lines.

588

589

Line breaks are implemented using the codec's decoder

590

method and are included in the list entries.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

591

Marc-André Lemburg

d594849

2004-02-26 15:22:17 +0000

[diff] [blame]

592

sizehint, if given, is ignored since there is no efficient

593

way to finding the true end-of-line.

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

594

595

"""

Walter Dörwald

2004-09-07 20:24:22 +0000

[diff] [blame]

596

data = self.read()

Hye-Shik Chang

af5c7cf

2004-10-17 23:51:21 +0000

[diff] [blame]

597

return data.splitlines(keepends)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

    """ Throw away all buffered decoding state.

        The pending byte buffer, the decoded character buffer and
        the cached line buffer are all cleared.  The underlying
        stream is not touched, so no repositioning takes place.

        This is mainly useful for recovering after a decoding error.

    """
    blank = self._empty_charbuffer
    self.linebuffer = None
    self.charbuffer = blank
    self.bytebuffer = b""

Walter Dörwald

729c31f

2005-03-14 19:06:30 +0000

[diff] [blame]

611

Walter Dörwald

71fd90d

2005-03-14 19:25:41 +0000

[diff] [blame]

612

def seek(self, offset, whence=0):

    """ Reposition the underlying stream and forget buffered data.

        offset and whence are handed to self.stream.seek() unchanged.
        Afterwards reset() is called so that data buffered for the
        old position is not returned by later reads.

    """
    underlying = self.stream
    underlying.seek(offset, whence)
    self.reset()

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

619

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

620

def __next__(self):

    """ Return the next decoded line from the input stream.

        Raises StopIteration once readline() returns an empty result.

    """
    decoded = self.readline()
    if not decoded:
        raise StopIteration
    return decoded

def __iter__(self):
    """ Return self; iteration is driven by __next__(). """
    return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

631

def __getattr__(self, name,

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

632

getattr=getattr):

633

634

""" Inherit all other methods from the underlying stream.

635

"""

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

636

return getattr(self.stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

637

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

def __enter__(self):
    """ Return self so the object can be used in a with statement. """
    return self

def __exit__(self, type, value, tb):
    """ Close the underlying stream when the with block is left.

        The exception details are ignored and nothing is returned,
        so an exception raised inside the block keeps propagating.

    """
    wrapped = self.stream
    wrapped.close()

643

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

644

###

645

646

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Both are called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read and decode data via the reader; see its read(). """
        return self.reader.read(size)

    def readline(self, size=None, keepends=True):

        """ Read one decoded line via the reader.

            If keepends is false the line ending is stripped.
        """
        if keepends:
            # Keep the historic call shape so reader factories whose
            # readline() does not know about keepends still work.
            return self.reader.readline(size)
        return self.reader.readline(size, keepends=False)

    def readlines(self, sizehint=None, keepends=True):

        """ Read all remaining decoded lines via the reader.

            If keepends is false the line endings are stripped.
        """
        if keepends:
            # Same backward-compatible call shape as in readline().
            return self.reader.readlines(sizehint)
        return self.reader.readlines(sizehint, keepends=False)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Encode data and write it via the writer. """
        return self.writer.write(data)

    def writelines(self, list):

        """ Encode and write a sequence of strings via the writer. """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the codec state of both the reader and the writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream.

            The reader's buffers are always discarded.  The writer is
            only reset when seeking to the very start of the stream.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        # NOTE(review): presumably the writer is only reset at offset 0
        # so that a start-of-stream marker is written again after a
        # rewind -- confirm against the StreamWriter implementations.
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

730

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned encoded to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader and return the data re-encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader and return it re-encoded. """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the reader, re-encode it and return
            it split into lines (line endings kept).

            sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result via the writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks and write them like write().

            The chunks must be of the type accepted by write(): bytes-like
            objects for ordinary codecs, str for text-to-text codecs.
            An empty sequence writes nothing.
        """
        chunks = tuple(list)
        if not chunks:
            return None
        # Join with an empty value of the same kind as the chunks; a
        # hard-coded '' fails for the bytes input decode() expects.
        joiner = '' if isinstance(chunks[0], str) else b''
        data = joiner.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the codec state of both the reader and the writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop buffered data.

            Without this, seek() would be forwarded to the raw stream
            by __getattr__ and the reader would keep serving data that
            it buffered for the old position.  Returns whatever the
            stream's own seek() returns.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

846

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

847

### Shortcuts

848

Marc-André Lemburg

349a3d3

2000-06-21 21:21:04 +0000

[diff] [blame]

849

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises whatever the builtin open() raises, and LookupError if
        the encoding is unknown; in the latter case the file that was
        already opened is closed again before the error propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # The file is already open at this point; close it so a failed
        # codec lookup or wrapper construction does not leak the handle.
        file.close()
        raise

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

892

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

893

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according to the
        given data_encoding and then written to the original file
        encoded with file_encoding.

        Data read from the file is decoded using file_encoding and
        then passed back to the caller encoded with data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the encodings
        actually in use. The attributes can be used for introspection
        by Python programs.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

928

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

929

### Helpers for codec lookup

930

931

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

940

941

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode

950

951

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

964

965

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

978

979

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

988

989

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter

998

999

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.
    After the input is exhausted the encoder is flushed with a final
    call and any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call so the encoder can emit whatever it still holds.
    remainder = encoder.encode("", True)
    if remainder:
        yield remainder

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input chunks from the iterator using an
    IncrementalDecoder and yields every non-empty piece of output.
    After the input is exhausted the decoder is flushed with a final
    call and any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call so the decoder can emit whatever it still holds.
    remainder = decoder.decode(b"", True)
    if remainder:
        yield remainder

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

1034

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1035

### Helpers for charmap-based codecs

1036

1037

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of rng is used
        as a key that maps to itself.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

1046

Marc-André Lemburg

716cf91

2001-05-16 09:41:45 +0000

[diff] [blame]

1047

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps every value of decoding_map back to its key.
        If a value occurs more than once in the decoding map, it is
        mapped to None (undefined mapping) instead, causing an
        exception when encountered by the charmap codec during
        translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and becomes undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1067

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1068

### error handlers

1069

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1070

try:
    # Bind the standard error handlers to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
    ))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = \
        xmlcharrefreplace_errors = backslashreplace_errors = None

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1083

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1084

# Tell modulefinder that using codecs probably needs the encodings
# package
#
# The import below never runs because _false is 0; it is only there so
# that the import statement is present in this module.
# NOTE(review): presumably a variable is tested instead of a literal so
# the statement is not optimized away at compile time -- keep as is.
_false = 0
if _false:
    import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1090

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1091

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1092

if __name__ == '__main__':

1093

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1094

# Make stdout translate Latin-1 output into UTF-8 output

1095

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1096

Guido van Rossum