Blame - Lib/codecs.py - platform/external/python/cpython3

2000-03-10 23:20:43 +0000

[diff] [blame]

1

""" codecs -- Python Codec Registry, API and helpers.

2

3

4

Written by Marc-Andre Lemburg (mal@lemburg.com).

"""#"

Marc-André Lemburg

2002-12-12 17:37:50 +0000

[diff] [blame]

10

import __builtin__, sys

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

11

12

### Registry and builtin stateless codec functions

13

Guido van Rossum

b95de4f

2000-03-31 17:25:23 +0000

[diff] [blame]

14

try:

15

from _codecs import *

Guido van Rossum

b940e11

2007-01-10 16:19:56 +0000

[diff] [blame^]

16

except ImportError as why:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

17

raise SystemError('Failed to load the builtin codecs: %s' % why)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

18

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

19

# Public names exported by "from codecs import *".
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "register_error", "lookup_error",
]

Skip Montanaro

e99d5ea

2001-01-20 19:54:20 +0000

[diff] [blame]

26

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

27

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# spelled out as byte strings for UTF-8, UTF-16 and UTF-32 output,
# in both little and big endian byte order
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 (little endian, then big endian)
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 (little endian, then big endian)
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness aliases, picked from the byte order of this machine
if sys.byteorder != 'little':
    BOM = BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM = BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

71

72

73

### Codec base classes (defining the API)

74

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

75

class CodecInfo(tuple):

    """ Codec description that is both a 4-tuple and an attribute bag.

        The tuple value is (encode, decode, streamreader, streamwriter).
        The same four objects, the incremental encoder/decoder factories
        and the codec name are also stored as instance attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries become part of the tuple value;
        # everything else is reachable by attribute only.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))

91

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

92

class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to encode efficiently should build
            on StreamWriter/StreamReader instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to decode efficiently should build
            on StreamWriter/StreamReader instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

155

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

156

class IncrementalEncoder(object):
    """
    Encoder that processes its input in several steps.

    Pieces of the input are handed to encode() one after another; the
    encoder keeps the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; the predefined values
        are listed in the docstring of the Codec class.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """

183

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

184

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that cannot always encode all of
    their input at once: whatever _buffer_encode() did not consume is
    kept in a buffer and prepended to the input of the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet, carried over between
        # calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        (encoded, used) = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

210

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

211

class IncrementalDecoder(object):
    """
    Decoder that processes its input in several steps.

    Pieces of the input are handed to decode() one after another; the
    decoder keeps the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; the predefined values
        are listed in the docstring of the Codec class.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """

237

238

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with incomplete
    byte sequences: whatever _buffer_decode() did not consume is kept in
    a buffer and prepended to the input of the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet, carried over between
        # calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        (decoded, used) = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

Thomas Wouters

2006-04-21 09:43:23 +0000

[diff] [blame]

264

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

265

#

266

# The StreamWriter and StreamReader class provide generic working

Andrew M. Kuchling

97c5635

2001-09-18 20:29:48 +0000

[diff] [blame]

267

# interfaces which can be used to implement new encoding submodules

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

268

# very easily. See encodings/utf_8.py for an example on how this is

269

# done.

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

270

#

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

271

272

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer itself
            on the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()

337

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

338

###

339

340

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.

        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of already split lines cached by readline(), or None;
        # while it is in use, charbuffer is set to None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by the concrete codec; it has to return a
        # tuple (decoded object, number of bytes consumed).
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes in front of the error; this
                    # is good enough as long as they contain more than one
                    # line, otherwise the error is propagated.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size for the next attempt (up to a limit)
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Use the same empty str as __init__() and read() do: a unicode
        # literal here would turn the output of str->str codecs into
        # unicode after a reset()/seek(), while str->unicode codecs still
        # promote the buffer to unicode on the first read.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        # an empty line means that nothing more could be read
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()

582

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

583

###

584

585

class StreamReaderWriter:

586

Fred Drake

2000-04-13 14:11:21 +0000

[diff] [blame]

587

""" StreamReaderWriter instances allow wrapping streams which

588

work in both read and write modes.

589

590

The design is such that one can use the factory functions

Thomas Wouters

7e47402

2000-07-16 12:04:32 +0000

[diff] [blame]

591

returned by the codec.lookup() function to construct the

Fred Drake

2000-04-13 14:11:21 +0000

[diff] [blame]

592

instance.

593

594

"""

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

595

# Optional attributes set by the file wrappers below

596

encoding = 'unknown'

597

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

598

def __init__(self, stream, Reader, Writer, errors='strict'):

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

599

600

""" Creates a StreamReaderWriter instance.

601

602

stream must be a Stream-like object.

603

604

Reader, Writer must be factory functions or classes

605

providing the StreamReader, StreamWriter interface resp.

606

607

Error handling is done in the same way as defined for the

608

StreamWriter/Readers.

"""

self.stream = stream

self.reader = Reader(stream, errors)

613

self.writer = Writer(stream, errors)

614

self.errors = errors

615

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

616

def read(self, size=-1):

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

617

618

return self.reader.read(size)

619

Guido van Rossum

d58c26f

2000-05-01 16:17:32 +0000

[diff] [blame]

620

def readline(self, size=None):

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

621

622

return self.reader.readline(size)

623

Guido van Rossum

d58c26f

2000-05-01 16:17:32 +0000

[diff] [blame]

624

def readlines(self, sizehint=None):

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

625

626

return self.reader.readlines(sizehint)

627

Walter Dörwald

2002-11-06 16:53:44 +0000

[diff] [blame]

628

def next(self):

629

630

""" Return the next decoded line from the input stream."""

631

return self.reader.next()

def __iter__(self):

return self

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

636

def write(self, data):

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

637

638

return self.writer.write(data)

639

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

640

def writelines(self, list):

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

641

642

return self.writer.writelines(list)

643

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

def reset(self):

self.reader.reset()

self.writer.reset()

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

649

def __getattr__(self, name,

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

650

getattr=getattr):

651

652

""" Inherit all other methods from the underlying stream.

653

"""

Tim Peters

2001-05-15 17:19:16 +0000

[diff] [blame]

654

return getattr(self.stream, name)

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

655

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

656

# these are needed to make "with codecs.open(...)" work properly

def __enter__(self):

return self

def __exit__(self, type, value, tb):
    """ Context manager exit: close the underlying stream.

        The exception details are ignored and nothing is returned,
        so a pending exception is not suppressed.
    """
    underlying = self.stream
    underlying.close()

663

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

###

class StreamRecoder:

    """ StreamRecoder instances offer a frontend - backend view of
        encoded data.

        All APIs returned by codecs.lookup() are used to do the job.

        Data handed to .write() is first decoded into an intermediate
        format (which depends on the chosen codec combination) and
        then written to the stream through an instance of the given
        Writer class.

        In the opposite direction, data is read from the stream
        through a Reader instance and returned to the caller in
        encoded form.

    """
    # Optional attributes; the file wrappers below overwrite them
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Create a StreamRecoder instance implementing a two-way
            conversion: encode and decode serve the frontend (the
            data returned by .read() and accepted by .write()), while
            Reader and Writer serve the backend (reading from and
            writing to the stream).

            Such objects allow transparent direct recodings, e.g.
            from latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must follow the Codec interface; Reader
            and Writer must be factory functions or classes providing
            the StreamReader and StreamWriter interface respectively.

            Unicode is used as the intermediate encoding.

            errors is stored on the instance and is also passed to
            the Reader and Writer factories; it is handled the same
            way as for StreamWriter/StreamReader objects.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        # Both backend objects share the stream and the error mode.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the result
            re-encoded with the frontend encoder.
        """
        chunk = self.reader.read(size)
        encoded, _consumed = self.encode(chunk, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line via the backend reader (passing size only
            when it is not None) and return it re-encoded.
        """
        if size is not None:
            line = self.reader.readline(size)
        else:
            line = self.reader.readline()
        encoded, _consumed = self.encode(line, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read everything from the backend reader, re-encode it and
            return it split into lines with line ends kept.

            sizehint is not used.
        """
        everything = self.reader.read()
        encoded, _consumed = self.encode(everything, self.errors)
        return encoded.splitlines(True)

    def next(self):

        """ Return the next line from the backend reader, re-encoded
            with the frontend encoder.
        """
        line = self.reader.next()
        encoded, _consumed = self.encode(line, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and hand the result
            to the backend writer.
        """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the given strings, decode the result with the
            frontend decoder and hand it to the backend writer.
        """
        joined = ''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the backend reader first, then the backend writer. """
        for part in ('reader', 'writer'):
            getattr(self, part).reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        underlying = self.stream
        underlying.close()

779

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

780

### Shortcuts

781

Marc-André Lemburg

349a3d3

2000-06-21 21:21:04 +0000

[diff] [blame]

782

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back without any wrapping.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError in case the encoding cannot be found; the
        already opened file is closed before the error propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except:
        # The file is already open at this point; without closing it
        # here a failed codec lookup would leak the handle.  The
        # exception is always re-raised, so nothing is swallowed.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

825

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

826

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to data_encoding and then written to the original file as
        strings using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as strings using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The data codec drives the caller-facing side, the file codec
    # the stream-facing side.
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

861

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

862

### Helpers for codec lookup

863

864

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

873

874

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode

883

884

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder.
    raise LookupError(encoding)

897

898

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder.
    raise LookupError(encoding)

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

911

912

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

921

922

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter

931

932

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty result.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # Final call with an empty string lets the encoder flush any
    # state it is still holding.
    tail = encoder.encode("", True)
    if tail:
        yield tail

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an
    IncrementalDecoder and yields every non-empty result.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # Final call with an empty string lets the decoder flush any
    # input it is still buffering.
    tail = decoder.decode("", True)
    if tail:
        yield tail

Marc-André Lemburg

2001-09-19 11:24:48 +0000

[diff] [blame]

967

Marc-André Lemburg

a866df8

2001-01-03 21:29:14 +0000

[diff] [blame]

968

### Helpers for charmap-based codecs

969

970

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is both key and value.

    """
    return dict((item, item) for item in rng)

Marc-André Lemburg

2001-05-16 09:41:45 +0000

[diff] [blame]

983

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map

Tim Peters

3a2ab1a

2001-05-29 06:06:54 +0000

[diff] [blame]

1003

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1004

### error handlers

1005

Martin v. Löwis

e2713be

2005-03-08 15:03:08 +0000

[diff] [blame]

1006

try:
    # The handlers are looked up one after another, in this order;
    # the first failing lookup aborts the whole assignment.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
    ))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None

Walter Dörwald

3aeb632

2002-09-02 13:14:32 +0000

[diff] [blame]

1019

Martin v. Löwis

6cd441d

2001-07-31 08:54:55 +0000

[diff] [blame]

1020

# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the import below is never executed at
# runtime; it is present only as an import statement in the source.
# NOTE(review): presumably a named flag is used instead of a literal
# so the statement is not dropped at compile time -- confirm.
_false = 0
if _false:
    import encodings

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1026

### Tests

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1027

Guido van Rossum

2000-03-10 23:20:43 +0000

[diff] [blame]

1028

if __name__ == '__main__':

1029

Guido van Rossum

2000-04-11 15:37:43 +0000

[diff] [blame]

1030

# Make stdout translate Latin-1 output into UTF-8 output

1031

sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

Guido van Rossum

2000-04-11 15:41:38 +0000

[diff] [blame]

1032

Guido van Rossum