blob: 1bb479203f365dabd6cc59bb0b6debef55e40396 [file] [log] [blame]
""" Python 'utf-8-sig' Codec
This works similarly to UTF-8, with the following changes:

* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
10import codecs
11
12### Codec APIs
13
def encode(input, errors='strict'):
    """Statelessly encode *input* as UTF-8 with a leading BOM.

    The UTF-8 encoded byte order mark is always prepended, even when
    *input* is empty.

    Args:
        input: The text to encode.
        errors: Error handling scheme passed on to the UTF-8 encoder.

    Returns:
        A ``(data, length)`` tuple: the BOM followed by the UTF-8 encoding
        of *input*, and ``len(input)`` (the BOM is not counted).
    """
    payload = codecs.utf_8_encode(input, errors)[0]
    return (codecs.BOM_UTF8 + payload, len(input))
Martin v. Löwis412ed3b2006-01-08 10:45:39 +000017
def decode(input, errors='strict'):
    """Statelessly decode UTF-8 *input*, skipping one leading BOM if present.

    The data is decoded as final input (the UTF-8 decoder is called with
    ``final=True``).

    Args:
        input: The bytes to decode.
        errors: Error handling scheme passed on to the UTF-8 decoder.

    Returns:
        A ``(text, consumed)`` tuple, where *consumed* includes the three
        BOM bytes when a BOM was skipped.
    """
    skipped = 0
    if input[:3] == codecs.BOM_UTF8:
        # Drop the signature and remember to account for it in the
        # consumed-byte count reported to the caller.
        input = input[3:]
        skipped = 3
    text, used = codecs.utf_8_decode(input, errors, True)
    return (text, used + skipped)
25
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits a BOM before the first chunk.

    ``self.first`` is 1 until the first call to ``encode()`` and 0
    afterwards. It is kept as an int because it is also the value handed
    out by ``getstate()``.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*; the first call's output is prefixed with the BOM."""
        if not self.first:
            return codecs.utf_8_encode(input, self.errors)[0]
        # The flag is cleared before encoding, so it stays cleared even if
        # the encoder raises on this chunk.
        self.first = 0
        return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from a value given by getstate()."""
        self.first = state
Thomas Woutersa9773292006-04-21 09:43:23 +000048
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips a BOM at the start of the data.

    ``self.first`` is 1 while the start of the stream has not yet been
    examined for a BOM, and 0 once that decision has been made.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, returning ``(text, consumed_bytes)``.

        While the BOM decision is pending and *input* is a proper prefix of
        the BOM (or empty), nothing is consumed so the caller can retry
        with more data.
        """
        if not self.first:
            # BOM handling already finished: plain UTF-8 from here on.
            return codecs.utf_8_decode(input, errors, final)

        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this really is a BOM
                # => try again on the next call
                return ("", 0)
            # Too short to be a BOM and not a prefix of one: no BOM.
            self.first = 0
            return codecs.utf_8_decode(input, errors, final)

        self.first = 0
        if input[:3] == codecs.BOM_UTF8:
            text, used = codecs.utf_8_decode(input[3:], errors, final)
            # Count the skipped BOM bytes as consumed.
            return (text, used + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)`` as the decoder state."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # state[1] must be 0 here, as it isn't passed along to the caller
        return (state[0], self.first)

    def setstate(self, state):
        """Restore the state from a tuple produced by getstate()."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]
Thomas Woutersa9773292006-04-21 09:43:23 +000084
class StreamWriter(codecs.StreamWriter):
    """Stream writer that puts a BOM in front of the first encoded chunk.

    The class-level ``encode`` method handles the first chunk only: it
    rebinds ``self.encode`` on the instance to the plain UTF-8 encoder so
    later chunks are written without a BOM. ``reset()`` removes that
    instance attribute, which makes the class-level method visible again.
    """

    def reset(self):
        """Reset the writer so the next encoded chunk gets a BOM again."""
        codecs.StreamWriter.reset(self)
        try:
            del self.encode
        except AttributeError:
            # Nothing was encoded since the last reset; no override to drop.
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM and switch to plain UTF-8."""
        # Shadow this method on the instance before encoding, so every
        # later call goes straight to the BOM-less encoder.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)
96
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the stream.

    The class-level ``decode`` method runs until it can tell whether the
    data starts with a BOM. It then rebinds ``self.decode`` on the instance
    to the plain UTF-8 decoder. ``reset()`` removes that instance
    attribute, which makes the class-level method visible again.
    """

    def reset(self):
        """Reset the reader so a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        try:
            del self.decode
        except AttributeError:
            # The BOM decision was still pending; no override to drop.
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM if there is one.

        Returns ``("", 0)`` without consuming anything while *input* is a
        proper prefix of the BOM (or empty), so the decision can be made
        once more data is available.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return ("", 0)
            starts_with_bom = False
        else:
            starts_with_bom = input[:3] == codecs.BOM_UTF8

        # The decision is made: all later calls use the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        if starts_with_bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            # Count the skipped BOM bytes as consumed.
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000118
119### encodings module API
120
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing the 'utf-8-sig' codec.

    The entry bundles this module's stateless ``encode``/``decode``
    functions with its incremental and stream codec classes.
    """
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )