blob: 139fcf1cb649d7b285eac564f5645b9772c84440 [file] [log] [blame]
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +01001/*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20*/
21
22
23#include "config.h"
24#include "core/loader/TextResourceDecoder.h"
25
26#include "HTMLNames.h"
27#include "core/dom/DOMImplementation.h"
28#include "core/html/parser/HTMLMetaCharsetParser.h"
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +010029#include "core/platform/text/TextEncodingDetector.h"
Torne (Richard Coles)81a51572013-05-13 16:52:28 +010030#include "wtf/StringExtras.h"
31#include "wtf/text/TextCodec.h"
32#include "wtf/text/TextEncoding.h"
33#include "wtf/text/TextEncodingRegistry.h"
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +010034
35using namespace WTF;
36
37namespace WebCore {
38
39using namespace HTMLNames;
40
// Returns true when the two bytes starting at |p| are exactly b0, b1.
// Bytes are compared left to right and the comparison stops at the first
// mismatch, so bytes after a mismatch are never read.
static inline bool bytesEqual(const char* p, char b0, char b1)
{
    if (p[0] != b0)
        return false;
    return p[1] == b1;
}
45
// Returns true when the three bytes starting at |p| are exactly b0, b1, b2.
// Bytes are compared left to right and the comparison stops at the first
// mismatch, so bytes after a mismatch are never read.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2)
{
    const char expected[] = { b0, b1, b2 };
    for (size_t index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
50
// Returns true when the five bytes starting at |p| are exactly b0..b4.
// Bytes are compared left to right and the comparison stops at the first
// mismatch, so bytes after a mismatch are never read.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    const char expected[] = { b0, b1, b2, b3, b4 };
    for (size_t index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
55
// Returns true when the six bytes starting at |p| are exactly b0..b5.
// Bytes are compared left to right and the comparison stops at the first
// mismatch, so bytes after a mismatch are never read.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5 };
    for (size_t index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
60
// Returns true when the eight bytes starting at |p| are exactly b0..b7.
// Bytes are compared left to right and the comparison stops at the first
// mismatch, so bytes after a mismatch are never read.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7 };
    for (size_t index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
65
// Returns true when the ten bytes starting at |p| are exactly b0..b9.
// Bytes are compared left to right and the comparison stops at the first
// mismatch, so bytes after a mismatch are never read.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 };
    for (size_t index = 0; index < sizeof(expected); ++index) {
        if (p[index] != expected[index])
            return false;
    }
    return true;
}
70
71// You might think we should put these find functions elsewhere, perhaps with the
72// similar functions that operate on UChar, but arguably only the decoder has
73// a reason to process strings of char rather than UChar.
74
// Searches the first |subjectLength| bytes of |subject| for the
// NUL-terminated string |target|. Returns the offset of the first occurrence,
// or -1 when there is none (including when |target| is longer than the
// subject). An empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t needleLength = strlen(target);
    if (needleLength > subjectLength)
        return -1;

    // Last offset at which the whole needle still fits inside the subject.
    const size_t lastStart = subjectLength - needleLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < needleLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == needleLength)
            return static_cast<int>(start);
    }
    return -1;
}
93
Torne (Richard Coles)81a51572013-05-13 16:52:28 +010094static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +010095{
96 Vector<char, 64> buffer(length + 1);
97 memcpy(buffer.data(), encodingName, length);
98 buffer[length] = '\0';
99 return buffer.data();
100}
101
// Helper used to guess which Japanese encoding a run of bytes is written in.
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Guesses the encoding of the |length| bytes starting at |str|.
    static enum Type judge(const char* str, int length);

    // The escape byte that introduces the sequences judge() looks for.
    static const int ESC = 0x1b;

    // Per-byte flags, indexed by byte value:
    //   1 - bytes 0x81-0x9f and 0xe0-0xfc
    //   2 - bytes 0xa1-0xdf
    //   0 - everything else
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is a byte value flagged with 1 in sjisMap.
    // Values of 0x100 and above are never flagged.
    static int ISkanji(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 1 : 0;
    }

    // Non-zero when |code| is a byte value flagged with 2 in sjisMap.
    // Values of 0x100 and above are never flagged.
    static int ISkana(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 2 : 0;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
140
141/*
142 * EUC-JP is
143 * [0xa1 - 0xfe][0xa1 - 0xfe]
144 * 0x8e[0xa1 - 0xfe](SS2)
145 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
146 *
147 * Shift_Jis is
148 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
149 *
150 * Shift_Jis Hankaku Kana is
151 * [0xa1 - 0xdf]
152 */
153
154/*
155 * KanjiCode::judge() is based on judge_jcode() from jvim
156 * http://hp.vector.co.jp/authors/VA003457/vim/
157 *
158 * Special Thanks to Kenichi Tsuchida
159 */
160
161enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
162{
163 enum Type code;
164 int i;
165 int bfr = false; /* Kana Moji */
166 int bfk = 0; /* EUC Kana */
167 int sjis = 0;
168 int euc = 0;
169
170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
171
172 code = ASCII;
173
174 i = 0;
175 while (i < size) {
176 if (ptr[i] == ESC && (size - i >= 3)) {
177 if (bytesEqual(str + i + 1, '$', 'B')
178 || bytesEqual(str + i + 1, '(', 'B')
179 || bytesEqual(str + i + 1, '$', '@')
180 || bytesEqual(str + i + 1, '(', 'J')) {
181 code = JIS;
182 goto breakBreak;
183 }
184 if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')', 'I')) {
185 code = JIS;
186 i += 3;
187 } else {
188 i++;
189 }
190 bfr = false;
191 bfk = 0;
192 } else {
193 if (ptr[i] < 0x20) {
194 bfr = false;
195 bfk = 0;
196 /* ?? check kudokuten ?? && ?? hiragana ?? */
197 if ((i >= 2) && (ptr[i - 2] == 0x81)
198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
199 code = SJIS;
200 sjis += 100; /* kudokuten */
201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
202 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
203 code = EUC;
204 euc += 100; /* kudokuten */
205 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
206 sjis += 40; /* hiragana */
207 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
208 euc += 40; /* hiragana */
209 }
210 } else {
211 /* ?? check hiragana or katana ?? */
212 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
213 sjis++; /* hiragana */
214 } else if ((size - i > 1) && (ptr[i] == 0x83)
215 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
216 sjis++; /* katakana */
217 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
218 euc++; /* hiragana */
219 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
220 euc++; /* katakana */
221 }
222 if (bfr) {
223 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
224 code = SJIS;
225 goto breakBreak;
226 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
227 code = SJIS;
228 goto breakBreak;
229 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
230 code = EUC;
231 goto breakBreak;
232 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
233 code = EUC;
234 goto breakBreak;
235 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
236 code = SJIS;
237 goto breakBreak;
238 } else if (ptr[i] <= 0x7f) {
239 code = SJIS;
240 goto breakBreak;
241 } else {
242 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
243 euc++; /* sjis hankaku kana kigo */
244 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
245 ; /* sjis hankaku kana */
246 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
247 euc++;
248 } else if (0x8e == ptr[i]) {
249 euc++;
250 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
251 sjis++;
252 }
253 bfr = false;
254 bfk = 0;
255 }
256 } else if (0x8e == ptr[i]) {
257 if (size - i <= 1) {
258 ;
259 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
260 /* EUC KANA or SJIS KANJI */
261 if (bfk == 1) {
262 euc += 100;
263 }
264 bfk++;
265 i++;
266 } else {
267 /* SJIS only */
268 code = SJIS;
269 goto breakBreak;
270 }
271 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
272 /* SJIS only */
273 code = SJIS;
274 if ((size - i >= 1)
275 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
276 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
277 goto breakBreak;
278 }
279 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
280 /* EUC only */
281 code = EUC;
282 if ((size - i >= 1)
283 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
284 goto breakBreak;
285 }
286 } else if (ptr[i] <= 0x7f) {
287 ;
288 } else {
289 bfr = true;
290 bfk = 0;
291 }
292 }
293 i++;
294 }
295 }
296 if (code == ASCII) {
297 if (sjis > euc) {
298 code = SJIS;
299 } else if (sjis < euc) {
300 code = EUC;
301 }
302 }
303breakBreak:
304 return (code);
305}
306
307TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
308{
309 if (equalIgnoringCase(mimeType, "text/css"))
310 return CSS;
311 if (equalIgnoringCase(mimeType, "text/html"))
312 return HTML;
313 if (DOMImplementation::isXMLMIMEType(mimeType))
314 return XML;
315 return PlainText;
316}
317
Torne (Richard Coles)81a51572013-05-13 16:52:28 +0100318const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const WTF::TextEncoding& specifiedDefaultEncoding)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100319{
Ben Murdoch02772c62013-07-26 10:21:05 +0100320 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100321 // for text/xml. This matches Firefox.
322 if (contentType == XML)
323 return UTF8Encoding();
324 if (!specifiedDefaultEncoding.isValid())
325 return Latin1Encoding();
326 return specifiedDefaultEncoding;
327}
328
Torne (Richard Coles)81a51572013-05-13 16:52:28 +0100329TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100330 : m_contentType(determineContentType(mimeType))
331 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
332 , m_source(DefaultEncoding)
333 , m_hintEncoding(0)
334 , m_checkedForBOM(false)
335 , m_checkedForCSSCharset(false)
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100336 , m_checkedForXMLCharset(false)
337 , m_checkedForMetaCharset(false)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100338 , m_useLenientXMLDecoding(false)
339 , m_sawError(false)
340 , m_usesEncodingDetector(usesEncodingDetector)
341{
342}
343
344TextResourceDecoder::~TextResourceDecoder()
345{
346}
347
Torne (Richard Coles)81a51572013-05-13 16:52:28 +0100348void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100349{
350 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
351 if (!encoding.isValid())
352 return;
353
354 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
355 // treat x-user-defined as windows-1252 (bug 18270)
356 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
357 m_encoding = "windows-1252";
Ben Murdoch02772c62013-07-26 10:21:05 +0100358 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100359 m_encoding = encoding.closestByteBasedEquivalent();
360 else
361 m_encoding = encoding;
362
363 m_codec.clear();
364 m_source = source;
365}
366
367// Returns the position of the encoding string.
368static int findXMLEncoding(const char* str, int len, int& encodingLength)
369{
370 int pos = find(str, len, "encoding");
371 if (pos == -1)
372 return -1;
373 pos += 8;
Ben Murdoch02772c62013-07-26 10:21:05 +0100374
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100375 // Skip spaces and stray control characters.
376 while (pos < len && str[pos] <= ' ')
377 ++pos;
378
379 // Skip equals sign.
380 if (pos >= len || str[pos] != '=')
381 return -1;
382 ++pos;
383
384 // Skip spaces and stray control characters.
385 while (pos < len && str[pos] <= ' ')
386 ++pos;
387
388 // Skip quotation mark.
389 if (pos >= len)
390 return - 1;
391 char quoteMark = str[pos];
392 if (quoteMark != '"' && quoteMark != '\'')
393 return -1;
394 ++pos;
395
396 // Find the trailing quotation mark.
397 int end = pos;
398 while (end < len && str[end] != quoteMark)
399 ++end;
400 if (end >= len)
401 return -1;
402
403 encodingLength = end - pos;
404 return pos;
405}
406
// Advances |pos| past any run of tabs and spaces, never beyond |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| stopped before |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != '\t' && *pos != ' ')
            break;
    }
    return pos != dataEnd;
}
414
415size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
416{
417 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
418 // We let it override even a user-chosen encoding.
419 ASSERT(!m_checkedForBOM);
420
421 size_t lengthOfBOM = 0;
422
423 size_t bufferLength = m_buffer.size();
424
425 size_t buf1Len = bufferLength;
426 size_t buf2Len = len;
427 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
428 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
429 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
430 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
431 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
432 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
433
434 // Check for the BOM.
435 if (c1 == 0xFF && c2 == 0xFE) {
436 if (c3 != 0 || c4 != 0) {
437 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
438 lengthOfBOM = 2;
439 } else {
440 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
441 lengthOfBOM = 4;
442 }
443 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
444 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
445 lengthOfBOM = 3;
446 } else if (c1 == 0xFE && c2 == 0xFF) {
447 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
448 lengthOfBOM = 2;
449 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
450 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
451 lengthOfBOM = 4;
452 }
453
454 if (lengthOfBOM || bufferLength + len >= 4)
455 m_checkedForBOM = true;
456
457 return lengthOfBOM;
458}
459
460bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
461{
462 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
463 m_checkedForCSSCharset = true;
464 return true;
465 }
466
467 size_t oldSize = m_buffer.size();
468 m_buffer.grow(oldSize + len);
469 memcpy(m_buffer.data() + oldSize, data, len);
470
471 movedDataToBuffer = true;
472
473 if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13
474 return false;
475
476 const char* dataStart = m_buffer.data();
477 const char* dataEnd = dataStart + m_buffer.size();
478
479 if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {
480 dataStart += 10;
481 const char* pos = dataStart;
482
483 while (pos < dataEnd && *pos != '"')
484 ++pos;
485 if (pos == dataEnd)
486 return false;
487
488 int encodingNameLength = pos - dataStart;
Ben Murdoch02772c62013-07-26 10:21:05 +0100489
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100490 ++pos;
491
492 if (*pos == ';')
493 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
494 }
495
496 m_checkedForCSSCharset = true;
497 return true;
498}
499
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100500bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100501{
502 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100503 m_checkedForXMLCharset = true;
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100504 return true;
505 }
506
507 // This is not completely efficient, since the function might go
508 // through the HTML head several times.
509
510 size_t oldSize = m_buffer.size();
511 m_buffer.grow(oldSize + len);
512 memcpy(m_buffer.data() + oldSize, data, len);
513
514 movedDataToBuffer = true;
515
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100516 const char* ptr = m_buffer.data();
517 const char* pEnd = ptr + m_buffer.size();
518
519 // Is there enough data available to check for XML declaration?
520 if (m_buffer.size() < 8)
521 return false;
522
523 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
524 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
525 if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {
526 const char* xmlDeclarationEnd = ptr;
527 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
528 ++xmlDeclarationEnd;
529 if (xmlDeclarationEnd == pEnd)
530 return false;
531 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
532 int len = 0;
533 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
534 if (pos != -1)
535 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
536 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100537 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100538 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100539 else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100540 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100541 else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100542 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100543 else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100544 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100545
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100546 m_checkedForXMLCharset = true;
547 return true;
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100548}
549
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100550void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100551{
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100552 if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) {
553 m_checkedForMetaCharset = true;
554 return;
555 }
556
557 if (!m_charsetParser)
558 m_charsetParser = HTMLMetaCharsetParser::create();
559
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100560 if (!m_charsetParser->checkForMetaCharset(data, length))
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100561 return;
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100562
563 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
564 m_charsetParser.clear();
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100565 m_checkedForMetaCharset = true;
566 return;
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100567}
568
569void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
570{
571 switch (KanjiCode::judge(data, len)) {
572 case KanjiCode::JIS:
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100573 setEncoding("ISO-2022-JP", EncodingFromContentSniffing);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100574 break;
575 case KanjiCode::EUC:
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100576 setEncoding("EUC-JP", EncodingFromContentSniffing);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100577 break;
578 case KanjiCode::SJIS:
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100579 setEncoding("Shift_JIS", EncodingFromContentSniffing);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100580 break;
581 case KanjiCode::ASCII:
582 case KanjiCode::UTF16:
583 case KanjiCode::UTF8:
584 break;
585 }
586}
587
588// We use the encoding detector in two cases:
589// 1. Encoding detector is turned ON and no other encoding source is
590// available (that is, it's DefaultEncoding).
591// 2. Encoding detector is turned ON and the encoding is set to
592// the encoding of the parent frame, which is also auto-detected.
593// Note that condition #2 is NOT satisfied unless parent-child frame
594// relationship is compliant to the same-origin policy. If they're from
595// different domains, |m_source| would not be set to EncodingFromParentFrame
Ben Murdoch02772c62013-07-26 10:21:05 +0100596// in the first place.
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100597bool TextResourceDecoder::shouldAutoDetect() const
598{
599 // Just checking m_hintEncoding suffices here because it's only set
600 // in setHintEncoding when the source is AutoDetectedEncoding.
601 return m_usesEncodingDetector
Ben Murdoch02772c62013-07-26 10:21:05 +0100602 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100603}
604
605String TextResourceDecoder::decode(const char* data, size_t len)
606{
607 size_t lengthOfBOM = 0;
608 if (!m_checkedForBOM)
609 lengthOfBOM = checkForBOM(data, len);
610
611 bool movedDataToBuffer = false;
612
613 if (m_contentType == CSS && !m_checkedForCSSCharset)
614 if (!checkForCSSCharset(data, len, movedDataToBuffer))
615 return emptyString();
616
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100617 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLCharset)
618 if (!checkForXMLCharset(data, len, movedDataToBuffer))
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100619 return emptyString();
620
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100621 // FIXME: It would be more efficient to move this logic below checkForMetaCharset because
622 // checkForMetaCharset can overrule these detections.
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100623 if (shouldAutoDetect()) {
624 if (m_encoding.isJapanese())
625 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
626 else {
Torne (Richard Coles)81a51572013-05-13 16:52:28 +0100627 WTF::TextEncoding detectedEncoding;
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100628 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100629 setEncoding(detectedEncoding, EncodingFromContentSniffing);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100630 }
631 }
632
633 ASSERT(m_encoding.isValid());
634
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100635 const char* dataForDecode = data + lengthOfBOM;
636 size_t lengthForDecode = len - lengthOfBOM;
637
638 if (!m_buffer.isEmpty()) {
639 if (!movedDataToBuffer) {
640 size_t oldSize = m_buffer.size();
641 m_buffer.grow(oldSize + len);
642 memcpy(m_buffer.data() + oldSize, data, len);
643 }
644
645 dataForDecode = m_buffer.data() + lengthOfBOM;
646 lengthForDecode = m_buffer.size() - lengthOfBOM;
647 }
648
649 if (m_contentType == HTML && !m_checkedForMetaCharset)
650 checkForMetaCharset(dataForDecode, lengthForDecode);
651
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100652 if (!m_codec)
653 m_codec = newTextCodec(m_encoding);
654
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100655 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100656
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100657 m_buffer.clear();
658 return result;
659}
660
661String TextResourceDecoder::flush()
662{
663 // If we can not identify the encoding even after a document is completely
664 // loaded, we need to detect the encoding if other conditions for
665 // autodetection is satisfied.
666 if (m_buffer.size() && shouldAutoDetect()
Torne (Richard Coles)93ac45c2013-05-29 14:40:20 +0100667 && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
668 WTF::TextEncoding detectedEncoding;
669 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
670 setEncoding(detectedEncoding, EncodingFromContentSniffing);
Torne (Richard Coles)53e740f2013-05-09 18:38:43 +0100671 }
672
673 if (!m_codec)
674 m_codec = newTextCodec(m_encoding);
675
676 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
677 m_buffer.clear();
678 m_codec.clear();
679 m_checkedForBOM = false; // Skip BOM again when re-decoding.
680 return result;
681}
682
683}