Blame - Source/core/loader/TextResourceDecoder.cpp - fp2-dev/platform/external/chromium_org/third_party/WebKit

blob: 139fcf1cb649d7b285eac564f5645b9772c84440 [file] [log] [blame]

Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	1	/*
				2	Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
				3	Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
				4	Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
				5
				6	This library is free software; you can redistribute it and/or
				7	modify it under the terms of the GNU Library General Public
				8	License as published by the Free Software Foundation; either
				9	version 2 of the License, or (at your option) any later version.
				10
				11	This library is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	Library General Public License for more details.
				15
				16	You should have received a copy of the GNU Library General Public License
				17	along with this library; see the file COPYING.LIB. If not, write to
				18	the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
				19	Boston, MA 02110-1301, USA.
				20	*/
				21
				22
				23	#include "config.h"
				24	#include "core/loader/TextResourceDecoder.h"
				25
				26	#include "HTMLNames.h"
				27	#include "core/dom/DOMImplementation.h"
				28	#include "core/html/parser/HTMLMetaCharsetParser.h"
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	29	#include "core/platform/text/TextEncodingDetector.h"
Torne (Richard Coles)	81a5157	2013-05-13 16:52:28 +0100	[diff] [blame]	30	#include "wtf/StringExtras.h"
				31	#include "wtf/text/TextCodec.h"
				32	#include "wtf/text/TextEncoding.h"
				33	#include "wtf/text/TextEncodingRegistry.h"
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	34
				35	using namespace WTF;
				36
				37	namespace WebCore {
				38
				39	using namespace HTMLNames;
				40
				41	static inline bool bytesEqual(const char* p, char b0, char b1)
				42	{
				43	return p[0] == b0 && p[1] == b1;
				44	}
				45
				46	static inline bool bytesEqual(const char* p, char b0, char b1, char b2)
				47	{
				48	return p[0] == b0 && p[1] == b1 && p[2] == b2;
				49	}
				50
				51	static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
				52	{
				53	return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4;
				54	}
				55
				56	static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
				57	{
				58	return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5;
				59	}
				60
				61	static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
				62	{
				63	return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5 && p[6] == b6 && p[7] == b7;
				64	}
				65
				66	static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
				67	{
				68	return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5 && p[6] == b6 && p[7] == b7 && p[8] == b8 && p[9] == b9;
				69	}
				70
				71	// You might think we should put these find functions elsewhere, perhaps with the
				72	// similar functions that operate on UChar, but arguably only the decoder has
				73	// a reason to process strings of char rather than UChar.
				74
				75	static int find(const char* subject, size_t subjectLength, const char* target)
				76	{
				77	size_t targetLength = strlen(target);
				78	if (targetLength > subjectLength)
				79	return -1;
				80	for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
				81	bool match = true;
				82	for (size_t j = 0; j < targetLength; ++j) {
				83	if (subject[i + j] != target[j]) {
				84	match = false;
				85	break;
				86	}
				87	}
				88	if (match)
				89	return i;
				90	}
				91	return -1;
				92	}
				93
Torne (Richard Coles)	81a5157	2013-05-13 16:52:28 +0100	[diff] [blame]	94	static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	95	{
				96	Vector<char, 64> buffer(length + 1);
				97	memcpy(buffer.data(), encodingName, length);
				98	buffer[length] = '\0';
				99	return buffer.data();
				100	}
				101
				102	class KanjiCode {
				103	public:
				104	enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
				105	static enum Type judge(const char* str, int length);
				106	static const int ESC = 0x1b;
				107	static const unsigned char sjisMap[256];
				108	static int ISkanji(int code)
				109	{
				110	if (code >= 0x100)
				111	return 0;
				112	return sjisMap[code & 0xff] & 1;
				113	}
				114	static int ISkana(int code)
				115	{
				116	if (code >= 0x100)
				117	return 0;
				118	return sjisMap[code & 0xff] & 2;
				119	}
				120	};
				121
				122	const unsigned char KanjiCode::sjisMap[256] = {
				123	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				124	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				125	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				126	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				127	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				128	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				129	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				130	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				131	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				132	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				133	0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				134	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				135	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				136	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				137	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				138	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
				139	};
				140
				141	/*
				142	* EUC-JP is
				143	* [0xa1 - 0xfe][0xa1 - 0xfe]
				144	* 0x8e[0xa1 - 0xfe](SS2)
				145	* 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
				146	*
				147	* Shift_Jis is
				148	* [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
				149	*
				150	* Shift_Jis Hankaku Kana is
				151	* [0xa1 - 0xdf]
				152	*/
				153
				154	/*
				155	* KanjiCode::judge() is based on judge_jcode() from jvim
				156	* http://hp.vector.co.jp/authors/VA003457/vim/
				157	*
				158	* Special Thanks to Kenichi Tsuchida
				159	*/
				160
				161	enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
				162	{
				163	enum Type code;
				164	int i;
				165	int bfr = false; /* Kana Moji */
				166	int bfk = 0; /* EUC Kana */
				167	int sjis = 0;
				168	int euc = 0;
				169
				170	const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
				171
				172	code = ASCII;
				173
				174	i = 0;
				175	while (i < size) {
				176	if (ptr[i] == ESC && (size - i >= 3)) {
				177	if (bytesEqual(str + i + 1, '$', 'B')
				178	\|\| bytesEqual(str + i + 1, '(', 'B')
				179	\|\| bytesEqual(str + i + 1, '$', '@')
				180	\|\| bytesEqual(str + i + 1, '(', 'J')) {
				181	code = JIS;
				182	goto breakBreak;
				183	}
				184	if (bytesEqual(str + i + 1, '(', 'I') \|\| bytesEqual(str + i + 1, ')', 'I')) {
				185	code = JIS;
				186	i += 3;
				187	} else {
				188	i++;
				189	}
				190	bfr = false;
				191	bfk = 0;
				192	} else {
				193	if (ptr[i] < 0x20) {
				194	bfr = false;
				195	bfk = 0;
				196	/* ?? check kudokuten ?? && ?? hiragana ?? */
				197	if ((i >= 2) && (ptr[i - 2] == 0x81)
				198	&& (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
				199	code = SJIS;
				200	sjis += 100; /* kudokuten */
				201	} else if ((i >= 2) && (ptr[i - 2] == 0xa1)
				202	&& (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
				203	code = EUC;
				204	euc += 100; /* kudokuten */
				205	} else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
				206	sjis += 40; /* hiragana */
				207	} else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
				208	euc += 40; /* hiragana */
				209	}
				210	} else {
				211	/* ?? check hiragana or katana ?? */
				212	if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
				213	sjis++; /* hiragana */
				214	} else if ((size - i > 1) && (ptr[i] == 0x83)
				215	&& (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
				216	sjis++; /* katakana */
				217	} else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
				218	euc++; /* hiragana */
				219	} else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
				220	euc++; /* katakana */
				221	}
				222	if (bfr) {
				223	if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
				224	code = SJIS;
				225	goto breakBreak;
				226	} else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) \|\| (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
				227	code = SJIS;
				228	goto breakBreak;
				229	} else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
				230	code = EUC;
				231	goto breakBreak;
				232	} else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
				233	code = EUC;
				234	goto breakBreak;
				235	} else if ((i >= 1) && (ptr[i] < 0xa0 \|\| 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
				236	code = SJIS;
				237	goto breakBreak;
				238	} else if (ptr[i] <= 0x7f) {
				239	code = SJIS;
				240	goto breakBreak;
				241	} else {
				242	if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
				243	euc++; /* sjis hankaku kana kigo */
				244	} else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
				245	; /* sjis hankaku kana */
				246	} else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
				247	euc++;
				248	} else if (0x8e == ptr[i]) {
				249	euc++;
				250	} else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
				251	sjis++;
				252	}
				253	bfr = false;
				254	bfk = 0;
				255	}
				256	} else if (0x8e == ptr[i]) {
				257	if (size - i <= 1) {
				258	;
				259	} else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
				260	/* EUC KANA or SJIS KANJI */
				261	if (bfk == 1) {
				262	euc += 100;
				263	}
				264	bfk++;
				265	i++;
				266	} else {
				267	/* SJIS only */
				268	code = SJIS;
				269	goto breakBreak;
				270	}
				271	} else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
				272	/* SJIS only */
				273	code = SJIS;
				274	if ((size - i >= 1)
				275	&& ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
				276	\|\| (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
				277	goto breakBreak;
				278	}
				279	} else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
				280	/* EUC only */
				281	code = EUC;
				282	if ((size - i >= 1)
				283	&& (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
				284	goto breakBreak;
				285	}
				286	} else if (ptr[i] <= 0x7f) {
				287	;
				288	} else {
				289	bfr = true;
				290	bfk = 0;
				291	}
				292	}
				293	i++;
				294	}
				295	}
				296	if (code == ASCII) {
				297	if (sjis > euc) {
				298	code = SJIS;
				299	} else if (sjis < euc) {
				300	code = EUC;
				301	}
				302	}
				303	breakBreak:
				304	return (code);
				305	}
				306
				307	TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
				308	{
				309	if (equalIgnoringCase(mimeType, "text/css"))
				310	return CSS;
				311	if (equalIgnoringCase(mimeType, "text/html"))
				312	return HTML;
				313	if (DOMImplementation::isXMLMIMEType(mimeType))
				314	return XML;
				315	return PlainText;
				316	}
				317
Torne (Richard Coles)	81a5157	2013-05-13 16:52:28 +0100	[diff] [blame]	318	const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const WTF::TextEncoding& specifiedDefaultEncoding)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	319	{
Ben Murdoch	02772c6	2013-07-26 10:21:05 +0100	[diff] [blame]	320	// Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	321	// for text/xml. This matches Firefox.
				322	if (contentType == XML)
				323	return UTF8Encoding();
				324	if (!specifiedDefaultEncoding.isValid())
				325	return Latin1Encoding();
				326	return specifiedDefaultEncoding;
				327	}
				328
Torne (Richard Coles)	81a5157	2013-05-13 16:52:28 +0100	[diff] [blame]	329	TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	330	: m_contentType(determineContentType(mimeType))
				331	, m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
				332	, m_source(DefaultEncoding)
				333	, m_hintEncoding(0)
				334	, m_checkedForBOM(false)
				335	, m_checkedForCSSCharset(false)
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	336	, m_checkedForXMLCharset(false)
				337	, m_checkedForMetaCharset(false)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	338	, m_useLenientXMLDecoding(false)
				339	, m_sawError(false)
				340	, m_usesEncodingDetector(usesEncodingDetector)
				341	{
				342	}
				343
				344	TextResourceDecoder::~TextResourceDecoder()
				345	{
				346	}
				347
Torne (Richard Coles)	81a5157	2013-05-13 16:52:28 +0100	[diff] [blame]	348	void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	349	{
				350	// In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
				351	if (!encoding.isValid())
				352	return;
				353
				354	// When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
				355	// treat x-user-defined as windows-1252 (bug 18270)
				356	if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
				357	m_encoding = "windows-1252";
Ben Murdoch	02772c6	2013-07-26 10:21:05 +0100	[diff] [blame]	358	else if (source == EncodingFromMetaTag \|\| source == EncodingFromXMLHeader \|\| source == EncodingFromCSSCharset)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	359	m_encoding = encoding.closestByteBasedEquivalent();
				360	else
				361	m_encoding = encoding;
				362
				363	m_codec.clear();
				364	m_source = source;
				365	}
				366
				367	// Returns the position of the encoding string.
				368	static int findXMLEncoding(const char* str, int len, int& encodingLength)
				369	{
				370	int pos = find(str, len, "encoding");
				371	if (pos == -1)
				372	return -1;
				373	pos += 8;
Ben Murdoch	02772c6	2013-07-26 10:21:05 +0100	[diff] [blame]	374
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	375	// Skip spaces and stray control characters.
				376	while (pos < len && str[pos] <= ' ')
				377	++pos;
				378
				379	// Skip equals sign.
				380	if (pos >= len \|\| str[pos] != '=')
				381	return -1;
				382	++pos;
				383
				384	// Skip spaces and stray control characters.
				385	while (pos < len && str[pos] <= ' ')
				386	++pos;
				387
				388	// Skip quotation mark.
				389	if (pos >= len)
				390	return - 1;
				391	char quoteMark = str[pos];
				392	if (quoteMark != '"' && quoteMark != '\'')
				393	return -1;
				394	++pos;
				395
				396	// Find the trailing quotation mark.
				397	int end = pos;
				398	while (end < len && str[end] != quoteMark)
				399	++end;
				400	if (end >= len)
				401	return -1;
				402
				403	encodingLength = end - pos;
				404	return pos;
				405	}
				406
				407	// true if there is more to parse
				408	static inline bool skipWhitespace(const char& pos, const char dataEnd)
				409	{
				410	while (pos < dataEnd && (pos == '\t' \|\| pos == ' '))
				411	++pos;
				412	return pos != dataEnd;
				413	}
				414
				415	size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
				416	{
				417	// Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
				418	// We let it override even a user-chosen encoding.
				419	ASSERT(!m_checkedForBOM);
				420
				421	size_t lengthOfBOM = 0;
				422
				423	size_t bufferLength = m_buffer.size();
				424
				425	size_t buf1Len = bufferLength;
				426	size_t buf2Len = len;
				427	const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
				428	const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
				429	unsigned char c1 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, buf2++) : 0;
				430	unsigned char c2 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, buf2++) : 0;
				431	unsigned char c3 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, buf2++) : 0;
				432	unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
				433
				434	// Check for the BOM.
				435	if (c1 == 0xFF && c2 == 0xFE) {
				436	if (c3 != 0 \|\| c4 != 0) {
				437	setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
				438	lengthOfBOM = 2;
				439	} else {
				440	setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
				441	lengthOfBOM = 4;
				442	}
				443	} else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
				444	setEncoding(UTF8Encoding(), AutoDetectedEncoding);
				445	lengthOfBOM = 3;
				446	} else if (c1 == 0xFE && c2 == 0xFF) {
				447	setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
				448	lengthOfBOM = 2;
				449	} else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
				450	setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
				451	lengthOfBOM = 4;
				452	}
				453
				454	if (lengthOfBOM \|\| bufferLength + len >= 4)
				455	m_checkedForBOM = true;
				456
				457	return lengthOfBOM;
				458	}
				459
				460	bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
				461	{
				462	if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
				463	m_checkedForCSSCharset = true;
				464	return true;
				465	}
				466
				467	size_t oldSize = m_buffer.size();
				468	m_buffer.grow(oldSize + len);
				469	memcpy(m_buffer.data() + oldSize, data, len);
				470
				471	movedDataToBuffer = true;
				472
				473	if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13
				474	return false;
				475
				476	const char* dataStart = m_buffer.data();
				477	const char* dataEnd = dataStart + m_buffer.size();
				478
				479	if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {
				480	dataStart += 10;
				481	const char* pos = dataStart;
				482
				483	while (pos < dataEnd && *pos != '"')
				484	++pos;
				485	if (pos == dataEnd)
				486	return false;
				487
				488	int encodingNameLength = pos - dataStart;
Ben Murdoch	02772c6	2013-07-26 10:21:05 +0100	[diff] [blame]	489
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	490	++pos;
				491
				492	if (*pos == ';')
				493	setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
				494	}
				495
				496	m_checkedForCSSCharset = true;
				497	return true;
				498	}
				499
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	500	bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	501	{
				502	if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	503	m_checkedForXMLCharset = true;
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	504	return true;
				505	}
				506
				507	// This is not completely efficient, since the function might go
				508	// through the HTML head several times.
				509
				510	size_t oldSize = m_buffer.size();
				511	m_buffer.grow(oldSize + len);
				512	memcpy(m_buffer.data() + oldSize, data, len);
				513
				514	movedDataToBuffer = true;
				515
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	516	const char* ptr = m_buffer.data();
				517	const char* pEnd = ptr + m_buffer.size();
				518
				519	// Is there enough data available to check for XML declaration?
				520	if (m_buffer.size() < 8)
				521	return false;
				522
				523	// Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
				524	// It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
				525	if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {
				526	const char* xmlDeclarationEnd = ptr;
				527	while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
				528	++xmlDeclarationEnd;
				529	if (xmlDeclarationEnd == pEnd)
				530	return false;
				531	// No need for +1, because we have an extra "?" to lose at the end of XML declaration.
				532	int len = 0;
				533	int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
				534	if (pos != -1)
				535	setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
				536	// continue looking for a charset - it may be specified in an HTTP-Equiv meta
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	537	} else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	538	setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	539	else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	540	setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	541	else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	542	setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	543	else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	544	setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	545
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	546	m_checkedForXMLCharset = true;
				547	return true;
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	548	}
				549
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	550	void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	551	{
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	552	if (m_source == UserChosenEncoding \|\| m_source == EncodingFromHTTPHeader \|\| m_source == AutoDetectedEncoding) {
				553	m_checkedForMetaCharset = true;
				554	return;
				555	}
				556
				557	if (!m_charsetParser)
				558	m_charsetParser = HTMLMetaCharsetParser::create();
				559
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	560	if (!m_charsetParser->checkForMetaCharset(data, length))
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	561	return;
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	562
				563	setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
				564	m_charsetParser.clear();
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	565	m_checkedForMetaCharset = true;
				566	return;
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	567	}
				568
				569	void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
				570	{
				571	switch (KanjiCode::judge(data, len)) {
				572	case KanjiCode::JIS:
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	573	setEncoding("ISO-2022-JP", EncodingFromContentSniffing);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	574	break;
				575	case KanjiCode::EUC:
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	576	setEncoding("EUC-JP", EncodingFromContentSniffing);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	577	break;
				578	case KanjiCode::SJIS:
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	579	setEncoding("Shift_JIS", EncodingFromContentSniffing);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	580	break;
				581	case KanjiCode::ASCII:
				582	case KanjiCode::UTF16:
				583	case KanjiCode::UTF8:
				584	break;
				585	}
				586	}
				587
				588	// We use the encoding detector in two cases:
				589	// 1. Encoding detector is turned ON and no other encoding source is
				590	// available (that is, it's DefaultEncoding).
				591	// 2. Encoding detector is turned ON and the encoding is set to
				592	// the encoding of the parent frame, which is also auto-detected.
				593	// Note that condition #2 is NOT satisfied unless parent-child frame
				594	// relationship is compliant to the same-origin policy. If they're from
				595	// different domains, \|m_source\| would not be set to EncodingFromParentFrame
Ben Murdoch	02772c6	2013-07-26 10:21:05 +0100	[diff] [blame]	596	// in the first place.
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	597	bool TextResourceDecoder::shouldAutoDetect() const
				598	{
				599	// Just checking m_hintEncoding suffices here because it's only set
				600	// in setHintEncoding when the source is AutoDetectedEncoding.
				601	return m_usesEncodingDetector
Ben Murdoch	02772c6	2013-07-26 10:21:05 +0100	[diff] [blame]	602	&& (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding));
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	603	}
				604
				605	String TextResourceDecoder::decode(const char* data, size_t len)
				606	{
				607	size_t lengthOfBOM = 0;
				608	if (!m_checkedForBOM)
				609	lengthOfBOM = checkForBOM(data, len);
				610
				611	bool movedDataToBuffer = false;
				612
				613	if (m_contentType == CSS && !m_checkedForCSSCharset)
				614	if (!checkForCSSCharset(data, len, movedDataToBuffer))
				615	return emptyString();
				616
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	617	if ((m_contentType == HTML \|\| m_contentType == XML) && !m_checkedForXMLCharset)
				618	if (!checkForXMLCharset(data, len, movedDataToBuffer))
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	619	return emptyString();
				620
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	621	// FIXME: It would be more efficient to move this logic below checkForMetaCharset because
				622	// checkForMetaCharset can overrule these detections.
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	623	if (shouldAutoDetect()) {
				624	if (m_encoding.isJapanese())
				625	detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
				626	else {
Torne (Richard Coles)	81a5157	2013-05-13 16:52:28 +0100	[diff] [blame]	627	WTF::TextEncoding detectedEncoding;
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	628	if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	629	setEncoding(detectedEncoding, EncodingFromContentSniffing);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	630	}
				631	}
				632
				633	ASSERT(m_encoding.isValid());
				634
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	635	const char* dataForDecode = data + lengthOfBOM;
				636	size_t lengthForDecode = len - lengthOfBOM;
				637
				638	if (!m_buffer.isEmpty()) {
				639	if (!movedDataToBuffer) {
				640	size_t oldSize = m_buffer.size();
				641	m_buffer.grow(oldSize + len);
				642	memcpy(m_buffer.data() + oldSize, data, len);
				643	}
				644
				645	dataForDecode = m_buffer.data() + lengthOfBOM;
				646	lengthForDecode = m_buffer.size() - lengthOfBOM;
				647	}
				648
				649	if (m_contentType == HTML && !m_checkedForMetaCharset)
				650	checkForMetaCharset(dataForDecode, lengthForDecode);
				651
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	652	if (!m_codec)
				653	m_codec = newTextCodec(m_encoding);
				654
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	655	String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	656
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	657	m_buffer.clear();
				658	return result;
				659	}
				660
				661	String TextResourceDecoder::flush()
				662	{
				663	// If we can not identify the encoding even after a document is completely
				664	// loaded, we need to detect the encoding if other conditions for
				665	// autodetection is satisfied.
				666	if (m_buffer.size() && shouldAutoDetect()
Torne (Richard Coles)	93ac45c	2013-05-29 14:40:20 +0100	[diff] [blame]	667	&& ((!m_checkedForXMLCharset && (m_contentType == HTML \|\| m_contentType == XML)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
				668	WTF::TextEncoding detectedEncoding;
				669	if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
				670	setEncoding(detectedEncoding, EncodingFromContentSniffing);
Torne (Richard Coles)	53e740f	2013-05-09 18:38:43 +0100	[diff] [blame]	671	}
				672
				673	if (!m_codec)
				674	m_codec = newTextCodec(m_encoding);
				675
				676	String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
				677	m_buffer.clear();
				678	m_codec.clear();
				679	m_checkedForBOM = false; // Skip BOM again when re-decoding.
				680	return result;
				681	}
				682
				683	}