001 // Copyright (c) 2011, Mike Samuel
002 // All rights reserved.
003 //
004 // Redistribution and use in source and binary forms, with or without
005 // modification, are permitted provided that the following conditions
006 // are met:
007 //
008 // Redistributions of source code must retain the above copyright
009 // notice, this list of conditions and the following disclaimer.
010 // Redistributions in binary form must reproduce the above copyright
011 // notice, this list of conditions and the following disclaimer in the
012 // documentation and/or other materials provided with the distribution.
013 // Neither the name of the OWASP nor the names of its contributors may
014 // be used to endorse or promote products derived from this software
015 // without specific prior written permission.
016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027 // POSSIBILITY OF SUCH DAMAGE.
028
029 package org.owasp.html;
030
031 import java.util.LinkedList;
032 import java.util.List;
033 import javax.annotation.Nullable;
034
035 import com.google.common.collect.Lists;
036
037 /**
038 * Consumes an HTML stream, and dispatches events to a policy object which
039 * decides which elements and attributes to allow.
040 */
041 public final class HtmlSanitizer {
042
043 /**
044 * Receives events based on the HTML stream, and applies a policy to decide
045 * what HTML constructs to allow.
046 * Typically, implementations use an {@link HtmlStreamRenderer} to produce
047 * the sanitized output.
048 *
049 * <p>
050 * <b>Implementations of this class are in the TCB.</b></p>
051 */
052 @TCB
053 public interface Policy extends HtmlStreamEventReceiver {
054 /**
055 * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
056 *
057 * @param elementName a normalized (lower-case for non-namespaced names)
058 * element name.
059 * @param attrs a list of alternating attribute name and value pairs.
060 * For efficiency, this list may be mutated by this during this method
061 * call, but ownership reverts to the caller on method exit.
062 * The values are raw -- HTML entities have been decoded.
063 * Specifically, implementations are allowed to use a list iterator
064 * and remove all disallowed attributes, add necessary attributes, and
065 * then pass the list to an {@link HtmlStreamRenderer}.
066 */
067 void openTag(String elementName, List<String> attrs);
068
069 /**
070 * Called when an HTML tag like {@code </foo>} is seen in the input.
071 *
072 * @param elementName a normalized (lower-case for non-namespaced names)
073 * element name.
074 */
075 void closeTag(String elementName);
076
077 /**
078 * Called when textual content is seen.
079 * @param textChunk raw content -- HTML entities have been decoded.
080 */
081 void text(String textChunk);
082 }
083
084 /**
085 * Sanitizes the given HTML by applying the given policy to it.
086 *
087 * <p>
088 * This method is not in the TCB.
089 *
090 * <p>
091 * This method has no return value since policies are assumed to render things
092 * they accept and do nothing on things they reject.
093 * Use {@link HtmlStreamRenderer} to render content to an output buffer.
094 *
095 * @param html A snippet of HTML to sanitize. {@code null} is treated as the
096 * empty string and will not result in a {@code NullPointerException}.
097 * @param policy The Policy that will receive events based on the tokens in
098 * HTML. Typically, this policy ends up routing the events to an
099 * {@link HtmlStreamRenderer} after filtering.
100 * {@link HtmlPolicyBuilder} provides an easy way to create policies.
101 */
102 public static void sanitize(@Nullable String html, final Policy policy) {
103 if (html == null) { html = ""; }
104
105 TagBalancingHtmlStreamEventReceiver balancer
106 = new TagBalancingHtmlStreamEventReceiver(policy);
107
108 // According to Opera the maximum table nesting depth seen in the wild is
109 // 795, but 99.99% of documents have a table nesting depth of less than 22.
110 // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
111 // document depth of 90 (incl. HTML & BODY).
112 // Obviously table nesting depth is not the same as whole document depth,
113 // but it is the best proxy I have available.
114 // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
115 // the original data.
116
117 // Webkit defines the maximum HTML parser tree depth as 512.
118 // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
119 // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
120
121 // The first number gives us a lower bound on the nesting depth we allow,
122 // 90, and the second gives us an upper bound: 512.
123 // We do not want to bump right up against that limit.
124 // 256 is substantially larger than the lower bound and well clear of the
125 // upper bound.
126 balancer.setNestingLimit(256);
127
128 balancer.openDocument();
129
130 HtmlLexer lexer = new HtmlLexer(html);
131 // Use a linked list so that policies can use Iterator.remove() in an O(1)
132 // way.
133 LinkedList<String> attrs = Lists.newLinkedList();
134 while (lexer.hasNext()) {
135 HtmlToken token = lexer.next();
136 switch (token.type) {
137 case TEXT:
138 balancer.text(
139 Encoding.decodeHtml(html.substring(token.start, token.end)));
140 break;
141 case UNESCAPED:
142 balancer.text(Encoding.stripBannedCodeunits(
143 html.substring(token.start, token.end)));
144 break;
145 case TAGBEGIN:
146 if (html.charAt(token.start + 1) == '/') { // A close tag.
147 balancer.closeTag(HtmlLexer.canonicalName(
148 html.substring(token.start + 2, token.end)));
149 while (lexer.hasNext()
150 && lexer.next().type != HtmlTokenType.TAGEND) {
151 // skip tokens until we see a ">"
152 }
153 } else {
154 attrs.clear();
155
156 boolean attrsReadyForName = true;
157 tagBody:
158 while (lexer.hasNext()) {
159 HtmlToken tagBodyToken = lexer.next();
160 switch (tagBodyToken.type) {
161 case ATTRNAME:
162 if (!attrsReadyForName) {
163 // Last attribute added was valueless.
164 attrs.add(attrs.getLast());
165 } else {
166 attrsReadyForName = false;
167 }
168 attrs.add(HtmlLexer.canonicalName(
169 html.substring(tagBodyToken.start, tagBodyToken.end)));
170 break;
171 case ATTRVALUE:
172 attrs.add(Encoding.decodeHtml(stripQuotes(
173 html.substring(tagBodyToken.start, tagBodyToken.end))));
174 attrsReadyForName = true;
175 break;
176 case TAGEND:
177 break tagBody;
178 default:
179 // Just drop anything not recognized
180 }
181 }
182 if (!attrsReadyForName) {
183 attrs.add(attrs.getLast());
184 }
185 balancer.openTag(
186 HtmlLexer.canonicalName(
187 html.substring(token.start + 1, token.end)),
188 attrs);
189 }
190 break;
191 default:
192 // Ignore comments, XML prologues, processing instructions, and other
193 // stuff that shouldn't show up in the output.
194 break;
195 }
196 }
197
198 balancer.closeDocument();
199 }
200
201 private static String stripQuotes(String encodedAttributeValue) {
202 int n = encodedAttributeValue.length();
203 if (n > 0) {
204 char last = encodedAttributeValue.charAt(n - 1);
205 if (last == '"' || last == '\'') {
206 int start = 0;
207 if (n != 1 && last == encodedAttributeValue.charAt(0)) {
208 start = 1;
209 } else {
210 // Browsers deal with missing left quotes : <img src=foo.png">
211 // but generally do not deal with missing right : <img src="foo.png>
212 }
213 return encodedAttributeValue.substring(start, n - 1);
214 }
215 }
216 return encodedAttributeValue;
217 }
218
219 }