001 // Copyright (c) 2011, Mike Samuel 002 // All rights reserved. 003 // 004 // Redistribution and use in source and binary forms, with or without 005 // modification, are permitted provided that the following conditions 006 // are met: 007 // 008 // Redistributions of source code must retain the above copyright 009 // notice, this list of conditions and the following disclaimer. 010 // Redistributions in binary form must reproduce the above copyright 011 // notice, this list of conditions and the following disclaimer in the 012 // documentation and/or other materials provided with the distribution. 013 // Neither the name of the OWASP nor the names of its contributors may 014 // be used to endorse or promote products derived from this software 015 // without specific prior written permission. 016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 027 // POSSIBILITY OF SUCH DAMAGE. 028 029 package org.owasp.html; 030 031 import java.util.LinkedList; 032 import java.util.List; 033 import javax.annotation.Nullable; 034 035 import com.google.common.annotations.VisibleForTesting; 036 import com.google.common.collect.Lists; 037 038 /** 039 * Consumes an HTML stream, and dispatches events to a policy object which 040 * decides which elements and attributes to allow. 041 */ 042 public final class HtmlSanitizer { 043 044 /** 045 * Receives events based on the HTML stream, and applies a policy to decide 046 * what HTML constructs to allow. 047 * Typically, implementations use an {@link HtmlStreamRenderer} to produce 048 * the sanitized output. 049 * 050 * <p> 051 * <b>Implementations of this class are in the TCB.</b></p> 052 */ 053 @TCB 054 public interface Policy extends HtmlStreamEventReceiver { 055 /** 056 * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input. 057 * 058 * @param elementName a normalized (lower-case for non-namespaced names) 059 * element name. 060 * @param attrs a list of alternating attribute name and value pairs. 061 * For efficiency, this list may be mutated by this during this method 062 * call, but ownership reverts to the caller on method exit. 063 * The values are raw -- HTML entities have been decoded. 064 * Specifically, implementations are allowed to use a list iterator 065 * and remove all disallowed attributes, add necessary attributes, and 066 * then pass the list to an {@link HtmlStreamRenderer}. 067 */ 068 void openTag(String elementName, List<String> attrs); 069 070 /** 071 * Called when an HTML tag like {@code </foo>} is seen in the input. 072 * 073 * @param elementName a normalized (lower-case for non-namespaced names) 074 * element name. 075 */ 076 void closeTag(String elementName); 077 078 /** 079 * Called when textual content is seen. 080 * @param textChunk raw content -- HTML entities have been decoded. 081 */ 082 void text(String textChunk); 083 } 084 085 /** 086 * Sanitizes the given HTML by applying the given policy to it. 087 * 088 * <p> 089 * This method is not in the TCB. 090 * 091 * <p> 092 * This method has no return value since policies are assumed to render things 093 * they accept and do nothing on things they reject. 094 * Use {@link HtmlStreamRenderer} to render content to an output buffer. 095 * 096 * @param html A snippet of HTML to sanitize. {@code null} is treated as the 097 * empty string and will not result in a {@code NullPointerException}. 098 * @param policy The Policy that will receive events based on the tokens in 099 * html. Typically, this policy ends up routing the events to an 100 * {@link HtmlStreamRenderer} after filtering. 101 * {@link HtmlPolicyBuilder} provides an easy way to create policies. 102 */ 103 public static void sanitize(@Nullable String html, final Policy policy) { 104 if (html == null) { html = ""; } 105 106 HtmlStreamEventReceiver balancer = new TagBalancingHtmlStreamEventReceiver( 107 policy); 108 109 balancer.openDocument(); 110 111 HtmlLexer lexer = new HtmlLexer(html); 112 // Use a linked list so that policies can use Iterator.remove() in an O(1) 113 // way. 114 LinkedList<String> attrs = Lists.newLinkedList(); 115 while (lexer.hasNext()) { 116 HtmlToken token = lexer.next(); 117 switch (token.type) { 118 case TEXT: 119 balancer.text(decodeHtml(html.substring(token.start, token.end))); 120 break; 121 case UNESCAPED: 122 balancer.text(html.substring(token.start, token.end)); 123 break; 124 case TAGBEGIN: 125 if (html.charAt(token.start + 1) == '/') { // A close tag. 126 balancer.closeTag(HtmlLexer.canonicalName( 127 html.substring(token.start + 2, token.end))); 128 while (lexer.hasNext() 129 && lexer.next().type != HtmlTokenType.TAGEND) { 130 // skip tokens until we see a ">" 131 } 132 } else { 133 attrs.clear(); 134 135 boolean attrsReadyForName = true; 136 tagBody: 137 while (lexer.hasNext()) { 138 HtmlToken tagBodyToken = lexer.next(); 139 switch (tagBodyToken.type) { 140 case ATTRNAME: 141 if (!attrsReadyForName) { 142 // Last attribute added was valueless. 143 attrs.add(attrs.getLast()); 144 } else { 145 attrsReadyForName = false; 146 } 147 attrs.add(HtmlLexer.canonicalName( 148 html.substring(tagBodyToken.start, tagBodyToken.end))); 149 break; 150 case ATTRVALUE: 151 attrs.add(decodeHtml(stripQuotes( 152 html.substring(tagBodyToken.start, tagBodyToken.end)))); 153 attrsReadyForName = true; 154 break; 155 case TAGEND: 156 break tagBody; 157 default: 158 // Just drop anything not recognized 159 } 160 } 161 if (!attrsReadyForName) { 162 attrs.add(attrs.getLast()); 163 } 164 balancer.openTag( 165 HtmlLexer.canonicalName( 166 html.substring(token.start + 1, token.end)), 167 attrs); 168 } 169 break; 170 default: 171 // Ignore comments, directives, and other stuff that shouldn't show 172 // up in the output. 173 break; 174 } 175 } 176 177 balancer.closeDocument(); 178 } 179 180 private static String stripQuotes(String encodedAttributeValue) { 181 int n = encodedAttributeValue.length(); 182 if (n > 0) { 183 char last = encodedAttributeValue.charAt(n - 1); 184 if (last == '"' || last == '\'') { 185 int start = 0; 186 if (n != 1 && last == encodedAttributeValue.charAt(0)) { 187 start = 1; 188 } else { 189 // Browsers deal with missing left quotes : <img src=foo.png"> 190 // but generally do not deal with missing right : <img src="foo.png> 191 } 192 return encodedAttributeValue.substring(start, n - 1); 193 } 194 } 195 return encodedAttributeValue; 196 } 197 198 @VisibleForTesting 199 static String decodeHtml(String s) { 200 int amp = s.indexOf('&'); 201 if (amp < 0) { return s; } 202 int pos = 0; 203 int n = s.length(); 204 StringBuilder sb = new StringBuilder(n); 205 int end; 206 do { 207 long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n); 208 end = (int) (endAndCodepoint >>> 32); 209 int codepoint = (int) endAndCodepoint; 210 sb.append(s, pos, amp).appendCodePoint(codepoint); 211 pos = end; 212 } while ((amp = s.indexOf('&', end)) >= 0); 213 return sb.append(s, pos, n).toString(); 214 } 215 216 }