// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
028 029 package org.owasp.html.examples; 030 031 import java.io.IOException; 032 import java.io.InputStreamReader; 033 import java.util.regex.Pattern; 034 035 import org.owasp.html.Handler; 036 import org.owasp.html.HtmlPolicyBuilder; 037 import org.owasp.html.HtmlSanitizer; 038 import org.owasp.html.HtmlStreamEventReceiver; 039 import org.owasp.html.HtmlStreamRenderer; 040 041 import com.google.common.base.Charsets; 042 import com.google.common.base.Function; 043 import com.google.common.base.Predicate; 044 import com.google.common.base.Throwables; 045 import com.google.common.io.CharStreams; 046 047 /** 048 * Based on the 049 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>. 050 * <blockquote> 051 * eBay (http://www.ebay.com/) is the most popular online auction site in the 052 * universe, as far as I can tell. It is a public site so anyone is allowed to 053 * post listings with rich HTML content. It's not surprising that given the 054 * attractiveness of eBay as a target that it has been subject to a few complex 055 * XSS attacks. Listings are allowed to contain much more rich content than, 056 * say, Slashdot- so it's attack surface is considerably larger. The following 057 * tags appear to be accepted by eBay (they don't publish rules): 058 * {@code <a>},... 059 * </blockquote> 060 */ 061 public class EbayPolicyExample { 062 063 // Some common regular expression definitions. 
064 065 // The 16 colors defined by the HTML Spec (also used by the CSS Spec) 066 private static final Pattern COLOR_NAME = Pattern.compile( 067 "(aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple" 068 + "|red|silver|teal|white|yellow)"); 069 070 // HTML/CSS Spec allows 3 or 6 digit hex to specify color 071 private static final Pattern COLOR_CODE = Pattern.compile( 072 "(#([0-9a-fA-F]{6}|[0-9a-fA-F]{3}))"); 073 074 private static final Pattern NUMBER_OR_PERCENT = Pattern.compile( 075 "(\\d)+(%{0,1})"); 076 private static final Pattern PARAGRAPH = Pattern.compile( 077 "([\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*"); 078 private static final Pattern HTML_ID = Pattern.compile( 079 "[a-zA-Z0-9\\:\\-_\\.]+"); 080 // force non-empty with a '+' at the end instead of '*' 081 private static final Pattern HTML_TITLE = Pattern.compile( 082 "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*"); 083 private static final Pattern HTML_CLASS = Pattern.compile( 084 "[a-zA-Z0-9\\s,\\-_]+"); 085 086 private static final Pattern ONSITE_URL = Pattern.compile( 087 "([\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)"); 088 private static final Pattern OFFSITE_URL = Pattern.compile( 089 "(\\s)*((ht|f)tp(s?)://|mailto:)[\\p{L}\\p{N}]+" 090 + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*(\\s)*"); 091 092 private static final Pattern NUMBER = Pattern.compile( 093 "(-|\\+)?([0-9]+(\\.[0-9]+)?)"); 094 095 private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+"); 096 097 private static final Pattern ALIGN = Pattern.compile( 098 "(?i)cener|left|right|justify|char"); 099 100 private static final Pattern VALIGN = Pattern.compile( 101 "(?i)baseline|bottom|middle|top"); 102 103 private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE 104 = new Predicate<String>() { 105 public boolean apply(String s) { 106 return COLOR_NAME.matcher(s).matches() 107 || COLOR_CODE.matcher(s).matches(); 108 } 109 }; 110 111 private static final 
Predicate<String> ONSITE_OR_OFFSITE_URL 112 = new Predicate<String>() { 113 public boolean apply(String s) { 114 return ONSITE_URL.matcher(s).matches() 115 || OFFSITE_URL.matcher(s).matches(); 116 } 117 }; 118 119 private static final Pattern HISTORY_BACK = Pattern.compile( 120 "(?:javascript:)?\\Qhistory.go(-1)\\E"); 121 122 private static final Pattern ONE_CHAR = Pattern.compile(".?"); 123 124 125 126 public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy> 127 POLICY_DEFINITION = new HtmlPolicyBuilder() 128 .allowAttributes("id").matching(HTML_ID).globally() 129 .allowAttributes("class").matching(HTML_CLASS).globally() 130 .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}")) 131 .globally() 132 .allowAttributes("title").matching(HTML_TITLE).globally() 133 .allowStyling() 134 .allowAttributes("align").matching(ALIGN).onElements("p") 135 .allowAttributes("for").matching(HTML_ID).onElements("label") 136 .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE) 137 .onElements("font") 138 .allowAttributes("face") 139 .matching(Pattern.compile("[\\w;, \\-]+")) 140 .onElements("font") 141 .allowAttributes("size").matching(NUMBER).onElements("font") 142 .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL) 143 .onElements("a") 144 .allowStandardUrlProtocols() 145 .allowAttributes("nohref").onElements("a") 146 .allowAttributes("name").matching(NAME).onElements("a") 147 .allowAttributes( 148 "onfocus", "onblur", "onclick", "onmousedown", "onmouseup") 149 .matching(HISTORY_BACK).onElements("a") 150 .requireRelNofollowOnLinks() 151 .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL) 152 .onElements("img") 153 .allowAttributes("name").matching(NAME) 154 .onElements("img") 155 .allowAttributes("alt").matching(PARAGRAPH) 156 .onElements("img") 157 .allowAttributes("border", "hspace", "vspace").matching(NUMBER) 158 .onElements("img") 159 .allowAttributes("border", "cellpadding", "cellspacing") 160 
.matching(NUMBER).onElements("table") 161 .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE) 162 .onElements("table") 163 .allowAttributes("background").matching(ONSITE_URL) 164 .onElements("table") 165 .allowAttributes("align").matching(ALIGN) 166 .onElements("table") 167 .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize")) 168 .onElements("table") 169 .allowAttributes("background").matching(ONSITE_URL) 170 .onElements("td", "th", "tr") 171 .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE) 172 .onElements("td", "th") 173 .allowAttributes("abbr").matching(PARAGRAPH) 174 .onElements("td", "th") 175 .allowAttributes("axis", "headers").matching(NAME) 176 .onElements("td", "th") 177 .allowAttributes("scope") 178 .matching(Pattern.compile("(?i)(?:row|col)(?:group)?")) 179 .onElements("td", "th") 180 .allowAttributes("nowrap") 181 .onElements("td", "th") 182 .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT) 183 .onElements("table", "td", "th", "tr", "img") 184 .allowAttributes("align").matching(ALIGN) 185 .onElements("thead", "tbody", "tfoot", "img", 186 "td", "th", "tr", "colgroup", "col") 187 .allowAttributes("valign").matching(VALIGN) 188 .onElements("thead", "tbody", "tfoot", 189 "td", "th", "tr", "colgroup", "col") 190 .allowAttributes("charoff").matching(NUMBER_OR_PERCENT) 191 .onElements("td", "th", "tr", "colgroup", "col", 192 "thead", "tbody", "tfoot") 193 .allowAttributes("char").matching(ONE_CHAR) 194 .onElements("td", "th", "tr", "colgroup", "col", 195 "thead", "tbody", "tfoot") 196 .allowAttributes("colspan", "rowspan").matching(NUMBER) 197 .onElements("td", "th") 198 .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT) 199 .onElements("colgroup", "col") 200 .allowElements( 201 "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6", 202 "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code", 203 "cite", "samp", "sub", "sup", "strike", "center", "blockquote", 204 "hr", "br", "col", 
"font", "map", "span", "div", "img", 205 "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot", 206 "table", "td", "th", "tr", "colgroup", "fieldset", "legend") 207 .toFactory(); 208 209 public static void main(String[] args) throws IOException { 210 if (args.length != 0) { 211 System.err.println("Reads from STDIN and writes to STDOUT"); 212 System.exit(-1); 213 } 214 System.err.println("[Reading from STDIN]"); 215 // Fetch the HTML to sanitize. 216 String html = CharStreams.toString( 217 new InputStreamReader(System.in, Charsets.UTF_8)); 218 // Set up an output channel to receive the sanitized HTML. 219 HtmlStreamRenderer renderer = HtmlStreamRenderer.create( 220 System.out, 221 // Receives notifications on a failure to write to the output. 222 new Handler<IOException>() { 223 public void handle(IOException ex) { 224 Throwables.propagate(ex); // System.out suppresses IOExceptions 225 } 226 }, 227 // Our HTML parser is very lenient, but this receives notifications on 228 // truly bizarre inputs. 229 new Handler<String>() { 230 public void handle(String x) { 231 throw new AssertionError(x); 232 } 233 }); 234 // Use the policy defined above to sanitize the HTML. 235 HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer)); 236 } 237 }