// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html.examples;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import org.owasp.html.Handler;
import org.owasp.html.HtmlPolicyBuilder;
import org.owasp.html.HtmlSanitizer;
import org.owasp.html.HtmlStreamRenderer;
import org.owasp.html.PolicyFactory;

import com.google.common.base.Charsets;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables;
import com.google.common.io.CharStreams;

/**
 * Based on the
 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
 * <blockquote>
 * eBay (http://www.ebay.com/) is the most popular online auction site in the
 * universe, as far as I can tell. It is a public site so anyone is allowed to
 * post listings with rich HTML content. It's not surprising that given the
 * attractiveness of eBay as a target that it has been subject to a few complex
 * XSS attacks. Listings are allowed to contain much more rich content than,
 * say, Slashdot- so it's attack surface is considerably larger. The following
 * tags appear to be accepted by eBay (they don't publish rules):
 * {@code <a>},...
 * </blockquote>
 *
 * <p>The policy is exposed as {@link #POLICY_DEFINITION}; {@link #main} is a
 * small command-line driver that applies it to standard input.
 */
public class EbayPolicyExample {

  // Some common regular expression definitions.
  // Every pattern below is applied with Matcher.matches(), so it has to
  // match the whole attribute value, not just a substring of it.

  // The 16 colors defined by the HTML Spec (also used by the CSS Spec).
  // "gray" is accepted under both spellings, hence 17 alternatives.
  private static final Pattern COLOR_NAME = Pattern.compile(
      "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
      + "|red|silver|teal|white|yellow)");

  // HTML/CSS Spec allows 3 or 6 digit hex to specify color
  private static final Pattern COLOR_CODE = Pattern.compile(
      "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");

  // An unsigned run of digits, optionally followed by a percent sign.
  private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
      "[0-9]+%?");

  // Free text made of letters, digits, whitespace and a little punctuation,
  // plus two-digit "&NN;" sequences.  May be empty.
  private static final Pattern PARAGRAPH = Pattern.compile(
      "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");

  private static final Pattern HTML_ID = Pattern.compile(
      "[a-zA-Z0-9\\:\\-_\\.]+");

  // As written this accepts the empty string.  To force a non-empty title,
  // replace the '*' at the end with a '+'.
  private static final Pattern HTML_TITLE = Pattern.compile(
      "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");

  private static final Pattern HTML_CLASS = Pattern.compile(
      "[a-zA-Z0-9\\s,\\-_]+");

  // A URL without a scheme: the character class contains no ':' so values
  // such as "javascript:..." cannot match.  The second alternative is a bare
  // "#fragment".
  private static final Pattern ONSITE_URL = Pattern.compile(
      "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");

  // An absolute http, https, ftp, ftps or mailto URL, optionally surrounded
  // by whitespace.
  private static final Pattern OFFSITE_URL = Pattern.compile(
      "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
      + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*\\s*");

  // An optionally signed decimal number; either the integer part or the
  // fractional part may be omitted, but not both.
  private static final Pattern NUMBER = Pattern.compile(
      "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");

  private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");

  private static final Pattern ALIGN = Pattern.compile(
      "(?i)center|left|right|justify|char");

  private static final Pattern VALIGN = Pattern.compile(
      "(?i)baseline|bottom|middle|top");

  // Accepts either a named color or a hex color code.
  private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
      = matchesEither(COLOR_NAME, COLOR_CODE);

  // Accepts either a scheme-less (onsite) URL or an absolute (offsite) URL.
  private static final Predicate<String> ONSITE_OR_OFFSITE_URL
      = matchesEither(ONSITE_URL, OFFSITE_URL);

  // The only event handler body the policy lets through: the literal text
  // "history.go(-1)", optionally prefixed with "javascript:".
  private static final Pattern HISTORY_BACK = Pattern.compile(
      "(?:javascript:)?\\Qhistory.go(-1)\\E");

  // Zero or one character of any kind; DOTALL lets '.' match line
  // terminators too.
  private static final Pattern ONE_CHAR = Pattern.compile(
      ".?", Pattern.DOTALL);

  /**
   * The example policy: a whitelist of elements, and for each attribute the
   * elements it may appear on and the pattern its value must match.
   * Apply it to an output sink with {@code POLICY_DEFINITION.apply(...)} and
   * pass the result to {@link HtmlSanitizer#sanitize}, as {@link #main} does.
   */
  public static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
      // Attributes allowed on every element.
      .allowAttributes("id").matching(HTML_ID).globally()
      .allowAttributes("class").matching(HTML_CLASS).globally()
      .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
          .globally()
      .allowAttributes("title").matching(HTML_TITLE).globally()
      .allowStyling()
      // Paragraphs, labels and fonts.
      .allowAttributes("align").matching(ALIGN).onElements("p")
      .allowAttributes("for").matching(HTML_ID).onElements("label")
      .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
          .onElements("font")
      .allowAttributes("face")
          .matching(Pattern.compile("[\\w;, \\-]+"))
          .onElements("font")
      .allowAttributes("size").matching(NUMBER).onElements("font")
      // Links.
      .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
          .onElements("a")
      .allowStandardUrlProtocols()
      .allowAttributes("nohref").onElements("a")
      .allowAttributes("name").matching(NAME).onElements("a")
      .allowAttributes(
          "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
          .matching(HISTORY_BACK).onElements("a")
      .requireRelNofollowOnLinks()
      // Images.
      .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
          .onElements("img")
      .allowAttributes("name").matching(NAME)
          .onElements("img")
      .allowAttributes("alt").matching(PARAGRAPH)
          .onElements("img")
      .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
          .onElements("img")
      // Tables.
      .allowAttributes("border", "cellpadding", "cellspacing")
          .matching(NUMBER).onElements("table")
      .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
          .onElements("table")
      .allowAttributes("background").matching(ONSITE_URL)
          .onElements("table")
      .allowAttributes("align").matching(ALIGN)
          .onElements("table")
      .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
          .onElements("table")
      // Table rows, cells and column groups.
      .allowAttributes("background").matching(ONSITE_URL)
          .onElements("td", "th", "tr")
      .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
          .onElements("td", "th")
      .allowAttributes("abbr").matching(PARAGRAPH)
          .onElements("td", "th")
      .allowAttributes("axis", "headers").matching(NAME)
          .onElements("td", "th")
      .allowAttributes("scope")
          .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
          .onElements("td", "th")
      .allowAttributes("nowrap")
          .onElements("td", "th")
      .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
          .onElements("table", "td", "th", "tr", "img")
      .allowAttributes("align").matching(ALIGN)
          .onElements("thead", "tbody", "tfoot", "img",
                      "td", "th", "tr", "colgroup", "col")
      .allowAttributes("valign").matching(VALIGN)
          .onElements("thead", "tbody", "tfoot",
                      "td", "th", "tr", "colgroup", "col")
      .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
          .onElements("td", "th", "tr", "colgroup", "col",
                      "thead", "tbody", "tfoot")
      .allowAttributes("char").matching(ONE_CHAR)
          .onElements("td", "th", "tr", "colgroup", "col",
                      "thead", "tbody", "tfoot")
      .allowAttributes("colspan", "rowspan").matching(NUMBER)
          .onElements("td", "th")
      .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
          .onElements("colgroup", "col")
      // The element whitelist.
      .allowElements(
          "a", "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
          "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
          "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
          "hr", "br", "col", "font", "map", "span", "div", "img",
          "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
          "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
      .toFactory();

  /**
   * Reads HTML from standard input as UTF-8, sanitizes it with
   * {@link #POLICY_DEFINITION} and writes the result to standard output.
   *
   * @param args must be empty; otherwise a usage message is printed to
   *     standard error and the JVM exits with status -1.
   * @throws IOException if standard input cannot be read.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 0) {
      System.err.println("Reads from STDIN and writes to STDOUT");
      System.exit(-1);
    }
    System.err.println("[Reading from STDIN]");
    // Fetch the HTML to sanitize.
    String html = CharStreams.toString(
        new InputStreamReader(System.in, Charsets.UTF_8));
    // Set up an output channel to receive the sanitized HTML.
    HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
        System.out,
        // Receives notifications on a failure to write to the output.
        new Handler<IOException>() {
          public void handle(IOException ex) {
            Throwables.propagate(ex);  // System.out suppresses IOExceptions
          }
        },
        // Our HTML parser is very lenient, but this receives notifications on
        // truly bizarre inputs.
        new Handler<String>() {
          public void handle(String x) {
            throw new AssertionError(x);
          }
        });
    // Use the policy defined above to sanitize the HTML.
    HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
  }

  /**
   * Builds a predicate that accepts a string when at least one of the two
   * patterns matches the string in its entirety.
   *
   * @param first the pattern tried first.
   * @param second the pattern tried only when {@code first} does not match.
   * @return a predicate over candidate attribute values.
   */
  private static Predicate<String> matchesEither(
      final Pattern first, final Pattern second) {
    return new Predicate<String>() {
      public boolean apply(String s) {
        return first.matcher(s).matches() || second.matcher(s).matches();
      }
    };
  }
}