001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html.examples;
030    
031    import java.io.IOException;
032    import java.io.InputStreamReader;
033    import java.util.regex.Pattern;
034    
035    import org.owasp.html.Handler;
036    import org.owasp.html.HtmlPolicyBuilder;
037    import org.owasp.html.HtmlSanitizer;
038    import org.owasp.html.HtmlStreamEventReceiver;
039    import org.owasp.html.HtmlStreamRenderer;
040    
041    import com.google.common.base.Charsets;
042    import com.google.common.base.Function;
043    import com.google.common.base.Predicate;
044    import com.google.common.base.Throwables;
045    import com.google.common.io.CharStreams;
046    
047    /**
048     * Based on the
049     * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
050     * <blockquote>
051     * eBay (http://www.ebay.com/) is the most popular online auction site in the
052     * universe, as far as I can tell. It is a public site so anyone is allowed to
053     * post listings with rich HTML content. It's not surprising that given the
054     * attractiveness of eBay as a target that it has been subject to a few complex
055     * XSS attacks. Listings are allowed to contain much more rich content than,
056     * say, Slashdot- so it's attack surface is considerably larger. The following
057     * tags appear to be accepted by eBay (they don't publish rules):
058     * {@code <a>},...
059     * </blockquote>
060     */
061    public class EbayPolicyExample {
062    
063      // Some common regular expression definitions.
064    
065      // The 16 colors defined by the HTML Spec (also used by the CSS Spec)
066      private static final Pattern COLOR_NAME = Pattern.compile(
067          "(aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
068          + "|red|silver|teal|white|yellow)");
069    
070      // HTML/CSS Spec allows 3 or 6 digit hex to specify color
071      private static final Pattern COLOR_CODE = Pattern.compile(
072          "(#([0-9a-fA-F]{6}|[0-9a-fA-F]{3}))");
073    
074      private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
075          "(\\d)+(%{0,1})");
076      private static final Pattern PARAGRAPH = Pattern.compile(
077          "([\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");
078      private static final Pattern HTML_ID = Pattern.compile(
079          "[a-zA-Z0-9\\:\\-_\\.]+");
080      // force non-empty with a '+' at the end instead of '*'
081      private static final Pattern HTML_TITLE = Pattern.compile(
082          "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");
083      private static final Pattern HTML_CLASS = Pattern.compile(
084          "[a-zA-Z0-9\\s,\\-_]+");
085    
086      private static final Pattern ONSITE_URL = Pattern.compile(
087          "([\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");
088      private static final Pattern OFFSITE_URL = Pattern.compile(
089          "(\\s)*((ht|f)tp(s?)://|mailto:)[\\p{L}\\p{N}]+"
090          + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*(\\s)*");
091    
092      private static final Pattern NUMBER = Pattern.compile(
093          "(-|\\+)?([0-9]+(\\.[0-9]+)?)");
094    
095      private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");
096    
097      private static final Pattern ALIGN = Pattern.compile(
098          "(?i)cener|left|right|justify|char");
099    
100      private static final Pattern VALIGN = Pattern.compile(
101          "(?i)baseline|bottom|middle|top");
102    
103      private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
104          = new Predicate<String>() {
105            public boolean apply(String s) {
106              return COLOR_NAME.matcher(s).matches()
107                  || COLOR_CODE.matcher(s).matches();
108            }
109          };
110    
111      private static final Predicate<String> ONSITE_OR_OFFSITE_URL
112          = new Predicate<String>() {
113            public boolean apply(String s) {
114              return ONSITE_URL.matcher(s).matches()
115                  || OFFSITE_URL.matcher(s).matches();
116            }
117          };
118    
119      private static final Pattern HISTORY_BACK = Pattern.compile(
120          "(?:javascript:)?\\Qhistory.go(-1)\\E");
121    
122      private static final Pattern ONE_CHAR = Pattern.compile(".?");
123    
124    
125    
126      public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy>
127          POLICY_DEFINITION = new HtmlPolicyBuilder()
128              .allowAttributes("id").matching(HTML_ID).globally()
129              .allowAttributes("class").matching(HTML_CLASS).globally()
130              .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
131                  .globally()
132              .allowAttributes("title").matching(HTML_TITLE).globally()
133              .allowStyling()
134              .allowAttributes("align").matching(ALIGN).onElements("p")
135              .allowAttributes("for").matching(HTML_ID).onElements("label")
136              .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
137                  .onElements("font")
138              .allowAttributes("face")
139                  .matching(Pattern.compile("[\\w;, \\-]+"))
140                  .onElements("font")
141              .allowAttributes("size").matching(NUMBER).onElements("font")
142              .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
143                  .onElements("a")
144              .allowStandardUrlProtocols()
145              .allowAttributes("nohref").onElements("a")
146              .allowAttributes("name").matching(NAME).onElements("a")
147              .allowAttributes(
148                  "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
149                  .matching(HISTORY_BACK).onElements("a")
150              .requireRelNofollowOnLinks()
151              .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
152                  .onElements("img")
153              .allowAttributes("name").matching(NAME)
154                  .onElements("img")
155              .allowAttributes("alt").matching(PARAGRAPH)
156                  .onElements("img")
157              .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
158                  .onElements("img")
159              .allowAttributes("border", "cellpadding", "cellspacing")
160                  .matching(NUMBER).onElements("table")
161              .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
162                  .onElements("table")
163              .allowAttributes("background").matching(ONSITE_URL)
164                  .onElements("table")
165              .allowAttributes("align").matching(ALIGN)
166                  .onElements("table")
167              .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
168                  .onElements("table")
169              .allowAttributes("background").matching(ONSITE_URL)
170                  .onElements("td", "th", "tr")
171              .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
172                  .onElements("td", "th")
173              .allowAttributes("abbr").matching(PARAGRAPH)
174                  .onElements("td", "th")
175              .allowAttributes("axis", "headers").matching(NAME)
176                  .onElements("td", "th")
177              .allowAttributes("scope")
178                  .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
179                  .onElements("td", "th")
180              .allowAttributes("nowrap")
181                  .onElements("td", "th")
182              .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
183                  .onElements("table", "td", "th", "tr", "img")
184              .allowAttributes("align").matching(ALIGN)
185                  .onElements("thead", "tbody", "tfoot", "img",
186                                   "td", "th", "tr", "colgroup", "col")
187              .allowAttributes("valign").matching(VALIGN)
188                  .onElements("thead", "tbody", "tfoot",
189                                  "td", "th", "tr", "colgroup", "col")
190              .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
191                  .onElements("td", "th", "tr", "colgroup", "col",
192                                  "thead", "tbody", "tfoot")
193              .allowAttributes("char").matching(ONE_CHAR)
194                  .onElements("td", "th", "tr", "colgroup", "col",
195                                   "thead", "tbody", "tfoot")
196              .allowAttributes("colspan", "rowspan").matching(NUMBER)
197                  .onElements("td", "th")
198              .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
199                  .onElements("colgroup", "col")
200              .allowElements(
201                  "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
202                  "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
203                  "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
204                  "hr", "br", "col", "font", "map", "span", "div", "img",
205                  "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
206                  "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
207              .toFactory();
208    
209      public static void main(String[] args) throws IOException {
210        if (args.length != 0) {
211          System.err.println("Reads from STDIN and writes to STDOUT");
212          System.exit(-1);
213        }
214        System.err.println("[Reading from STDIN]");
215        // Fetch the HTML to sanitize.
216        String html = CharStreams.toString(
217            new InputStreamReader(System.in, Charsets.UTF_8));
218        // Set up an output channel to receive the sanitized HTML.
219        HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
220            System.out,
221            // Receives notifications on a failure to write to the output.
222            new Handler<IOException>() {
223              public void handle(IOException ex) {
224                Throwables.propagate(ex);  // System.out suppresses IOExceptions
225              }
226            },
227            // Our HTML parser is very lenient, but this receives notifications on
228            // truly bizarre inputs.
229            new Handler<String>() {
230              public void handle(String x) {
231                throw new AssertionError(x);
232              }
233            });
234        // Use the policy defined above to sanitize the HTML.
235        HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
236      }
237    }