001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html.examples;
030    
031    import java.io.IOException;
032    import java.io.InputStreamReader;
033    import java.util.regex.Pattern;
034    
035    import org.owasp.html.Handler;
036    import org.owasp.html.HtmlPolicyBuilder;
037    import org.owasp.html.HtmlSanitizer;
038    import org.owasp.html.HtmlStreamRenderer;
039    import org.owasp.html.PolicyFactory;
040    
041    import com.google.common.base.Charsets;
042    import com.google.common.base.Predicate;
043    import com.google.common.base.Throwables;
044    import com.google.common.io.CharStreams;
045    
046    /**
047     * Based on the
048     * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
049     * <blockquote>
050     * eBay (http://www.ebay.com/) is the most popular online auction site in the
051     * universe, as far as I can tell. It is a public site so anyone is allowed to
052     * post listings with rich HTML content. It's not surprising that given the
053     * attractiveness of eBay as a target that it has been subject to a few complex
054     * XSS attacks. Listings are allowed to contain much more rich content than,
055     * say, Slashdot- so it's attack surface is considerably larger. The following
056     * tags appear to be accepted by eBay (they don't publish rules):
057     * {@code <a>},...
058     * </blockquote>
059     */
060    public class EbayPolicyExample {
061    
062      // Some common regular expression definitions.
063    
064      // The 16 colors defined by the HTML Spec (also used by the CSS Spec)
065      private static final Pattern COLOR_NAME = Pattern.compile(
066          "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
067          + "|red|silver|teal|white|yellow)");
068    
069      // HTML/CSS Spec allows 3 or 6 digit hex to specify color
070      private static final Pattern COLOR_CODE = Pattern.compile(
071          "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");
072    
073      private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
074          "[0-9]+%?");
075      private static final Pattern PARAGRAPH = Pattern.compile(
076          "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");
077      private static final Pattern HTML_ID = Pattern.compile(
078          "[a-zA-Z0-9\\:\\-_\\.]+");
079      // force non-empty with a '+' at the end instead of '*'
080      private static final Pattern HTML_TITLE = Pattern.compile(
081          "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");
082      private static final Pattern HTML_CLASS = Pattern.compile(
083          "[a-zA-Z0-9\\s,\\-_]+");
084    
085      private static final Pattern ONSITE_URL = Pattern.compile(
086          "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");
087      private static final Pattern OFFSITE_URL = Pattern.compile(
088          "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
089          + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*\\s*");
090    
091      private static final Pattern NUMBER = Pattern.compile(
092          "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");
093    
094      private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");
095    
096      private static final Pattern ALIGN = Pattern.compile(
097          "(?i)center|left|right|justify|char");
098    
099      private static final Pattern VALIGN = Pattern.compile(
100          "(?i)baseline|bottom|middle|top");
101    
102      private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
103          = new Predicate<String>() {
104            public boolean apply(String s) {
105              return COLOR_NAME.matcher(s).matches()
106                  || COLOR_CODE.matcher(s).matches();
107            }
108          };
109    
110      private static final Predicate<String> ONSITE_OR_OFFSITE_URL
111          = new Predicate<String>() {
112            public boolean apply(String s) {
113              return ONSITE_URL.matcher(s).matches()
114                  || OFFSITE_URL.matcher(s).matches();
115            }
116          };
117    
118      private static final Pattern HISTORY_BACK = Pattern.compile(
119          "(?:javascript:)?\\Qhistory.go(-1)\\E");
120    
121      private static final Pattern ONE_CHAR = Pattern.compile(
122          ".?", Pattern.DOTALL);
123    
124    
125    
126      public static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
127              .allowAttributes("id").matching(HTML_ID).globally()
128              .allowAttributes("class").matching(HTML_CLASS).globally()
129              .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
130                  .globally()
131              .allowAttributes("title").matching(HTML_TITLE).globally()
132              .allowStyling()
133              .allowAttributes("align").matching(ALIGN).onElements("p")
134              .allowAttributes("for").matching(HTML_ID).onElements("label")
135              .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
136                  .onElements("font")
137              .allowAttributes("face")
138                  .matching(Pattern.compile("[\\w;, \\-]+"))
139                  .onElements("font")
140              .allowAttributes("size").matching(NUMBER).onElements("font")
141              .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
142                  .onElements("a")
143              .allowStandardUrlProtocols()
144              .allowAttributes("nohref").onElements("a")
145              .allowAttributes("name").matching(NAME).onElements("a")
146              .allowAttributes(
147                  "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
148                  .matching(HISTORY_BACK).onElements("a")
149              .requireRelNofollowOnLinks()
150              .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
151                  .onElements("img")
152              .allowAttributes("name").matching(NAME)
153                  .onElements("img")
154              .allowAttributes("alt").matching(PARAGRAPH)
155                  .onElements("img")
156              .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
157                  .onElements("img")
158              .allowAttributes("border", "cellpadding", "cellspacing")
159                  .matching(NUMBER).onElements("table")
160              .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
161                  .onElements("table")
162              .allowAttributes("background").matching(ONSITE_URL)
163                  .onElements("table")
164              .allowAttributes("align").matching(ALIGN)
165                  .onElements("table")
166              .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
167                  .onElements("table")
168              .allowAttributes("background").matching(ONSITE_URL)
169                  .onElements("td", "th", "tr")
170              .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
171                  .onElements("td", "th")
172              .allowAttributes("abbr").matching(PARAGRAPH)
173                  .onElements("td", "th")
174              .allowAttributes("axis", "headers").matching(NAME)
175                  .onElements("td", "th")
176              .allowAttributes("scope")
177                  .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
178                  .onElements("td", "th")
179              .allowAttributes("nowrap")
180                  .onElements("td", "th")
181              .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
182                  .onElements("table", "td", "th", "tr", "img")
183              .allowAttributes("align").matching(ALIGN)
184                  .onElements("thead", "tbody", "tfoot", "img",
185                                   "td", "th", "tr", "colgroup", "col")
186              .allowAttributes("valign").matching(VALIGN)
187                  .onElements("thead", "tbody", "tfoot",
188                                  "td", "th", "tr", "colgroup", "col")
189              .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
190                  .onElements("td", "th", "tr", "colgroup", "col",
191                                  "thead", "tbody", "tfoot")
192              .allowAttributes("char").matching(ONE_CHAR)
193                  .onElements("td", "th", "tr", "colgroup", "col",
194                                   "thead", "tbody", "tfoot")
195              .allowAttributes("colspan", "rowspan").matching(NUMBER)
196                  .onElements("td", "th")
197              .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
198                  .onElements("colgroup", "col")
199              .allowElements(
200                  "a", "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
201                  "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
202                  "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
203                  "hr", "br", "col", "font", "map", "span", "div", "img",
204                  "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
205                  "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
206              .toFactory();
207    
208      public static void main(String[] args) throws IOException {
209        if (args.length != 0) {
210          System.err.println("Reads from STDIN and writes to STDOUT");
211          System.exit(-1);
212        }
213        System.err.println("[Reading from STDIN]");
214        // Fetch the HTML to sanitize.
215        String html = CharStreams.toString(
216            new InputStreamReader(System.in, Charsets.UTF_8));
217        // Set up an output channel to receive the sanitized HTML.
218        HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
219            System.out,
220            // Receives notifications on a failure to write to the output.
221            new Handler<IOException>() {
222              public void handle(IOException ex) {
223                Throwables.propagate(ex);  // System.out suppresses IOExceptions
224              }
225            },
226            // Our HTML parser is very lenient, but this receives notifications on
227            // truly bizarre inputs.
228            new Handler<String>() {
229              public void handle(String x) {
230                throw new AssertionError(x);
231              }
232            });
233        // Use the policy defined above to sanitize the HTML.
234        HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
235      }
236    }