// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
028 029 package org.owasp.html.examples; 030 031 import java.io.IOException; 032 import java.io.InputStreamReader; 033 import java.util.regex.Pattern; 034 035 import org.owasp.html.Handler; 036 import org.owasp.html.HtmlPolicyBuilder; 037 import org.owasp.html.HtmlSanitizer; 038 import org.owasp.html.HtmlStreamEventReceiver; 039 import org.owasp.html.HtmlStreamRenderer; 040 041 import com.google.common.base.Charsets; 042 import com.google.common.base.Function; 043 import com.google.common.base.Throwables; 044 import com.google.common.io.CharStreams; 045 046 /** 047 * Based on the 048 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy Slashdot example</a>. 049 * <blockquote> 050 * Slashdot (http://www.slashdot.org/) is a techie news site that allows users 051 * to respond anonymously to news posts with very limited HTML markup. Now 052 * Slashdot is not only one of the coolest sites around, it's also one that's 053 * been subject to many different successful attacks. Even more unfortunate is 054 * the fact that most of the attacks led users to the infamous goatse.cx picture 055 * (please don't go look it up). The rules for Slashdot are fairly strict: users 056 * can only submit the following HTML tags and no CSS: {@code <b>}, {@code <u>}, 057 * {@code <i>}, {@code <a>}, {@code <blockquote>}. 058 * <br> 059 * Accordingly, we've built a policy file that allows fairly similar 060 * functionality. All text-formatting tags that operate directly on the font, 061 * color or emphasis have been allowed. 062 * </blockquote> 063 */ 064 public class SlashdotPolicyExample { 065 066 /** A policy definition that matches the minimal HTML that Slashdot allows. */ 067 public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy> 068 POLICY_DEFINITION = new HtmlPolicyBuilder() 069 .allowStandardUrlProtocols() 070 // Allow title="..." on any element. 
071 .allowAttributes("title").globally() 072 // Allow href="..." on <a> elements. 073 .allowAttributes("href").onElements("a") 074 // Defeat link spammers. 075 .requireRelNofollowOnLinks() 076 // Allow lang= with an alphabetic value on any element. 077 .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}")) 078 .globally() 079 // The align attribute on <p> elements can have any value below. 080 .allowAttributes("align") 081 .matching(true, "center", "left", "right", "justify", "char") 082 .onElements("p") 083 // These elements are allowed. 084 .allowElements( 085 "a", "p", "div", "i", "b", "em", "blockquote", "tt", "strong", 086 "br", "ul", "ol", "li") 087 // Custom slashdot tags. 088 // These could be rewritten in the sanitizer using an ElementPolicy. 089 .allowElements("quote", "ecode") 090 .toFactory(); 091 092 public static void main(String[] args) throws IOException { 093 if (args.length != 0) { 094 System.err.println("Reads from STDIN and writes to STDOUT"); 095 System.exit(-1); 096 } 097 System.err.println("[Reading from STDIN]"); 098 // Fetch the HTML to sanitize. 099 String html = CharStreams.toString( 100 new InputStreamReader(System.in, Charsets.UTF_8)); 101 // Set up an output channel to receive the sanitized HTML. 102 HtmlStreamRenderer renderer = HtmlStreamRenderer.create( 103 System.out, 104 // Receives notifications on a failure to write to the output. 105 new Handler<IOException>() { 106 public void handle(IOException ex) { 107 Throwables.propagate(ex); // System.out suppresses IOExceptions 108 } 109 }, 110 // Our HTML parser is very lenient, but this receives notifications on 111 // truly bizarre inputs. 112 new Handler<String>() { 113 public void handle(String x) { 114 throw new AssertionError(x); 115 } 116 }); 117 // Use the policy defined above to sanitize the HTML. 118 HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer)); 119 } 120 }