001    // Copyright (c) 2013, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html.examples;
030    
031    import java.io.IOException;
032    import java.util.ArrayList;
033    import java.util.List;
034    
035    import org.owasp.html.Handler;
036    import org.owasp.html.HtmlPolicyBuilder;
037    import org.owasp.html.HtmlSanitizer;
038    import org.owasp.html.HtmlStreamEventReceiver;
039    import org.owasp.html.HtmlStreamRenderer;
040    import org.owasp.html.HtmlTextEscapingMode;
041    import org.owasp.html.PolicyFactory;
042    import org.owasp.html.TagBalancingHtmlStreamEventReceiver;
043    
044    /**
045     * Uses a custom event receiver to emit the domain of a link or inline image
046     * after the link or image.
047     */
048    public class UrlTextExample {
049    
050      /** An event receiver that emits the domain of a link or image after it. */
051      static class AppendDomainAfterText implements HtmlStreamEventReceiver {
052        final HtmlStreamEventReceiver underlying;
053        private final List<String> pendingText = new ArrayList<String>();
054    
055        AppendDomainAfterText(HtmlStreamEventReceiver underlying) {
056          this.underlying = underlying;
057        }
058    
059        public void openDocument() {
060          underlying.openDocument();
061        }
062        public void closeDocument() {
063          underlying.closeDocument();
064        }
065        public void openTag(String elementName, List<String> attribs) {
066          underlying.openTag(elementName, attribs);
067    
068          String trailingText = null;
069    
070          if (!attribs.isEmpty()) {
071            // Figure out which attribute we should look for.
072            String urlAttrName = null;
073            if ("a".equals(elementName)) {
074              urlAttrName = "href";
075            } else if ("img".equals(elementName)) {
076              urlAttrName = "src";
077            }
078            if (urlAttrName != null) {
079              // Look for the attribute, and after it for its value.
080              for (int i = 0, n = attribs.size(); i < n; i += 2) {
081                if (urlAttrName.equals(attribs.get(i))) {
082                  String url = attribs.get(i+1).trim();
083                  String domain = domainOf(url);
084                  if (domain != null) {
085                    trailingText = " - " + domain;
086                  }
087                  break;
088                }
089              }
090            }
091          }
092          if (HtmlTextEscapingMode.isVoidElement(elementName)) {
093            // A void element like <img> will not have a corresponding closeTag
094            // call.
095            if (trailingText != null) {
096              text(trailingText);
097            }
098          } else {
099            // Push the trailing text onto a stack so when we see the corresponding
100            // close tag, we can emit the text.
101            pendingText.add(trailingText);
102          }
103        }
104        public void closeTag(String elementName) {
105          underlying.closeTag(elementName);
106          // Pull the trailing text for the recently closed element off the stack.
107          int pendingTextSize = pendingText.size();
108          if (pendingTextSize != 0) {
109            String trailingText = pendingText.remove(pendingTextSize - 1);
110            if (trailingText != null) {
111              text(trailingText);
112            }
113          }
114        }
115        public void text(String text) {
116          underlying.text(text);
117        }
118      }
119    
120      public static void run(Appendable out, String... argv) throws IOException {
121        PolicyFactory policyBuilder = new HtmlPolicyBuilder()
122          .allowAttributes("src").onElements("img")
123          .allowAttributes("href").onElements("a")
124          // Allow some URLs through.
125          .allowStandardUrlProtocols()
126          .allowElements(
127              "a", "label", "h1", "h2", "h3", "h4", "h5", "h6",
128              "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
129              "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
130              "hr", "br", "col", "font", "span", "div", "img",
131              "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
132              "table", "td", "th", "tr", "colgroup", "fieldset", "legend"
133          ).toFactory();
134    
135        StringBuilder htmlOut = new StringBuilder();
136        HtmlSanitizer.Policy policy = policyBuilder.apply(
137            // The tag balancer passes events to AppendDomainAfterText which
138            // assumes that openTag and closeTag events line up with one-another.
139            new TagBalancingHtmlStreamEventReceiver(
140                // The domain appender forwards events to the HTML renderer,
141                new AppendDomainAfterText(
142                    // which puts tags and text onto the output buffer.
143                    HtmlStreamRenderer.create(htmlOut, Handler.DO_NOTHING)
144                )
145            )
146        );
147    
148        for (String input : argv) {
149          HtmlSanitizer.sanitize(input, policy);
150        }
151    
152        out.append(htmlOut);
153      }
154    
155      public static void main(String... argv) throws IOException {
156        run(System.out, argv);
157        System.out.println();
158      }
159    
160    
161      /**
162       * The domain (actually authority component) of an HTML5 URL.
163       * If the input is not hierarchical, then this has undefined behavior.
164       */
165      private static String domainOf(String url) {
166        int start = -1;
167        if (url.startsWith("//")) {
168          start = 2;
169        } else {
170          start = url.indexOf("://");
171          if (start >= 0) { start += 3; }
172        }
173        if (start < 0) { return null; }
174        for (int i = 0; i < start - 3; ++i) {
175          switch (url.charAt(i)) {
176          case '/': case '?': case '#': return null;
177          default: break;
178          }
179        }
180        int end = url.length();
181        for (int i = start; i < end; ++i) {
182          switch (url.charAt(i)) {
183          case '/': case '?': case '#': end = i; break;
184          default: break;
185          }
186        }
187        if (start < end) {
188          return url.substring(start, end);
189        } else {
190          return null;
191        }
192      }
193    }