001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html;
030    
031    import java.util.LinkedList;
032    import java.util.List;
033    import javax.annotation.Nullable;
034    
035    import com.google.common.annotations.VisibleForTesting;
036    import com.google.common.collect.Lists;
037    
038    /**
039     * Consumes an HTML stream, and dispatches events to a policy object which
040     * decides which elements and attributes to allow.
041     */
042    public final class HtmlSanitizer {
043    
044      /**
045       * Receives events based on the HTML stream, and applies a policy to decide
046       * what HTML constructs to allow.
047       * Typically, implementations use an {@link HtmlStreamRenderer} to produce
048       * the sanitized output.
049       *
050       * <p>
051       * <b>Implementations of this class are in the TCB.</b></p>
052       */
053      @TCB
054      public interface Policy extends HtmlStreamEventReceiver {
055        /**
056         * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
057         *
058         * @param elementName a normalized (lower-case for non-namespaced names)
059         *     element name.
060         * @param attrs a list of alternating attribute name and value pairs.
061         *     For efficiency, this list may be mutated by this during this method
062         *     call, but ownership reverts to the caller on method exit.
063         *     The values are raw -- HTML entities have been decoded.
064         *     Specifically, implementations are allowed to use a list iterator
065         *     and remove all disallowed attributes, add necessary attributes, and
066         *     then pass the list to an {@link HtmlStreamRenderer}.
067         */
068        void openTag(String elementName, List<String> attrs);
069    
070        /**
071         * Called when an HTML tag like {@code </foo>} is seen in the input.
072         *
073         * @param elementName a normalized (lower-case for non-namespaced names)
074         *     element name.
075         */
076        void closeTag(String elementName);
077    
078        /**
079         * Called when textual content is seen.
080         * @param textChunk raw content -- HTML entities have been decoded.
081         */
082        void text(String textChunk);
083      }
084    
085      /**
086       * Sanitizes the given HTML by applying the given policy to it.
087       *
088       * <p>
089       * This method is not in the TCB.
090       *
091       * <p>
092       * This method has no return value since policies are assumed to render things
093       * they accept and do nothing on things they reject.
094       * Use {@link HtmlStreamRenderer} to render content to an output buffer.
095       *
096       * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
097       *     empty string and will not result in a {@code NullPointerException}.
098       * @param policy The Policy that will receive events based on the tokens in
099       *     html.  Typically, this policy ends up routing the events to an
100       *     {@link HtmlStreamRenderer} after filtering.
101       *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
102       */
103      public static void sanitize(@Nullable String html, final Policy policy) {
104        if (html == null) { html = ""; }
105    
106        HtmlStreamEventReceiver balancer = new TagBalancingHtmlStreamEventReceiver(
107            policy);
108    
109        balancer.openDocument();
110    
111        HtmlLexer lexer = new HtmlLexer(html);
112        // Use a linked list so that policies can use Iterator.remove() in an O(1)
113        // way.
114        LinkedList<String> attrs = Lists.newLinkedList();
115        while (lexer.hasNext()) {
116          HtmlToken token = lexer.next();
117          switch (token.type) {
118            case TEXT:
119              balancer.text(decodeHtml(html.substring(token.start, token.end)));
120              break;
121            case UNESCAPED:
122              balancer.text(html.substring(token.start, token.end));
123              break;
124            case TAGBEGIN:
125              if (html.charAt(token.start + 1) == '/') {  // A close tag.
126                balancer.closeTag(HtmlLexer.canonicalName(
127                    html.substring(token.start + 2, token.end)));
128                while (lexer.hasNext()
129                       && lexer.next().type != HtmlTokenType.TAGEND) {
130                  // skip tokens until we see a ">"
131                }
132              } else {
133                attrs.clear();
134    
135                boolean attrsReadyForName = true;
136                tagBody:
137                while (lexer.hasNext()) {
138                  HtmlToken tagBodyToken = lexer.next();
139                  switch (tagBodyToken.type) {
140                    case ATTRNAME:
141                      if (!attrsReadyForName) {
142                        // Last attribute added was valueless.
143                        attrs.add(attrs.getLast());
144                      } else {
145                        attrsReadyForName = false;
146                      }
147                      attrs.add(HtmlLexer.canonicalName(
148                          html.substring(tagBodyToken.start, tagBodyToken.end)));
149                      break;
150                    case ATTRVALUE:
151                      attrs.add(decodeHtml(stripQuotes(
152                          html.substring(tagBodyToken.start, tagBodyToken.end))));
153                      attrsReadyForName = true;
154                      break;
155                    case TAGEND:
156                      break tagBody;
157                    default:
158                      // Just drop anything not recognized
159                  }
160                }
161                if (!attrsReadyForName) {
162                  attrs.add(attrs.getLast());
163                }
164                balancer.openTag(
165                    HtmlLexer.canonicalName(
166                        html.substring(token.start + 1, token.end)),
167                    attrs);
168              }
169              break;
170            default:
171              // Ignore comments, directives, and other stuff that shouldn't show
172              // up in the output.
173              break;
174          }
175        }
176    
177        balancer.closeDocument();
178      }
179    
180      private static String stripQuotes(String encodedAttributeValue) {
181        int n = encodedAttributeValue.length();
182        if (n > 0) {
183          char last = encodedAttributeValue.charAt(n - 1);
184          if (last == '"' || last == '\'') {
185            int start = 0;
186            if (n != 1 && last == encodedAttributeValue.charAt(0)) {
187              start = 1;
188            } else {
189              // Browsers deal with missing left quotes : <img src=foo.png">
190              // but generally do not deal with missing right : <img src="foo.png>
191            }
192            return encodedAttributeValue.substring(start, n - 1);
193          }
194        }
195        return encodedAttributeValue;
196      }
197    
198      @VisibleForTesting
199      static String decodeHtml(String s) {
200        int amp = s.indexOf('&');
201        if (amp < 0) { return s; }
202        int pos = 0;
203        int n = s.length();
204        StringBuilder sb = new StringBuilder(n);
205        int end;
206        do {
207          long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
208          end = (int) (endAndCodepoint >>> 32);
209          int codepoint = (int) endAndCodepoint;
210          sb.append(s, pos, amp).appendCodePoint(codepoint);
211          pos = end;
212        } while ((amp = s.indexOf('&', end)) >= 0);
213        return sb.append(s, pos, n).toString();
214      }
215    
216    }