001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html;
030    
031    import com.google.common.annotations.VisibleForTesting;
032    import java.io.Closeable;
033    import java.io.Flushable;
034    import java.io.IOException;
035    import java.util.Iterator;
036    import java.util.List;
037    import javax.annotation.WillCloseWhenClosed;
038    
039    /**
040     * Given a series of HTML tokens, writes valid, normalized HTML to the output.
041     * The output will have well-defined tag boundaries, but there may be orphaned
042     * or missing close and open tags.
043     * The result of two renderers can always be concatenated to produce a larger
044     * snippet of HTML, but if the first was called with
045     * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not
046     * be interpreted as tags in the concatenated version.
047     */
048    @TCB
049    public class HtmlStreamRenderer implements HtmlStreamEventReceiver {
050    
051      private final Appendable output;
052      private final Handler<? super IOException> ioExHandler;
053      private final Handler<? super String> badHtmlHandler;
054      private String lastTagOpened;
055      private StringBuilder pendingUnescaped;
056      private boolean open;
057    
058      /**
059       * Factory.
060       * @param output the buffer to which HTML is streamed.
061       * @param ioExHandler called with any exception raised by output.
062       * @param badHtmlHandler receives alerts when HTML cannot be rendered because
063       *    there is not valid HTML tree that results from that series of calls.
064       *    E.g. it is not possible to create an HTML {@code <style>} element whose
065       *    textual content is {@code "</style>"}.
066       */
067      public static HtmlStreamRenderer create(
068          @WillCloseWhenClosed Appendable output,
069          Handler<? super IOException> ioExHandler,
070          Handler<? super String> badHtmlHandler) {
071        if (output instanceof Closeable) {
072          return new CloseableHtmlStreamRenderer(
073              output, ioExHandler, badHtmlHandler);
074        } else {
075          return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler);
076        }
077      }
078    
079      /**
080       * Factory.
081       * @param output the buffer to which HTML is streamed.
082       * @param badHtmlHandler receives alerts when HTML cannot be rendered because
083       *    there is not valid HTML tree that results from that series of calls.
084       *    E.g. it is not possible to create an HTML {@code <style>} element whose
085       *    textual content is {@code "</style>"}.
086       */
087      public static HtmlStreamRenderer create(
088          StringBuilder output, Handler<? super String> badHtmlHandler) {
089        // Propagate since StringBuilder should not throw IOExceptions.
090        return create(output, Handler.PROPAGATE, badHtmlHandler);
091      }
092    
093      private HtmlStreamRenderer(
094          Appendable output, Handler<? super IOException> ioExHandler,
095          Handler<? super String> badHtmlHandler) {
096        this.output = output;
097        this.ioExHandler = ioExHandler;
098        this.badHtmlHandler = badHtmlHandler;
099      }
100    
101      /**
102       * Called when the series of calls make no sense.
103       * May be overridden to throw an unchecked throwable, to log, or to take some
104       * other action.
105       *
106       * @param message for human consumption.
107       * @param identifier an HTML identifier associated with the message.
108       */
109      private final void error(String message, CharSequence identifier) {
110        if (ioExHandler != Handler.DO_NOTHING) {   // Avoid string append.
111          badHtmlHandler.handle(message + " : " + identifier);
112        }
113      }
114    
115      /**
116       *
117       */
118      public final void openDocument() throws IllegalStateException {
119        if (open) { throw new IllegalStateException(); }
120        open = true;
121      }
122    
123      public final void closeDocument() throws IllegalStateException {
124        if (!open) { throw new IllegalStateException(); }
125        if (pendingUnescaped != null) {
126          closeTag(lastTagOpened);
127        }
128        open = false;
129        if (output instanceof Flushable) {
130          try {
131            ((Flushable) output).flush();
132          } catch (IOException ex) {
133            ioExHandler.handle(ex);
134          }
135        }
136      }
137    
138      public final boolean isDocumentOpen() {
139        return open;
140      }
141    
142      public final void openTag(String elementName, List<String> attrs) {
143        try {
144          writeOpenTag(elementName, attrs);
145        } catch (IOException ex) {
146          ioExHandler.handle(ex);
147        }
148      }
149    
150      private void writeOpenTag(String elementName, List<? extends String> attrs)
151          throws IOException {
152        if (!open) { throw new IllegalStateException(); }
153        elementName = HtmlLexer.canonicalName(elementName);
154        if (!isValidHtmlName(elementName)) {
155          error("Invalid element name", elementName);
156          return;
157        }
158        if (pendingUnescaped != null) {
159          error("Tag content cannot appear inside CDATA element", elementName);
160          return;
161        }
162    
163        switch (HtmlTextEscapingMode.getModeForTag(elementName)) {
164          case CDATA:
165          case CDATA_SOMETIMES:
166          case PLAIN_TEXT:
167            lastTagOpened = elementName;
168            pendingUnescaped = new StringBuilder();
169            break;
170          default:
171        }
172    
173        output.append('<').append(elementName);
174    
175        for (Iterator<? extends String> attrIt = attrs.iterator();
176             attrIt.hasNext();) {
177          String name = attrIt.next();
178          String value = attrIt.next();
179          name = HtmlLexer.canonicalName(name);
180          if (!isValidHtmlName(name)) {
181            error("Invalid attr name", name);
182            continue;
183          }
184          output.append(' ').append(name).append('=').append('"');
185          escapeHtmlOnto(value, output);
186          output.append('"');
187        }
188    
189        output.append('>');
190      }
191    
192      public final void closeTag(String elementName) {
193        try {
194          writeCloseTag(elementName);
195        } catch (IOException ex) {
196          ioExHandler.handle(ex);
197        }
198      }
199    
200      private final void writeCloseTag(String elementName)
201          throws IOException {
202        if (!open) { throw new IllegalStateException(); }
203        elementName = HtmlLexer.canonicalName(elementName);
204        if (!isValidHtmlName(elementName)) {
205          error("Invalid element name", elementName);
206          return;
207        }
208    
209        if (pendingUnescaped != null) {
210          if (!lastTagOpened.equals(elementName)) {
211            error("Tag content cannot appear inside CDATA element", elementName);
212            return;
213          } else {
214            StringBuilder cdataContent = pendingUnescaped;
215            pendingUnescaped = null;
216            int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent);
217            if (problemIndex == -1) {
218              output.append(cdataContent);
219            } else {
220              error(
221                  "Invalid CDATA text content",
222                  cdataContent.subSequence(
223                      problemIndex,
224                      Math.min(problemIndex + 10, cdataContent.length())));
225              // Still output the close tag.
226            }
227          }
228          if ("plaintext".equals(elementName)) { return; }
229        }
230        output.append("</").append(elementName).append(">");
231      }
232    
233      public final void text(String text) {
234        try {
235          writeText(text);
236        } catch (IOException ex) {
237          ioExHandler.handle(ex);
238        }
239      }
240    
241      private final void writeText(String text) throws IOException {
242        if (!open) { throw new IllegalStateException(); }
243        if (pendingUnescaped != null) {
244          pendingUnescaped.append(text.replaceAll("\0", ""));
245        } else {
246          escapeHtmlOnto(text, output);  // Works for RCDATA.
247        }
248      }
249    
250      private static int checkHtmlCdataCloseable(
251          String localName, StringBuilder sb) {
252        int escapingTextSpanStart = -1;
253        for (int i = 0, n = sb.length(); i < n; ++i) {
254          char ch = sb.charAt(i);
255          switch (ch) {
256            case '<':
257              if (i + 3 < n
258                  && '!' == sb.charAt(i + 1)
259                  && '-' == sb.charAt(i + 2)
260                  && '-' == sb.charAt(i + 3)) {
261                if (escapingTextSpanStart == -1) {
262                  escapingTextSpanStart = i;
263                } else {
264                  return i;
265                }
266              } else if (i + 1 + localName.length() < n
267                         && '/' == sb.charAt(i + 1)
268                         && Strings.regionMatchesIgnoreCase(
269                             sb, i + 2, localName, 0, localName.length())) {
270                // A close tag contained in the content.
271                if (escapingTextSpanStart < 0) {
272                  // We could try some recovery strategies here.
273                  // E.g. prepending "/<!--\n" to sb if "script".equals(localName)
274                  return i;
275                }
276                if (!"script".equals(localName)) {
277                  // Script tags are commonly included inside script tags.
278                  // <script><!--document.write('<script>f()</script>');--></script>
279                  // but this does not happen in other CDATA element types.
280                  // Actually allowing an end tag inside others is problematic.
281                  // Specifically,
282                  // <style><!--</style>-->/* foo */</style>
283                  // displays the text "/* foo */" on some browsers.
284                  return i;
285                }
286              }
287              break;
288            case '>':
289              // From the HTML5 spec:
290              //    The text in style, script, title, and textarea elements must not
291              //    have an escaping text span start that is not followed by an
292              //    escaping text span end.
293              // We look left since the HTML 5 spec allows the escaping text span
294              // end to share dashes with the start.
295              if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) {
296                if (escapingTextSpanStart < 0) { return i - 2; }
297                escapingTextSpanStart = -1;
298              }
299              break;
300          }
301        }
302        if (escapingTextSpanStart >= 0) {
303          // We could try recovery strategies here.
304          // E.g. appending "//-->" to the buffer if "script".equals(localName)
305          return escapingTextSpanStart;
306        }
307        return -1;
308      }
309    
310    
311      @VisibleForTesting
312      static boolean isValidHtmlName(String name) {
313        int n = name.length();
314        if (n == 0) { return false; }
315        if (n > 128) { return false; }
316        boolean isNamespaced = false;
317        for (int i = 0; i < n; ++i) {
318          char ch = name.charAt(i);
319          switch (ch) {
320            case ':':
321              if (isNamespaced) { return false; }
322              isNamespaced = true;
323              if (i == 0 || i + 1 == n) { return false; }
324              break;
325            case '-':
326              if (i == 0 || i + 1 == n) { return false; }
327              break;
328            default:
329              if (ch <= '9') {
330                if (i == 0 || ch < '0') { return false; }
331              } else if ('A' <= ch && ch <= 'z') {
332                if ('Z' < ch && ch < 'a') { return false; }
333              } else {
334                return false;
335              }
336              break;
337          }
338        }
339        return true;
340      }
341    
342      @SuppressWarnings("fallthrough")
343      static void escapeHtmlOnto(String plainText, Appendable output)
344          throws IOException {
345        int n = plainText.length();
346        int pos = 0;
347        for (int i = 0; i < n; ++i) {
348          char ch = plainText.charAt(i);
349          switch (ch) {
350            case '<':
351              output.append(plainText, pos, i).append("&lt;");
352              pos = i + 1;
353              break;
354            case '>':
355              output.append(plainText, pos, i).append("&gt;");
356              pos = i + 1;
357              break;
358            case '&':
359              output.append(plainText, pos, i).append("&amp;");
360              pos = i + 1;
361              break;
362            case '"':
363              output.append(plainText, pos, i).append("&#34;");
364              pos = i + 1;
365              break;
366            case '\r': case '\n': break;
367            default:
368              // Emit supplemental codepoints as entity so that they cannot
369              // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
370              // and get involved in UTF-16/UCS-2 confusion.
371              if (Character.isHighSurrogate(ch) && i + 1 < n) {
372                char next = plainText.charAt(i + 1);
373                if (Character.isLowSurrogate(next)) {
374                  int codepoint = Character.toCodePoint(ch, next);
375                  output.append(plainText, pos, i);
376                  appendNumericEntity(codepoint, output);
377                  ++i;  // Consume high surrogate.
378                  pos = i + 1;
379                  continue;
380                }
381              }
382              if (0x20 <= ch && ch < 0xff00) {
383                // Includes surrogates, so all supplementary codepoints are
384                // rendered raw.
385                continue;
386              }
387              // Is a control character or possible full-width version of a
388              // special character.
389              // FALL-THROUGH
390            case '+':  // UTF-7
391            case '=':  // Special in attributes.
392            case '@':  // Conditional compilation
393            case '\'': case '`':  // Quoting character
394              output.append(plainText, pos, i);
395              appendNumericEntity(ch, output);
396              pos = i + 1;
397              break;
398            case 0:
399              output.append(plainText, pos, i);
400              pos = i + 1;
401              break;
402          }
403        }
404        output.append(plainText, pos, n);
405      }
406    
407      static void appendNumericEntity(int codepoint, Appendable output)
408         throws IOException {
409        if (codepoint < 100) {
410          output.append("&#");
411          if (codepoint < 10) {
412            output.append((char) ('0' + codepoint));
413          } else {
414            output.append((char) ('0' + (codepoint / 10)));
415            output.append((char) ('0' + (codepoint % 10)));
416          }
417          output.append(";");
418        } else {
419          int nDigits = (codepoint < 0x1000
420                         ? codepoint < 0x100 ? 2 : 3
421                         : (codepoint < 0x10000 ? 4
422                            : codepoint < 0x100000 ? 5 : 6));
423          output.append("&#x");
424          for (int digit = nDigits; --digit >= 0;) {
425            int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
426            output.append(HEX_NUMERAL[hexDigit]);
427          }
428          output.append(";");
429        }
430      }
431    
432      private static final char[] HEX_NUMERAL = {
433        '0', '1', '2', '3', '4', '5', '6', '7',
434        '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
435      };
436    
437    
438      static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer
439          implements Closeable {
440        private final Closeable closeable;
441    
442        CloseableHtmlStreamRenderer(
443            @WillCloseWhenClosed
444            Appendable output, Handler<? super IOException> errorHandler,
445            Handler<? super String> badHtmlHandler) {
446          super(output, errorHandler, badHtmlHandler);
447          this.closeable = (Closeable) output;
448        }
449    
450        public void close() throws IOException {
451          if (isDocumentOpen()) { closeDocument(); }
452          closeable.close();
453        }
454      }
455    }