// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
028 029 package org.owasp.html; 030 031 import com.google.common.annotations.VisibleForTesting; 032 import java.io.Closeable; 033 import java.io.Flushable; 034 import java.io.IOException; 035 import java.util.Iterator; 036 import java.util.List; 037 import javax.annotation.WillCloseWhenClosed; 038 039 /** 040 * Given a series of HTML tokens, writes valid, normalized HTML to the output. 041 * The output will have well-defined tag boundaries, but there may be orphaned 042 * or missing close and open tags. 043 * The result of two renderers can always be concatenated to produce a larger 044 * snippet of HTML, but if the first was called with 045 * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not 046 * be interpreted as tags in the concatenated version. 047 */ 048 @TCB 049 public class HtmlStreamRenderer implements HtmlStreamEventReceiver { 050 051 private final Appendable output; 052 private final Handler<? super IOException> ioExHandler; 053 private final Handler<? super String> badHtmlHandler; 054 private String lastTagOpened; 055 private StringBuilder pendingUnescaped; 056 private boolean open; 057 058 /** 059 * Factory. 060 * @param output the buffer to which HTML is streamed. 061 * @param ioExHandler called with any exception raised by output. 062 * @param badHtmlHandler receives alerts when HTML cannot be rendered because 063 * there is not valid HTML tree that results from that series of calls. 064 * E.g. it is not possible to create an HTML {@code <style>} element whose 065 * textual content is {@code "</style>"}. 066 */ 067 public static HtmlStreamRenderer create( 068 @WillCloseWhenClosed Appendable output, 069 Handler<? super IOException> ioExHandler, 070 Handler<? 
super String> badHtmlHandler) { 071 if (output instanceof Closeable) { 072 return new CloseableHtmlStreamRenderer( 073 output, ioExHandler, badHtmlHandler); 074 } else { 075 return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler); 076 } 077 } 078 079 /** 080 * Factory. 081 * @param output the buffer to which HTML is streamed. 082 * @param badHtmlHandler receives alerts when HTML cannot be rendered because 083 * there is not valid HTML tree that results from that series of calls. 084 * E.g. it is not possible to create an HTML {@code <style>} element whose 085 * textual content is {@code "</style>"}. 086 */ 087 public static HtmlStreamRenderer create( 088 StringBuilder output, Handler<? super String> badHtmlHandler) { 089 // Propagate since StringBuilder should not throw IOExceptions. 090 return create(output, Handler.PROPAGATE, badHtmlHandler); 091 } 092 093 private HtmlStreamRenderer( 094 Appendable output, Handler<? super IOException> ioExHandler, 095 Handler<? super String> badHtmlHandler) { 096 this.output = output; 097 this.ioExHandler = ioExHandler; 098 this.badHtmlHandler = badHtmlHandler; 099 } 100 101 /** 102 * Called when the series of calls make no sense. 103 * May be overridden to throw an unchecked throwable, to log, or to take some 104 * other action. 105 * 106 * @param message for human consumption. 107 * @param identifier an HTML identifier associated with the message. 108 */ 109 private final void error(String message, CharSequence identifier) { 110 if (ioExHandler != Handler.DO_NOTHING) { // Avoid string append. 
111 badHtmlHandler.handle(message + " : " + identifier); 112 } 113 } 114 115 /** 116 * 117 */ 118 public final void openDocument() throws IllegalStateException { 119 if (open) { throw new IllegalStateException(); } 120 open = true; 121 } 122 123 public final void closeDocument() throws IllegalStateException { 124 if (!open) { throw new IllegalStateException(); } 125 if (pendingUnescaped != null) { 126 closeTag(lastTagOpened); 127 } 128 open = false; 129 if (output instanceof Flushable) { 130 try { 131 ((Flushable) output).flush(); 132 } catch (IOException ex) { 133 ioExHandler.handle(ex); 134 } 135 } 136 } 137 138 public final boolean isDocumentOpen() { 139 return open; 140 } 141 142 public final void openTag(String elementName, List<String> attrs) { 143 try { 144 writeOpenTag(elementName, attrs); 145 } catch (IOException ex) { 146 ioExHandler.handle(ex); 147 } 148 } 149 150 private void writeOpenTag(String elementName, List<? extends String> attrs) 151 throws IOException { 152 if (!open) { throw new IllegalStateException(); } 153 elementName = HtmlLexer.canonicalName(elementName); 154 if (!isValidHtmlName(elementName)) { 155 error("Invalid element name", elementName); 156 return; 157 } 158 if (pendingUnescaped != null) { 159 error("Tag content cannot appear inside CDATA element", elementName); 160 return; 161 } 162 163 switch (HtmlTextEscapingMode.getModeForTag(elementName)) { 164 case CDATA: 165 case CDATA_SOMETIMES: 166 case PLAIN_TEXT: 167 lastTagOpened = elementName; 168 pendingUnescaped = new StringBuilder(); 169 break; 170 default: 171 } 172 173 output.append('<').append(elementName); 174 175 for (Iterator<? 
extends String> attrIt = attrs.iterator(); 176 attrIt.hasNext();) { 177 String name = attrIt.next(); 178 String value = attrIt.next(); 179 name = HtmlLexer.canonicalName(name); 180 if (!isValidHtmlName(name)) { 181 error("Invalid attr name", name); 182 continue; 183 } 184 output.append(' ').append(name).append('=').append('"'); 185 escapeHtmlOnto(value, output); 186 output.append('"'); 187 } 188 189 output.append('>'); 190 } 191 192 public final void closeTag(String elementName) { 193 try { 194 writeCloseTag(elementName); 195 } catch (IOException ex) { 196 ioExHandler.handle(ex); 197 } 198 } 199 200 private final void writeCloseTag(String elementName) 201 throws IOException { 202 if (!open) { throw new IllegalStateException(); } 203 elementName = HtmlLexer.canonicalName(elementName); 204 if (!isValidHtmlName(elementName)) { 205 error("Invalid element name", elementName); 206 return; 207 } 208 209 if (pendingUnescaped != null) { 210 if (!lastTagOpened.equals(elementName)) { 211 error("Tag content cannot appear inside CDATA element", elementName); 212 return; 213 } else { 214 StringBuilder cdataContent = pendingUnescaped; 215 pendingUnescaped = null; 216 int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent); 217 if (problemIndex == -1) { 218 output.append(cdataContent); 219 } else { 220 error( 221 "Invalid CDATA text content", 222 cdataContent.subSequence( 223 problemIndex, 224 Math.min(problemIndex + 10, cdataContent.length()))); 225 // Still output the close tag. 
226 } 227 } 228 if ("plaintext".equals(elementName)) { return; } 229 } 230 output.append("</").append(elementName).append(">"); 231 } 232 233 public final void text(String text) { 234 try { 235 writeText(text); 236 } catch (IOException ex) { 237 ioExHandler.handle(ex); 238 } 239 } 240 241 private final void writeText(String text) throws IOException { 242 if (!open) { throw new IllegalStateException(); } 243 if (pendingUnescaped != null) { 244 pendingUnescaped.append(text.replaceAll("\0", "")); 245 } else { 246 escapeHtmlOnto(text, output); // Works for RCDATA. 247 } 248 } 249 250 private static int checkHtmlCdataCloseable( 251 String localName, StringBuilder sb) { 252 int escapingTextSpanStart = -1; 253 for (int i = 0, n = sb.length(); i < n; ++i) { 254 char ch = sb.charAt(i); 255 switch (ch) { 256 case '<': 257 if (i + 3 < n 258 && '!' == sb.charAt(i + 1) 259 && '-' == sb.charAt(i + 2) 260 && '-' == sb.charAt(i + 3)) { 261 if (escapingTextSpanStart == -1) { 262 escapingTextSpanStart = i; 263 } else { 264 return i; 265 } 266 } else if (i + 1 + localName.length() < n 267 && '/' == sb.charAt(i + 1) 268 && Strings.regionMatchesIgnoreCase( 269 sb, i + 2, localName, 0, localName.length())) { 270 // A close tag contained in the content. 271 if (escapingTextSpanStart < 0) { 272 // We could try some recovery strategies here. 273 // E.g. prepending "/<!--\n" to sb if "script".equals(localName) 274 return i; 275 } 276 if (!"script".equals(localName)) { 277 // Script tags are commonly included inside script tags. 278 // <script><!--document.write('<script>f()</script>');--></script> 279 // but this does not happen in other CDATA element types. 280 // Actually allowing an end tag inside others is problematic. 281 // Specifically, 282 // <style><!--</style>-->/* foo */</style> 283 // displays the text "/* foo */" on some browsers. 
284 return i; 285 } 286 } 287 break; 288 case '>': 289 // From the HTML5 spec: 290 // The text in style, script, title, and textarea elements must not 291 // have an escaping text span start that is not followed by an 292 // escaping text span end. 293 // We look left since the HTML 5 spec allows the escaping text span 294 // end to share dashes with the start. 295 if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) { 296 if (escapingTextSpanStart < 0) { return i - 2; } 297 escapingTextSpanStart = -1; 298 } 299 break; 300 } 301 } 302 if (escapingTextSpanStart >= 0) { 303 // We could try recovery strategies here. 304 // E.g. appending "//-->" to the buffer if "script".equals(localName) 305 return escapingTextSpanStart; 306 } 307 return -1; 308 } 309 310 311 @VisibleForTesting 312 static boolean isValidHtmlName(String name) { 313 int n = name.length(); 314 if (n == 0) { return false; } 315 if (n > 128) { return false; } 316 boolean isNamespaced = false; 317 for (int i = 0; i < n; ++i) { 318 char ch = name.charAt(i); 319 switch (ch) { 320 case ':': 321 if (isNamespaced) { return false; } 322 isNamespaced = true; 323 if (i == 0 || i + 1 == n) { return false; } 324 break; 325 case '-': 326 if (i == 0 || i + 1 == n) { return false; } 327 break; 328 default: 329 if (ch <= '9') { 330 if (i == 0 || ch < '0') { return false; } 331 } else if ('A' <= ch && ch <= 'z') { 332 if ('Z' < ch && ch < 'a') { return false; } 333 } else { 334 return false; 335 } 336 break; 337 } 338 } 339 return true; 340 } 341 342 @SuppressWarnings("fallthrough") 343 static void escapeHtmlOnto(String plainText, Appendable output) 344 throws IOException { 345 int n = plainText.length(); 346 int pos = 0; 347 for (int i = 0; i < n; ++i) { 348 char ch = plainText.charAt(i); 349 switch (ch) { 350 case '<': 351 output.append(plainText, pos, i).append("<"); 352 pos = i + 1; 353 break; 354 case '>': 355 output.append(plainText, pos, i).append(">"); 356 pos = i + 1; 357 break; 358 case '&': 359 
output.append(plainText, pos, i).append("&"); 360 pos = i + 1; 361 break; 362 case '"': 363 output.append(plainText, pos, i).append("""); 364 pos = i + 1; 365 break; 366 case '\r': case '\n': break; 367 default: 368 // Emit supplemental codepoints as entity so that they cannot 369 // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper 370 // and get involved in UTF-16/UCS-2 confusion. 371 if (Character.isHighSurrogate(ch) && i + 1 < n) { 372 char next = plainText.charAt(i + 1); 373 if (Character.isLowSurrogate(next)) { 374 int codepoint = Character.toCodePoint(ch, next); 375 output.append(plainText, pos, i); 376 appendNumericEntity(codepoint, output); 377 ++i; // Consume high surrogate. 378 pos = i + 1; 379 continue; 380 } 381 } 382 if (0x20 <= ch && ch < 0xff00) { 383 // Includes surrogates, so all supplementary codepoints are 384 // rendered raw. 385 continue; 386 } 387 // Is a control character or possible full-width version of a 388 // special character. 389 // FALL-THROUGH 390 case '+': // UTF-7 391 case '=': // Special in attributes. 392 case '@': // Conditional compilation 393 case '\'': case '`': // Quoting character 394 output.append(plainText, pos, i); 395 appendNumericEntity(ch, output); 396 pos = i + 1; 397 break; 398 case 0: 399 output.append(plainText, pos, i); 400 pos = i + 1; 401 break; 402 } 403 } 404 output.append(plainText, pos, n); 405 } 406 407 static void appendNumericEntity(int codepoint, Appendable output) 408 throws IOException { 409 if (codepoint < 100) { 410 output.append("&#"); 411 if (codepoint < 10) { 412 output.append((char) ('0' + codepoint)); 413 } else { 414 output.append((char) ('0' + (codepoint / 10))); 415 output.append((char) ('0' + (codepoint % 10))); 416 } 417 output.append(";"); 418 } else { 419 int nDigits = (codepoint < 0x1000 420 ? codepoint < 0x100 ? 2 : 3 421 : (codepoint < 0x10000 ? 4 422 : codepoint < 0x100000 ? 
5 : 6)); 423 output.append("&#x"); 424 for (int digit = nDigits; --digit >= 0;) { 425 int hexDigit = (codepoint >>> (digit << 2)) & 0xf; 426 output.append(HEX_NUMERAL[hexDigit]); 427 } 428 output.append(";"); 429 } 430 } 431 432 private static final char[] HEX_NUMERAL = { 433 '0', '1', '2', '3', '4', '5', '6', '7', 434 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 435 }; 436 437 438 static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer 439 implements Closeable { 440 private final Closeable closeable; 441 442 CloseableHtmlStreamRenderer( 443 @WillCloseWhenClosed 444 Appendable output, Handler<? super IOException> errorHandler, 445 Handler<? super String> badHtmlHandler) { 446 super(output, errorHandler, badHtmlHandler); 447 this.closeable = (Closeable) output; 448 } 449 450 public void close() throws IOException { 451 if (isDocumentOpen()) { closeDocument(); } 452 closeable.close(); 453 } 454 } 455 }