001 // Copyright (c) 2011, Mike Samuel
002 // All rights reserved.
003 //
004 // Redistribution and use in source and binary forms, with or without
005 // modification, are permitted provided that the following conditions
006 // are met:
007 //
008 // Redistributions of source code must retain the above copyright
009 // notice, this list of conditions and the following disclaimer.
010 // Redistributions in binary form must reproduce the above copyright
011 // notice, this list of conditions and the following disclaimer in the
012 // documentation and/or other materials provided with the distribution.
013 // Neither the name of the OWASP nor the names of its contributors may
014 // be used to endorse or promote products derived from this software
015 // without specific prior written permission.
016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027 // POSSIBILITY OF SUCH DAMAGE.
028
029 package org.owasp.html;
030
031 import com.google.common.annotations.VisibleForTesting;
032 import java.io.Closeable;
033 import java.io.Flushable;
034 import java.io.IOException;
035 import java.util.Iterator;
036 import java.util.List;
037 import javax.annotation.WillCloseWhenClosed;
038 import javax.annotation.concurrent.NotThreadSafe;
039
040 /**
041 * Given a series of HTML tokens, writes valid, normalized HTML to the output.
042 * The output will have well-defined tag boundaries, but there may be orphaned
043 * or missing close and open tags.
044 * The result of two renderers can always be concatenated to produce a larger
045 * snippet of HTML, but if the first was called with
046 * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not
047 * be interpreted as tags in the concatenated version.
048 */
049 @TCB
050 @NotThreadSafe
051 public class HtmlStreamRenderer implements HtmlStreamEventReceiver {
052
053 private final Appendable output;
054 private final Handler<? super IOException> ioExHandler;
055 private final Handler<? super String> badHtmlHandler;
056 private String lastTagOpened;
057 private StringBuilder pendingUnescaped;
058 private boolean open;
059
060 /**
061 * Factory.
062 * @param output the buffer to which HTML is streamed.
063 * @param ioExHandler called with any exception raised by output.
064 * @param badHtmlHandler receives alerts when HTML cannot be rendered because
065 * there is not valid HTML tree that results from that series of calls.
066 * E.g. it is not possible to create an HTML {@code <style>} element whose
067 * textual content is {@code "</style>"}.
068 */
069 public static HtmlStreamRenderer create(
070 @WillCloseWhenClosed Appendable output,
071 Handler<? super IOException> ioExHandler,
072 Handler<? super String> badHtmlHandler) {
073 if (output instanceof Closeable) {
074 return new CloseableHtmlStreamRenderer(
075 output, ioExHandler, badHtmlHandler);
076 } else {
077 return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler);
078 }
079 }
080
081 /**
082 * Factory.
083 * @param output the buffer to which HTML is streamed.
084 * @param badHtmlHandler receives alerts when HTML cannot be rendered because
085 * there is not valid HTML tree that results from that series of calls.
086 * E.g. it is not possible to create an HTML {@code <style>} element whose
087 * textual content is {@code "</style>"}.
088 */
089 public static HtmlStreamRenderer create(
090 StringBuilder output, Handler<? super String> badHtmlHandler) {
091 // Propagate since StringBuilder should not throw IOExceptions.
092 return create(output, Handler.PROPAGATE, badHtmlHandler);
093 }
094
095 private HtmlStreamRenderer(
096 Appendable output, Handler<? super IOException> ioExHandler,
097 Handler<? super String> badHtmlHandler) {
098 this.output = output;
099 this.ioExHandler = ioExHandler;
100 this.badHtmlHandler = badHtmlHandler;
101 }
102
103 /**
104 * Called when the series of calls make no sense.
105 * May be overridden to throw an unchecked throwable, to log, or to take some
106 * other action.
107 *
108 * @param message for human consumption.
109 * @param identifier an HTML identifier associated with the message.
110 */
111 private final void error(String message, CharSequence identifier) {
112 if (badHtmlHandler != Handler.DO_NOTHING) { // Avoid string append.
113 badHtmlHandler.handle(message + " : " + identifier);
114 }
115 }
116
117 public final void openDocument() throws IllegalStateException {
118 if (open) { throw new IllegalStateException(); }
119 open = true;
120 }
121
122 public final void closeDocument() throws IllegalStateException {
123 if (!open) { throw new IllegalStateException(); }
124 if (pendingUnescaped != null) {
125 closeTag(lastTagOpened);
126 }
127 open = false;
128 if (output instanceof Flushable) {
129 try {
130 ((Flushable) output).flush();
131 } catch (IOException ex) {
132 ioExHandler.handle(ex);
133 }
134 }
135 }
136
137 public final boolean isDocumentOpen() {
138 return open;
139 }
140
141 public final void openTag(String elementName, List<String> attrs) {
142 try {
143 writeOpenTag(elementName, attrs);
144 } catch (IOException ex) {
145 ioExHandler.handle(ex);
146 }
147 }
148
149 private void writeOpenTag(String elementName, List<? extends String> attrs)
150 throws IOException {
151 if (!open) { throw new IllegalStateException(); }
152 elementName = safeName(elementName);
153 if (!isValidHtmlName(elementName)) {
154 error("Invalid element name", elementName);
155 return;
156 }
157 if (pendingUnescaped != null) {
158 error("Tag content cannot appear inside CDATA element", elementName);
159 return;
160 }
161
162 switch (HtmlTextEscapingMode.getModeForTag(elementName)) {
163 case CDATA_SOMETIMES:
164 case CDATA:
165 case PLAIN_TEXT:
166 lastTagOpened = elementName;
167 pendingUnescaped = new StringBuilder();
168 break;
169 default:
170 }
171
172 output.append('<').append(elementName);
173
174 for (Iterator<? extends String> attrIt = attrs.iterator();
175 attrIt.hasNext();) {
176 String name = attrIt.next();
177 String value = attrIt.next();
178 name = HtmlLexer.canonicalName(name);
179 if (!isValidHtmlName(name)) {
180 error("Invalid attr name", name);
181 continue;
182 }
183 output.append(' ').append(name).append('=').append('"');
184 Encoding.encodeHtmlOnto(value, output);
185 if (value.indexOf('`') != -1) {
186 // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
187 // values. Given
188 // <div attr="``foo=bar">
189 // we encode ` but if JavaScript does:
190 // nodeA.innerHTML = nodeB.innerHTML;
191 // and nodeB contains the DIV above, then IE8 will produce
192 // <div attr=``foo=bar>
193 // as the value of nodeB.innerHTML and assign it to nodeA.
194 // IE8's HTML parser treats `` as a blank attribute value and foo=bar
195 // becomes a separate attribute.
196 // Adding a space at the end of the attribute prevents this by forcing
197 // IE8 to put double quotes around the attribute when computing
198 // nodeB.innerHTML.
199 output.append(' ');
200 }
201 output.append('"');
202 }
203
204 // Limit our output to the intersection of valid XML and valid HTML5 when
205 // the output contains no special HTML5 elements like <title>, <script>, or
206 // <textarea>.
207 if (HtmlTextEscapingMode.isVoidElement(elementName)) {
208 output.append(" /");
209 }
210
211 output.append('>');
212 }
213
214 public final void closeTag(String elementName) {
215 try {
216 writeCloseTag(safeName(elementName));
217 } catch (IOException ex) {
218 ioExHandler.handle(ex);
219 }
220 }
221
222 private final void writeCloseTag(String elementName)
223 throws IOException {
224 if (!open) { throw new IllegalStateException(); }
225 elementName = HtmlLexer.canonicalName(elementName);
226 if (!isValidHtmlName(elementName)) {
227 error("Invalid element name", elementName);
228 return;
229 }
230
231 if (pendingUnescaped != null) {
232 if (!lastTagOpened.equals(elementName)) {
233 error("Tag content cannot appear inside CDATA element", elementName);
234 return;
235 } else {
236 StringBuilder cdataContent = pendingUnescaped;
237 pendingUnescaped = null;
238 Encoding.stripBannedCodeunits(cdataContent);
239 int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent);
240 if (problemIndex == -1) {
241 output.append(cdataContent);
242 } else {
243 error(
244 "Invalid CDATA text content",
245 cdataContent.subSequence(
246 problemIndex,
247 Math.min(problemIndex + 10, cdataContent.length())));
248 // Still output the close tag.
249 }
250 }
251 if ("plaintext".equals(elementName)) { return; }
252 }
253 output.append("</").append(elementName).append(">");
254 }
255
256 public final void text(String text) {
257 try {
258 writeText(text);
259 } catch (IOException ex) {
260 ioExHandler.handle(ex);
261 }
262 }
263
264 private final void writeText(String text) throws IOException {
265 if (!open) { throw new IllegalStateException(); }
266 if (pendingUnescaped != null) {
267 pendingUnescaped.append(text);
268 } else {
269 Encoding.encodeHtmlOnto(text, output); // Works for RCDATA.
270 }
271 }
272
273 private static int checkHtmlCdataCloseable(
274 String localName, StringBuilder sb) {
275 int escapingTextSpanStart = -1;
276 for (int i = 0, n = sb.length(); i < n; ++i) {
277 char ch = sb.charAt(i);
278 switch (ch) {
279 case '<':
280 if (i + 3 < n
281 && '!' == sb.charAt(i + 1)
282 && '-' == sb.charAt(i + 2)
283 && '-' == sb.charAt(i + 3)) {
284 if (escapingTextSpanStart == -1) {
285 escapingTextSpanStart = i;
286 } else {
287 return i;
288 }
289 } else if (i + 1 + localName.length() < n
290 && '/' == sb.charAt(i + 1)
291 && Strings.regionMatchesIgnoreCase(
292 sb, i + 2, localName, 0, localName.length())) {
293 // A close tag contained in the content.
294 if (escapingTextSpanStart < 0) {
295 // We could try some recovery strategies here.
296 // E.g. prepending "/<!--\n" to sb if "script".equals(localName)
297 return i;
298 }
299 if (!"script".equals(localName)) {
300 // Script tags are commonly included inside script tags.
301 // <script><!--document.write('<script>f()</script>');--></script>
302 // but this does not happen in other CDATA element types.
303 // Actually allowing an end tag inside others is problematic.
304 // Specifically,
305 // <style><!--</style>-->/* foo */</style>
306 // displays the text "/* foo */" on some browsers.
307 return i;
308 }
309 }
310 break;
311 case '>':
312 // From the HTML5 spec:
313 // The text in style, script, title, and textarea elements must not
314 // have an escaping text span start that is not followed by an
315 // escaping text span end.
316 // We look left since the HTML 5 spec allows the escaping text span
317 // end to share dashes with the start.
318 if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) {
319 if (escapingTextSpanStart < 0) { return i - 2; }
320 escapingTextSpanStart = -1;
321 }
322 break;
323 default:
324 break;
325 }
326 }
327 if (escapingTextSpanStart >= 0) {
328 // We could try recovery strategies here.
329 // E.g. appending "//-->" to the buffer if "script".equals(localName)
330 return escapingTextSpanStart;
331 }
332 return -1;
333 }
334
335
336 @VisibleForTesting
337 static boolean isValidHtmlName(String name) {
338 int n = name.length();
339 if (n == 0) { return false; }
340 if (n > 128) { return false; }
341 boolean isNamespaced = false;
342 for (int i = 0; i < n; ++i) {
343 char ch = name.charAt(i);
344 switch (ch) {
345 case ':':
346 if (isNamespaced) { return false; }
347 isNamespaced = true;
348 if (i == 0 || i + 1 == n) { return false; }
349 break;
350 case '-':
351 if (i == 0 || i + 1 == n) { return false; }
352 break;
353 default:
354 if (ch <= '9') {
355 if (i == 0 || ch < '0') { return false; }
356 } else if ('A' <= ch && ch <= 'z') {
357 if ('Z' < ch && ch < 'a') { return false; }
358 } else {
359 return false;
360 }
361 break;
362 }
363 }
364 return true;
365 }
366
367 /**
368 * Canonicalizes the element name and possibly substitutes an alternative
369 * that has more consistent semantics.
370 */
371 static String safeName(String elementName) {
372 elementName = HtmlLexer.canonicalName(elementName);
373
374 // Substitute a reliably non-raw-text element for raw-text and
375 // plain-text elements.
376 switch (elementName.length()) {
377 case 3:
378 if ("xmp".equals(elementName)) { return "pre"; }
379 break;
380 case 7:
381 if ("listing".equals(elementName)) { return "pre"; }
382 break;
383 case 9:
384 if ("plaintext".equals(elementName)) { return "pre"; }
385 break;
386 }
387 return elementName;
388 }
389
390 static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer
391 implements Closeable {
392 private final Closeable closeable;
393
394 CloseableHtmlStreamRenderer(
395 @WillCloseWhenClosed
396 Appendable output, Handler<? super IOException> errorHandler,
397 Handler<? super String> badHtmlHandler) {
398 super(output, errorHandler, badHtmlHandler);
399 this.closeable = (Closeable) output;
400 }
401
402 public void close() throws IOException {
403 if (isDocumentOpen()) { closeDocument(); }
404 closeable.close();
405 }
406 }
407 }