001 // Copyright (c) 2011, Mike Samuel
002 // All rights reserved.
003 //
004 // Redistribution and use in source and binary forms, with or without
005 // modification, are permitted provided that the following conditions
006 // are met:
007 //
008 // Redistributions of source code must retain the above copyright
009 // notice, this list of conditions and the following disclaimer.
010 // Redistributions in binary form must reproduce the above copyright
011 // notice, this list of conditions and the following disclaimer in the
012 // documentation and/or other materials provided with the distribution.
013 // Neither the name of the OWASP nor the names of its contributors may
014 // be used to endorse or promote products derived from this software
015 // without specific prior written permission.
016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027 // POSSIBILITY OF SUCH DAMAGE.
028
029 package org.owasp.html;
030
031 import javax.annotation.Nullable;
032
033 import com.google.common.collect.ImmutableSet;
034
035 /**
036 * An attribute policy for attributes whose values are URLs that requires that
037 * the value have no protocol or have an allowed protocol.
038 *
039 * <p>
040 * URLs with protocols must match the protocol set passed to the constructor.
041 * URLs without protocols but which specify an origin different from the
042 * containing page (e.g. {@code //example.org}) are only allowed if the
043 * {@link FilterUrlByProtocolAttributePolicy#allowProtocolRelativeUrls policy}
044 * allows both {@code http} and {@code https} which are normally used to serve
045 * HTML.
046 * Same-origin URLs, URLs without any protocol or authority part are always
047 * allowed.
048 * </p>
049 *
050 * <p>
051 * This class assumes that URLs are either hierarchical, or are opaque, but
052 * do not look like they contain an authority portion.
053 * </p>
054 *
055 * @author Mike Samuel <mikesamuel@gmail.com>
056 */
057 @TCB
058 public class FilterUrlByProtocolAttributePolicy implements AttributePolicy {
059 private final ImmutableSet<String> protocols;
060
061 public FilterUrlByProtocolAttributePolicy(
062 Iterable<? extends String> protocols) {
063 this.protocols = ImmutableSet.copyOf(protocols);
064 }
065
066 public @Nullable String apply(
067 String elementName, String attributeName, String s) {
068 protocol_loop:
069 for (int i = 0, n = s.length(); i < n; ++i) {
070 switch (s.charAt(i)) {
071 case '/': case '#': case '?': // No protocol.
072 // Check for domain relative URLs like //www.evil.org/
073 if (s.startsWith("//")
074 // or the protocols by which HTML is normally served are OK.
075 && !allowProtocolRelativeUrls()) {
076 return null;
077 }
078 break protocol_loop;
079 case ':':
080 if (!protocols.contains(s.substring(0, i))) { return null; }
081 break protocol_loop;
082 }
083 }
084 return normalizeUri(s);
085 }
086
087 protected boolean allowProtocolRelativeUrls() {
088 return protocols.contains("http") && protocols.contains("https");
089 }
090
091 /** Percent encodes anything that looks like a colon, or a parenthesis. */
092 static String normalizeUri(String s) {
093 int n = s.length();
094 boolean colonsIrrelevant = false;
095 for (int i = 0; i < n; ++i) {
096 char ch = s.charAt(i);
097 switch (ch) {
098 case '/': case '#': case '?': case ':':
099 colonsIrrelevant = true;
100 break;
101 case '(': case ')': case '\uff1a':
102 StringBuilder sb = new StringBuilder(n + 16);
103 int pos = 0;
104 for (; i < n; ++i) {
105 ch = s.charAt(i);
106 switch (ch) {
107 case '(':
108 sb.append(s, pos, i).append("%28");
109 pos = i + 1;
110 break;
111 case ')':
112 sb.append(s, pos, i).append("%29");
113 pos = i + 1;
114 break;
115 default:
116 if (ch > 0x100 && !colonsIrrelevant) {
117 // Other colon like characters.
118 // TODO: do we need to encode non-colon characters if we're
119 // not dealing with URLs that haven't been copy/pasted into
120 // the URL bar?
121 // Is it safe to assume UTF-8 here?
122 switch (ch) {
123 case '\u0589':
124 sb.append(s, pos, i).append("%d6%89");
125 pos = i + 1;
126 break;
127 case '\u05c3':
128 sb.append(s, pos, i).append("%d7%83");
129 pos = i + 1;
130 break;
131 case '\u2236':
132 sb.append(s, pos, i).append("%e2%88%b6");
133 pos = i + 1;
134 break;
135 case '\uff1a':
136 sb.append(s, pos, i).append("%ef%bc%9a");
137 pos = i + 1;
138 break;
139 }
140 }
141 break;
142 }
143 }
144 return sb.append(s, pos, n).toString();
145 }
146 }
147 return s;
148 }
149
150 }