J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 1998-2003 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | package javax.swing.text.html.parser; |
| 27 | |
| 28 | import javax.swing.text.SimpleAttributeSet; |
| 29 | import javax.swing.text.html.HTMLEditorKit; |
| 30 | import javax.swing.text.html.HTML; |
| 31 | import javax.swing.text.ChangedCharSetException; |
| 32 | |
| 33 | import java.util.*; |
| 34 | import java.io.*; |
| 35 | import java.net.*; |
| 36 | |
| 37 | /** |
| 38 | * A Parser for HTML Documents (actually, you can specify a DTD, but |
| 39 | * you should really only use this class with the html dtd in swing). |
| 40 | * Reads an InputStream of HTML and |
| 41 | * invokes the appropriate methods in the ParserCallback class. This |
| 42 | * is the default parser used by HTMLEditorKit to parse HTML url's. |
| 43 | * <p>This will message the callback for all valid tags, as well as |
| 44 | * tags that are implied but not explicitly specified. For example, the |
| 45 | * html string (<p>blah) only has a p tag defined. The callback |
| 46 | * will see the following methods: |
| 47 | * <ol><li><i>handleStartTag(html, ...)</i></li> |
| 48 | * <li><i>handleStartTag(head, ...)</i></li> |
| 49 | * <li><i>handleEndTag(head)</i></li> |
| 50 | * <li><i>handleStartTag(body, ...)</i></li> |
| 51 | * <li>handleStartTag(p, ...)</i></li> |
| 52 | * <li>handleText(...)</li> |
| 53 | * <li><i>handleEndTag(p)</i></li> |
| 54 | * <li><i>handleEndTag(body)</i></li> |
| 55 | * <li><i>handleEndTag(html)</i></li> |
| 56 | * </ol> |
| 57 | * The items in <i>italic</i> are implied, that is, although they were not |
| 58 | * explicitly specified, to be correct html they should have been present |
| 59 | * (head isn't necessary, but it is still generated). For tags that |
| 60 | * are implied, the AttributeSet argument will have a value of |
| 61 | * <code>Boolean.TRUE</code> for the key |
| 62 | * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>. |
| 63 | * <p>HTML.Attributes defines a type safe enumeration of html attributes. |
| 64 | * If an attribute key of a tag is defined in HTML.Attribute, the |
| 65 | * HTML.Attribute will be used as the key, otherwise a String will be used. |
| 66 | * For example <p foo=bar class=neat> has two attributes. foo is |
| 67 | * not defined in HTML.Attribute, where as class is, therefore the |
| 68 | * AttributeSet will have two values in it, HTML.Attribute.CLASS with |
| 69 | * a String value of 'neat' and the String key 'foo' with a String value of |
| 70 | * 'bar'. |
| 71 | * <p>The position argument will indicate the start of the tag, comment |
| 72 | * or text. Similiar to arrays, the first character in the stream has a |
| 73 | * position of 0. For tags that are |
| 74 | * implied the position will indicate |
| 75 | * the location of the next encountered tag. In the first example, |
| 76 | * the implied start body and html tags will have the same position as the |
| 77 | * p tag, and the implied end p, html and body tags will all have the same |
| 78 | * position. |
| 79 | * <p>As html skips whitespace the position for text will be the position |
| 80 | * of the first valid character, eg in the string '\n\n\nblah' |
| 81 | * the text 'blah' will have a position of 3, the newlines are skipped. |
| 82 | * <p> |
| 83 | * For attributes that do not have a value, eg in the html |
| 84 | * string <code><foo blah></code> the attribute <code>blah</code> |
| 85 | * does not have a value, there are two possible values that will be |
| 86 | * placed in the AttributeSet's value: |
| 87 | * <ul> |
| 88 | * <li>If the DTD does not contain an definition for the element, or the |
| 89 | * definition does not have an explicit value then the value in the |
| 90 | * AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>. |
| 91 | * <li>If the DTD contains an explicit value, as in: |
| 92 | * <code><!ATTLIST OPTION selected (selected) #IMPLIED></code> |
| 93 | * this value from the dtd (in this case selected) will be used. |
| 94 | * </ul> |
| 95 | * <p> |
| 96 | * Once the stream has been parsed, the callback is notified of the most |
| 97 | * likely end of line string. The end of line string will be one of |
| 98 | * \n, \r or \r\n, which ever is encountered the most in parsing the |
| 99 | * stream. |
| 100 | * |
| 101 | * @author Sunita Mani |
| 102 | */ |
| 103 | public class DocumentParser extends javax.swing.text.html.parser.Parser { |
| 104 | |
| 105 | private int inbody; |
| 106 | private int intitle; |
| 107 | private int inhead; |
| 108 | private int instyle; |
| 109 | private int inscript; |
| 110 | private boolean seentitle; |
| 111 | private HTMLEditorKit.ParserCallback callback = null; |
| 112 | private boolean ignoreCharSet = false; |
| 113 | private static final boolean debugFlag = false; |
| 114 | |
| 115 | public DocumentParser(DTD dtd) { |
| 116 | super(dtd); |
| 117 | } |
| 118 | |
| 119 | public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException { |
| 120 | this.ignoreCharSet = ignoreCharSet; |
| 121 | this.callback = callback; |
| 122 | parse(in); |
| 123 | // end of line |
| 124 | callback.handleEndOfLineString(getEndOfLineString()); |
| 125 | } |
| 126 | |
| 127 | /** |
| 128 | * Handle Start Tag. |
| 129 | */ |
| 130 | protected void handleStartTag(TagElement tag) { |
| 131 | |
| 132 | Element elem = tag.getElement(); |
| 133 | if (elem == dtd.body) { |
| 134 | inbody++; |
| 135 | } else if (elem == dtd.html) { |
| 136 | } else if (elem == dtd.head) { |
| 137 | inhead++; |
| 138 | } else if (elem == dtd.title) { |
| 139 | intitle++; |
| 140 | } else if (elem == dtd.style) { |
| 141 | instyle++; |
| 142 | } else if (elem == dtd.script) { |
| 143 | inscript++; |
| 144 | } |
| 145 | if (debugFlag) { |
| 146 | if (tag.fictional()) { |
| 147 | debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); |
| 148 | } else { |
| 149 | debug("Start Tag: " + tag.getHTMLTag() + " attributes: " + |
| 150 | getAttributes() + " pos: " + getCurrentPos()); |
| 151 | } |
| 152 | } |
| 153 | if (tag.fictional()) { |
| 154 | SimpleAttributeSet attrs = new SimpleAttributeSet(); |
| 155 | attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, |
| 156 | Boolean.TRUE); |
| 157 | callback.handleStartTag(tag.getHTMLTag(), attrs, |
| 158 | getBlockStartPosition()); |
| 159 | } else { |
| 160 | callback.handleStartTag(tag.getHTMLTag(), getAttributes(), |
| 161 | getBlockStartPosition()); |
| 162 | flushAttributes(); |
| 163 | } |
| 164 | } |
| 165 | |
| 166 | |
| 167 | protected void handleComment(char text[]) { |
| 168 | if (debugFlag) { |
| 169 | debug("comment: ->" + new String(text) + "<-" |
| 170 | + " pos: " + getCurrentPos()); |
| 171 | } |
| 172 | callback.handleComment(text, getBlockStartPosition()); |
| 173 | } |
| 174 | |
| 175 | /** |
| 176 | * Handle Empty Tag. |
| 177 | */ |
| 178 | protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { |
| 179 | |
| 180 | Element elem = tag.getElement(); |
| 181 | if (elem == dtd.meta && !ignoreCharSet) { |
| 182 | SimpleAttributeSet atts = getAttributes(); |
| 183 | if (atts != null) { |
| 184 | String content = (String)atts.getAttribute(HTML.Attribute.CONTENT); |
| 185 | if (content != null) { |
| 186 | if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) { |
| 187 | if (!content.equalsIgnoreCase("text/html") && |
| 188 | !content.equalsIgnoreCase("text/plain")) { |
| 189 | throw new ChangedCharSetException(content, false); |
| 190 | } |
| 191 | } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) { |
| 192 | throw new ChangedCharSetException(content, true); |
| 193 | } |
| 194 | } |
| 195 | } |
| 196 | } |
| 197 | if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) { |
| 198 | if (debugFlag) { |
| 199 | if (tag.fictional()) { |
| 200 | debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); |
| 201 | } else { |
| 202 | debug("Empty Tag: " + tag.getHTMLTag() + " attributes: " |
| 203 | + getAttributes() + " pos: " + getCurrentPos()); |
| 204 | } |
| 205 | } |
| 206 | if (tag.fictional()) { |
| 207 | SimpleAttributeSet attrs = new SimpleAttributeSet(); |
| 208 | attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, |
| 209 | Boolean.TRUE); |
| 210 | callback.handleSimpleTag(tag.getHTMLTag(), attrs, |
| 211 | getBlockStartPosition()); |
| 212 | } else { |
| 213 | callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(), |
| 214 | getBlockStartPosition()); |
| 215 | flushAttributes(); |
| 216 | } |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | /** |
| 221 | * Handle End Tag. |
| 222 | */ |
| 223 | protected void handleEndTag(TagElement tag) { |
| 224 | Element elem = tag.getElement(); |
| 225 | if (elem == dtd.body) { |
| 226 | inbody--; |
| 227 | } else if (elem == dtd.title) { |
| 228 | intitle--; |
| 229 | seentitle = true; |
| 230 | } else if (elem == dtd.head) { |
| 231 | inhead--; |
| 232 | } else if (elem == dtd.style) { |
| 233 | instyle--; |
| 234 | } else if (elem == dtd.script) { |
| 235 | inscript--; |
| 236 | } |
| 237 | if (debugFlag) { |
| 238 | debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); |
| 239 | } |
| 240 | callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition()); |
| 241 | |
| 242 | } |
| 243 | |
| 244 | /** |
| 245 | * Handle Text. |
| 246 | */ |
| 247 | protected void handleText(char data[]) { |
| 248 | if (data != null) { |
| 249 | if (inscript != 0) { |
| 250 | callback.handleComment(data, getBlockStartPosition()); |
| 251 | return; |
| 252 | } |
| 253 | if (inbody != 0 || ((instyle != 0) || |
| 254 | ((intitle != 0) && !seentitle))) { |
| 255 | if (debugFlag) { |
| 256 | debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos()); |
| 257 | } |
| 258 | callback.handleText(data, getBlockStartPosition()); |
| 259 | } |
| 260 | } |
| 261 | } |
| 262 | |
| 263 | /* |
| 264 | * Error handling. |
| 265 | */ |
| 266 | protected void handleError(int ln, String errorMsg) { |
| 267 | if (debugFlag) { |
| 268 | debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos()); |
| 269 | } |
| 270 | /* PENDING: need to improve the error string. */ |
| 271 | callback.handleError(errorMsg, getCurrentPos()); |
| 272 | } |
| 273 | |
| 274 | |
| 275 | /* |
| 276 | * debug messages |
| 277 | */ |
| 278 | private void debug(String msg) { |
| 279 | System.out.println(msg); |
| 280 | } |
| 281 | } |