| /* |
| * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. Oracle designates this |
| * particular file as subject to the "Classpath" exception as provided |
| * by Oracle in the LICENSE file that accompanied this code. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| */ |
| |
| package javax.swing.text.html.parser; |
| |
| import javax.swing.text.SimpleAttributeSet; |
| import javax.swing.text.html.HTMLEditorKit; |
| import javax.swing.text.html.HTML; |
| import javax.swing.text.ChangedCharSetException; |
| |
| import java.util.*; |
| import java.io.*; |
| import java.net.*; |
| |
| /** |
| * A Parser for HTML Documents (actually, you can specify a DTD, but |
| * you should really only use this class with the html dtd in swing). |
| * Reads an InputStream of HTML and |
| * invokes the appropriate methods in the ParserCallback class. This |
| * is the default parser used by HTMLEditorKit to parse HTML URLs. |
| * <p>This will message the callback for all valid tags, as well as |
| * tags that are implied but not explicitly specified. For example, the |
| * html string (&lt;p&gt;blah) only has a p tag defined. The callback |
| * will see the following methods: |
| * <ol><li><i>handleStartTag(html, ...)</i></li> |
| * <li><i>handleStartTag(head, ...)</i></li> |
| * <li><i>handleEndTag(head)</i></li> |
| * <li><i>handleStartTag(body, ...)</i></li> |
| * <li><i>handleStartTag(p, ...)</i></li> |
| * <li><i>handleText(...)</i></li> |
| * <li><i>handleEndTag(p)</i></li> |
| * <li><i>handleEndTag(body)</i></li> |
| * <li><i>handleEndTag(html)</i></li> |
| * </ol> |
| * The items in <i>italic</i> are implied, that is, although they were not |
| * explicitly specified, to be correct html they should have been present |
| * (head isn't necessary, but it is still generated). For tags that |
| * are implied, the AttributeSet argument will have a value of |
| * <code>Boolean.TRUE</code> for the key |
| * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>. |
| * <p>HTML.Attribute defines a type safe enumeration of html attributes. |
| * If an attribute key of a tag is defined in HTML.Attribute, the |
| * HTML.Attribute will be used as the key, otherwise a String will be used. |
| * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is |
| * not defined in HTML.Attribute, whereas class is, therefore the |
| * AttributeSet will have two values in it, HTML.Attribute.CLASS with |
| * a String value of 'neat' and the String key 'foo' with a String value of |
| * 'bar'. |
| * <p>The position argument will indicate the start of the tag, comment |
| * or text. Similar to arrays, the first character in the stream has a |
| * position of 0. For tags that are |
| * implied the position will indicate |
| * the location of the next encountered tag. In the first example, |
| * the implied start body and html tags will have the same position as the |
| * p tag, and the implied end p, html and body tags will all have the same |
| * position. |
| * <p>As html skips whitespace the position for text will be the position |
| * of the first valid character, eg in the string '\n\n\nblah' |
| * the text 'blah' will have a position of 3, the newlines are skipped. |
| * <p> |
| * For attributes that do not have a value, eg in the html |
| * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code> |
| * does not have a value, there are two possible values that will be |
| * placed in the AttributeSet's value: |
| * <ul> |
| * <li>If the DTD does not contain a definition for the element, or the |
| * definition does not have an explicit value then the value in the |
| * AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>. |
| * <li>If the DTD contains an explicit value, as in: |
| * <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code> |
| * this value from the dtd (in this case selected) will be used. |
| * </ul> |
| * <p> |
| * Once the stream has been parsed, the callback is notified of the most |
| * likely end of line string. The end of line string will be one of |
| * \n, \r or \r\n, whichever is encountered the most in parsing the |
| * stream. |
| * |
| * @author Sunita Mani |
| */ |
| public class DocumentParser extends javax.swing.text.html.parser.Parser { |
| |
| private int inbody; |
| private int intitle; |
| private int inhead; |
| private int instyle; |
| private int inscript; |
| private boolean seentitle; |
| private HTMLEditorKit.ParserCallback callback = null; |
| private boolean ignoreCharSet = false; |
| private static final boolean debugFlag = false; |
| |
| /** |
| * Creates document parser with the specified {@code dtd}. |
| * |
| * @param dtd the dtd. |
| */ |
| public DocumentParser(DTD dtd) { |
| super(dtd); |
| } |
| |
| /** |
| * Parse an HTML stream, given a DTD. |
| * |
| * @param in the reader to read the source from |
| * @param callback the callback |
| * @param ignoreCharSet if {@code true} the charset is ignored |
| * @throws IOException if an I/O error occurs |
| */ |
| public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException { |
| this.ignoreCharSet = ignoreCharSet; |
| this.callback = callback; |
| parse(in); |
| // end of line |
| callback.handleEndOfLineString(getEndOfLineString()); |
| } |
| |
| /** |
| * Handle Start Tag. |
| */ |
| protected void handleStartTag(TagElement tag) { |
| |
| Element elem = tag.getElement(); |
| if (elem == dtd.body) { |
| inbody++; |
| } else if (elem == dtd.html) { |
| } else if (elem == dtd.head) { |
| inhead++; |
| } else if (elem == dtd.title) { |
| intitle++; |
| } else if (elem == dtd.style) { |
| instyle++; |
| } else if (elem == dtd.script) { |
| inscript++; |
| } |
| if (debugFlag) { |
| if (tag.fictional()) { |
| debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); |
| } else { |
| debug("Start Tag: " + tag.getHTMLTag() + " attributes: " + |
| getAttributes() + " pos: " + getCurrentPos()); |
| } |
| } |
| if (tag.fictional()) { |
| SimpleAttributeSet attrs = new SimpleAttributeSet(); |
| attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, |
| Boolean.TRUE); |
| callback.handleStartTag(tag.getHTMLTag(), attrs, |
| getBlockStartPosition()); |
| } else { |
| callback.handleStartTag(tag.getHTMLTag(), getAttributes(), |
| getBlockStartPosition()); |
| flushAttributes(); |
| } |
| } |
| |
| |
| protected void handleComment(char text[]) { |
| if (debugFlag) { |
| debug("comment: ->" + new String(text) + "<-" |
| + " pos: " + getCurrentPos()); |
| } |
| callback.handleComment(text, getBlockStartPosition()); |
| } |
| |
| /** |
| * Handle Empty Tag. |
| */ |
| protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { |
| |
| Element elem = tag.getElement(); |
| if (elem == dtd.meta && !ignoreCharSet) { |
| SimpleAttributeSet atts = getAttributes(); |
| if (atts != null) { |
| String content = (String)atts.getAttribute(HTML.Attribute.CONTENT); |
| if (content != null) { |
| if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) { |
| if (!content.equalsIgnoreCase("text/html") && |
| !content.equalsIgnoreCase("text/plain")) { |
| throw new ChangedCharSetException(content, false); |
| } |
| } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) { |
| throw new ChangedCharSetException(content, true); |
| } |
| } |
| } |
| } |
| if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) { |
| if (debugFlag) { |
| if (tag.fictional()) { |
| debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); |
| } else { |
| debug("Empty Tag: " + tag.getHTMLTag() + " attributes: " |
| + getAttributes() + " pos: " + getCurrentPos()); |
| } |
| } |
| if (tag.fictional()) { |
| SimpleAttributeSet attrs = new SimpleAttributeSet(); |
| attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, |
| Boolean.TRUE); |
| callback.handleSimpleTag(tag.getHTMLTag(), attrs, |
| getBlockStartPosition()); |
| } else { |
| callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(), |
| getBlockStartPosition()); |
| flushAttributes(); |
| } |
| } |
| } |
| |
| /** |
| * Handle End Tag. |
| */ |
| protected void handleEndTag(TagElement tag) { |
| Element elem = tag.getElement(); |
| if (elem == dtd.body) { |
| inbody--; |
| } else if (elem == dtd.title) { |
| intitle--; |
| seentitle = true; |
| } else if (elem == dtd.head) { |
| inhead--; |
| } else if (elem == dtd.style) { |
| instyle--; |
| } else if (elem == dtd.script) { |
| inscript--; |
| } |
| if (debugFlag) { |
| debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); |
| } |
| callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition()); |
| |
| } |
| |
| /** |
| * Handle Text. |
| */ |
| protected void handleText(char data[]) { |
| if (data != null) { |
| if (inscript != 0) { |
| callback.handleComment(data, getBlockStartPosition()); |
| return; |
| } |
| if (inbody != 0 || ((instyle != 0) || |
| ((intitle != 0) && !seentitle))) { |
| if (debugFlag) { |
| debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos()); |
| } |
| callback.handleText(data, getBlockStartPosition()); |
| } |
| } |
| } |
| |
| /* |
| * Error handling. |
| */ |
| protected void handleError(int ln, String errorMsg) { |
| if (debugFlag) { |
| debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos()); |
| } |
| /* PENDING: need to improve the error string. */ |
| callback.handleError(errorMsg, getCurrentPos()); |
| } |
| |
| |
| /* |
| * debug messages |
| */ |
| private void debug(String msg) { |
| System.out.println(msg); |
| } |
| } |