| /* |
| * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. Oracle designates this |
| * particular file as subject to the "Classpath" exception as provided |
| * by Oracle in the LICENSE file that accompanied this code. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| */ |
| |
| package javax.swing.text.html.parser; |
| |
| import javax.swing.text.SimpleAttributeSet; |
| import javax.swing.text.html.HTML; |
| import javax.swing.text.ChangedCharSetException; |
| import java.io.*; |
| import java.util.Hashtable; |
| import java.util.Properties; |
| import java.util.Vector; |
| import java.util.Enumeration; |
| import java.net.URL; |
| |
| /** |
| * A simple DTD-driven HTML parser. The parser reads an |
| * HTML file from an InputStream and calls various methods |
| * (which should be overridden in a subclass) when tags and |
| * data are encountered. |
| * <p> |
| * Unfortunately there are many badly implemented HTML parsers |
| * out there, and as a result there are many badly formatted |
| * HTML files. This parser attempts to parse most HTML files. |
| * This means that the implementation sometimes deviates from |
| * the SGML specification in favor of HTML. |
| * <p> |
| * The parser treats \r and \r\n as \n. Newlines after starttags |
| * and before end tags are ignored just as specified in the SGML/HTML |
| * specification. |
| * <p> |
| * The html spec does not specify how spaces are to be coalesced very well. |
| * Specifically, the following scenarios are not discussed (note that a |
| * space should be used here, but I am using &nbsp to force the space to |
| * be displayed): |
| * <p> |
| * '<b>blah <i> <strike> foo' which can be treated as: |
| * '<b>blah <i><strike>foo' |
| * <p>as well as: |
| * '<p><a href="xx"> <em>Using</em></a></p>' |
| * which appears to be treated as: |
| * '<p><a href="xx"><em>Using</em></a></p>' |
| * <p> |
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non-whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
| * |
| * @see DTD |
| * @see TagElement |
| * @see SimpleAttributeSet |
| * @author Arthur van Hoff |
| * @author Sunita Mani |
| */ |
| public |
| class Parser implements DTDConstants { |
| |
// Text pending delivery: handleText(TagElement) copies the first textpos
// chars out to handleText(char[]) / handleTitle and then resets textpos.
private char text[] = new char[1024];
private int textpos = 0;
// Tag most recently recorded by handleText(TagElement) or startTag.
private TagElement last;
// True while a single space is pending output ahead of the next text.
private boolean space;

// Scratch buffer filled by addString and drained by getString/getChars;
// strpos is the number of chars currently held.
private char str[] = new char[128];
private int strpos = 0;

/**
 * The dtd.
 */
protected DTD dtd = null;

// Most recently read character (the lookahead); -1 is treated as EOF.
private int ch;
// Current line number, as reported by getCurrentLine().
private int ln;
// Source of the characters being parsed.
private Reader in;

// Element of the tag on top of the stack, or null when the stack is empty.
private Element recent;
// Linked stack of the currently open tags; the head is the innermost tag.
private TagStack stack;
// Result of the last ignoreElement() check made by legalElementContext.
private boolean skipTag = false;
private TagElement lastFormSent = null;
// Attributes of the current tag; see getAttributes() and flushAttributes().
private SimpleAttributeSet attributes = new SimpleAttributeSet();

// State for <html>, <head> and <body>. Since people like to slap
// together HTML documents without thinking, occasionally they
// have multiple instances of these tags. These booleans track
// the first sightings of these tags so they can be safely ignored
// by the parser if repeated.
private boolean seenHtml = false;
private boolean seenHead = false;
private boolean seenBody = false;

/**
 * The html spec does not specify how spaces are coalesced very well.
 * If strict == false, ignoreSpace is used to try and mimic the behavior
 * of the popular browsers.
 * <p>
 * The problematic scenarios are:
 * {@code '<b>blah <i> <strike> foo'} which can be treated as:
 * {@code '<b>blah <i><strike>foo'}
 * as well as:
 * {@code '<p><a href="xx"> <em>Using</em></a></p>'}
 * which appears to be treated as:
 * {@code '<p><a href="xx"><em>Using</em></a></p>'}
 * <p>
 * When a tag that breaks flow, or trailing whitespace is encountered
 * ignoreSpace is set to true. From then on, all whitespace will be
 * ignored.
 * ignoreSpace will be set back to false the first time a
 * non-whitespace character is encountered. This appears to give
 * behavior closer to the popular browsers.
 */
private boolean ignoreSpace;

/**
 * This flag determines whether or not the Parser will be strict
 * in enforcing SGML compatibility. If false, it will be lenient
 * with certain common classes of erroneous HTML constructs.
 * Strict or not, in either case an error will be recorded.
 *
 */
protected boolean strict = false;


/** Number of \r\n's encountered. */
private int crlfCount;
/** Number of \r's encountered. A \r\n will not increment this. */
private int crCount;
/** Number of \n's encountered. A \r\n will not increment this. */
private int lfCount;

//
// To correctly identify the start of a tag/comment/text we need two
// ivars. Two are needed as handleText isn't invoked until the tag
// after the text has been parsed, that is the parser parses the text,
// then a tag, then invokes handleText followed by handleStart.
//
/** The start position of the current block. Block is overloaded here,
 * it really means the current start position for the current comment,
 * tag, text. Use getBlockStartPosition to access this. */
private int currentBlockStartPos;
/** Start position of the last block. */
private int lastBlockStartPos;
| |
/**
 * Array for mapping numeric references in the range
 * 130-159 to displayable Unicode characters. The entry at index
 * {@code i} is the replacement for reference code {@code 130 + i}.
 * Codes 141-144, 157 and 158 are stored unchanged, i.e. they are
 * not remapped.
 */
private static final char[] cp1252Map = {
    8218,  // &#130; -> U+201A
    402,   // &#131; -> U+0192
    8222,  // &#132; -> U+201E
    8230,  // &#133; -> U+2026
    8224,  // &#134; -> U+2020
    8225,  // &#135; -> U+2021
    710,   // &#136; -> U+02C6
    8240,  // &#137; -> U+2030
    352,   // &#138; -> U+0160
    8249,  // &#139; -> U+2039
    338,   // &#140; -> U+0152
    141,   // &#141; (unchanged)
    142,   // &#142; (unchanged)
    143,   // &#143; (unchanged)
    144,   // &#144; (unchanged)
    8216,  // &#145; -> U+2018
    8217,  // &#146; -> U+2019
    8220,  // &#147; -> U+201C
    8221,  // &#148; -> U+201D
    8226,  // &#149; -> U+2022
    8211,  // &#150; -> U+2013
    8212,  // &#151; -> U+2014
    732,   // &#152; -> U+02DC
    8482,  // &#153; -> U+2122
    353,   // &#154; -> U+0161
    8250,  // &#155; -> U+203A
    339,   // &#156; -> U+0153
    157,   // &#157; (unchanged)
    158,   // &#158; (unchanged)
    376    // &#159; -> U+0178
};
| |
/**
 * Creates a parser that uses the specified {@code dtd}.
 *
 * @param dtd the DTD to store in the {@link #dtd} field
 */
public Parser(DTD dtd) {
    this.dtd = dtd;
}
| |
| |
/**
 * Returns the parser's current line counter, which is advanced each
 * time a line terminator is consumed.
 *
 * @return the line number of the line currently being parsed
 */
protected int getCurrentLine() {
    return ln;
}
| |
| /** |
| * Returns the start position of the current block. Block is |
| * overloaded here, it really means the current start position for |
| * the current comment tag, text, block.... This is provided for |
| * subclassers that wish to know the start of the current block when |
| * called with one of the handleXXX methods. |
| * |
| * @return the start position of the current block |
| */ |
| int getBlockStartPosition() { |
| return Math.max(0, lastBlockStartPos - 1); |
| } |
| |
/**
 * Makes a TagElement. Subclasses may override this to supply their
 * own {@code TagElement} instances.
 *
 * @param elem       the element storing the tag definition
 * @param fictional  the value of the flag "{@code fictional}" to be set for the tag
 *
 * @return the created {@code TagElement}
 */
protected TagElement makeTag(Element elem, boolean fictional) {
    return new TagElement(elem, fictional);
}
| |
/**
 * Makes a non-fictional TagElement by delegating to
 * {@link #makeTag(Element, boolean)} with {@code fictional == false}.
 *
 * @param elem  the element storing the tag definition
 *
 * @return the created {@code TagElement}
 */
protected TagElement makeTag(Element elem) {
    return makeTag(elem, false);
}
| |
/**
 * Returns attributes for the current tag. The parser's own attribute
 * set is returned directly, not a copy.
 *
 * @return {@code SimpleAttributeSet} containing the attributes
 */
protected SimpleAttributeSet getAttributes() {
    return attributes;
}
| |
/**
 * Removes the current attributes, leaving the shared attribute set
 * empty so that it can be reused.
 */
protected void flushAttributes() {
    // Passing the set to itself removes every attribute it holds.
    attributes.removeAttributes(attributes);
}
| |
/**
 * Called when PCDATA is encountered. The default implementation does
 * nothing; subclasses override it to receive the text.
 *
 * @param text  the section text
 */
protected void handleText(char text[]) {
}
| |
/**
 * Called when an HTML title tag is encountered. The default
 * implementation forwards the text to {@link #handleText(char[])}.
 *
 * @param text  the title text
 */
protected void handleTitle(char text[]) {
    // default behavior is to call handleText. Subclasses
    // can override if necessary.
    handleText(text);
}
| |
/**
 * Called when an HTML comment is encountered. The default
 * implementation does nothing; subclasses override it to receive
 * the comment.
 *
 * @param text  the comment being handled
 */
protected void handleComment(char text[]) {
}
| |
| /** |
| * Called when the content terminates without closing the HTML comment. |
| */ |
| protected void handleEOFInComment() { |
| // We've reached EOF. Our recovery strategy is to |
| // see if we have more than one line in the comment; |
| // if so, we pretend that the comment was an unterminated |
| // single line comment, and reparse the lines after the |
| // first line as normal HTML content. |
| |
| int commentEndPos = strIndexOf('\n'); |
| if (commentEndPos >= 0) { |
| handleComment(getChars(0, commentEndPos)); |
| try { |
| in.close(); |
| in = new CharArrayReader(getChars(commentEndPos + 1)); |
| ch = '>'; |
| } catch (IOException e) { |
| error("ioexception"); |
| } |
| |
| resetStrBuffer(); |
| } else { |
| // no newline, so signal an error |
| error("eof.comment"); |
| } |
| } |
| |
/**
 * Called when an empty tag is encountered, i.e. a tag whose element
 * reports {@code isEmpty()}. The default implementation does nothing.
 *
 * @param tag  the tag being handled
 * @throws ChangedCharSetException if the document charset was changed
 */
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
}
| |
/**
 * Called when a start tag is encountered. {@code startTag} invokes
 * this after the tag has been pushed onto the tag stack. The default
 * implementation does nothing.
 *
 * @param tag  the tag being handled
 */
protected void handleStartTag(TagElement tag) {
}
| |
/**
 * Called when an end tag is encountered. {@code endTag} invokes this
 * just before the tag is popped from the tag stack. The default
 * implementation does nothing.
 *
 * @param tag  the tag being handled
 */
protected void handleEndTag(TagElement tag) {
}
| |
/**
 * An error has occurred. The default implementation ignores the
 * error; subclasses override this to record or report it.
 *
 * @param ln   the number of line containing the error
 * @param msg  the error message
 */
protected void handleError(int ln, String msg) {
    /*
    Debugging aid, intentionally disabled:
    Thread.dumpStack();
    System.out.println("**** " + stack);
    System.out.println("line " + ln + ": error: " + msg);
    System.out.println();
    */
}
| |
| /** |
| * Output text. |
| */ |
| void handleText(TagElement tag) { |
| if (tag.breaksFlow()) { |
| space = false; |
| if (!strict) { |
| ignoreSpace = true; |
| } |
| } |
| if (textpos == 0) { |
| if ((!space) || (stack == null) || last.breaksFlow() || |
| !stack.advance(dtd.pcdata)) { |
| last = tag; |
| space = false; |
| lastBlockStartPos = currentBlockStartPos; |
| return; |
| } |
| } |
| if (space) { |
| if (!ignoreSpace) { |
| // enlarge buffer if needed |
| if (textpos + 1 > text.length) { |
| char newtext[] = new char[text.length + 200]; |
| System.arraycopy(text, 0, newtext, 0, text.length); |
| text = newtext; |
| } |
| |
| // output pending space |
| text[textpos++] = ' '; |
| if (!strict && !tag.getElement().isEmpty()) { |
| ignoreSpace = true; |
| } |
| } |
| space = false; |
| } |
| char newtext[] = new char[textpos]; |
| System.arraycopy(text, 0, newtext, 0, textpos); |
| // Handles cases of bad html where the title tag |
| // was getting lost when we did error recovery. |
| if (tag.getElement().getName().equals("title")) { |
| handleTitle(newtext); |
| } else { |
| handleText(newtext); |
| } |
| lastBlockStartPos = currentBlockStartPos; |
| textpos = 0; |
| last = tag; |
| space = false; |
| } |
| |
| /** |
| * Invokes the error handler. |
| * |
| * @param err the error type |
| * @param arg1 the 1st error message argument |
| * @param arg2 the 2nd error message argument |
| * @param arg3 the 3rd error message argument |
| */ |
| protected void error(String err, String arg1, String arg2, |
| String arg3) { |
| handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); |
| } |
| |
/**
 * Invokes the error handler with the 3rd error message argument "?".
 * Delegates to {@link #error(String, String, String, String)}.
 *
 * @param err   the error type
 * @param arg1  the 1st error message argument
 * @param arg2  the 2nd error message argument
 */
protected void error(String err, String arg1, String arg2) {
    error(err, arg1, arg2, "?");
}
| |
/**
 * Invokes the error handler with the 2nd and 3rd error message argument "?".
 * Delegates to {@link #error(String, String, String, String)}.
 *
 * @param err   the error type
 * @param arg1  the 1st error message argument
 */
protected void error(String err, String arg1) {
    error(err, arg1, "?", "?");
}
| |
/**
 * Invokes the error handler with the 1st, 2nd and 3rd error message argument "?".
 * Delegates to {@link #error(String, String, String, String)}.
 *
 * @param err   the error type
 */
protected void error(String err) {
    error(err, "?", "?", "?");
}
| |
| |
| /** |
| * Handle a start tag. The new tag is pushed |
| * onto the tag stack. The attribute list is |
| * checked for required attributes. |
| * |
| * @param tag the tag |
| * @throws ChangedCharSetException if the document charset was changed |
| */ |
| protected void startTag(TagElement tag) throws ChangedCharSetException { |
| Element elem = tag.getElement(); |
| |
| // If the tag is an empty tag and texpos != 0 |
| // this implies that there is text before the |
| // start tag that needs to be processed before |
| // handling the tag. |
| // |
| if (!elem.isEmpty() || |
| ((last != null) && !last.breaksFlow()) || |
| (textpos != 0)) { |
| handleText(tag); |
| } else { |
| // this variable gets updated in handleText(). |
| // Since in this case we do not call handleText() |
| // we need to update it here. |
| // |
| last = tag; |
| // Note that we should really check last.breakFlows before |
| // assuming this should be false. |
| space = false; |
| } |
| lastBlockStartPos = currentBlockStartPos; |
| |
| // check required attributes |
| for (AttributeList a = elem.atts ; a != null ; a = a.next) { |
| if ((a.modifier == REQUIRED) && |
| ((attributes.isEmpty()) || |
| ((!attributes.isDefined(a.name)) && |
| (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { |
| error("req.att ", a.getName(), elem.getName()); |
| } |
| } |
| |
| if (elem.isEmpty()) { |
| handleEmptyTag(tag); |
| /* |
| } else if (elem.getName().equals("form")) { |
| handleStartTag(tag); |
| */ |
| } else { |
| recent = elem; |
| stack = new TagStack(tag, stack); |
| handleStartTag(tag); |
| } |
| } |
| |
| /** |
| * Handle an end tag. The end tag is popped |
| * from the tag stack. |
| * |
| * @param omitted {@code true} if the tag is no actually present in the |
| * document, but is supposed by the parser |
| */ |
| protected void endTag(boolean omitted) { |
| handleText(stack.tag); |
| |
| if (omitted && !stack.elem.omitEnd()) { |
| error("end.missing", stack.elem.getName()); |
| } else if (!stack.terminate()) { |
| error("end.unexpected", stack.elem.getName()); |
| } |
| |
| // handle the tag |
| handleEndTag(stack.tag); |
| stack = stack.next; |
| recent = (stack != null) ? stack.elem : null; |
| } |
| |
| |
| boolean ignoreElement(Element elem) { |
| |
| String stackElement = stack.elem.getName(); |
| String elemName = elem.getName(); |
| /* We ignore all elements that are not valid in the context of |
| a table except <td>, <th> (these we handle in |
| legalElementContext()) and #pcdata. We also ignore the |
| <font> tag in the context of <ul> and <ol> We additonally |
| ignore the <meta> and the <style> tag if the body tag has |
| been seen. **/ |
| if ((elemName.equals("html") && seenHtml) || |
| (elemName.equals("head") && seenHead) || |
| (elemName.equals("body") && seenBody)) { |
| return true; |
| } |
| if (elemName.equals("dt") || elemName.equals("dd")) { |
| TagStack s = stack; |
| while (s != null && !s.elem.getName().equals("dl")) { |
| s = s.next; |
| } |
| if (s == null) { |
| return true; |
| } |
| } |
| |
| if (((stackElement.equals("table")) && |
| (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || |
| ((elemName.equals("font")) && |
| (stackElement.equals("ul") || stackElement.equals("ol"))) || |
| (elemName.equals("meta") && stack != null) || |
| (elemName.equals("style") && seenBody) || |
| (stackElement.equals("table") && elemName.equals("a"))) { |
| return true; |
| } |
| return false; |
| } |
| |
| |
| /** |
| * Marks the first time a tag has been seen in a document |
| * |
| * @param elem the element represented by the tag |
| */ |
| |
| protected void markFirstTime(Element elem) { |
| String elemName = elem.getName(); |
| if (elemName.equals("html")) { |
| seenHtml = true; |
| } else if (elemName.equals("head")) { |
| seenHead = true; |
| } else if (elemName.equals("body")) { |
| if (buf.length == 1) { |
| // Refer to note in definition of buf for details on this. |
| char[] newBuf = new char[256]; |
| |
| newBuf[0] = buf[0]; |
| buf = newBuf; |
| } |
| seenBody = true; |
| } |
| } |
| |
| /** |
| * Create a legal content for an element. |
| */ |
| boolean legalElementContext(Element elem) throws ChangedCharSetException { |
| |
| // System.out.println("-- legalContext -- " + elem); |
| |
| // Deal with the empty stack |
| if (stack == null) { |
| // System.out.println("-- stack is empty"); |
| if (elem != dtd.html) { |
| // System.out.println("-- pushing html"); |
| startTag(makeTag(dtd.html, true)); |
| return legalElementContext(elem); |
| } |
| return true; |
| } |
| |
| // Is it allowed in the current context |
| if (stack.advance(elem)) { |
| // System.out.println("-- legal context"); |
| markFirstTime(elem); |
| return true; |
| } |
| boolean insertTag = false; |
| |
| // The use of all error recovery strategies are contingent |
| // on the value of the strict property. |
| // |
| // These are commonly occurring errors. if insertTag is true, |
| // then we want to adopt an error recovery strategy that |
| // involves attempting to insert an additional tag to |
| // legalize the context. The two errors addressed here |
| // are: |
| // 1) when a <td> or <th> is seen soon after a <table> tag. |
| // In this case we insert a <tr>. |
| // 2) when any other tag apart from a <tr> is seen |
| // in the context of a <tr>. In this case we would |
| // like to add a <td>. If a <tr> is seen within a |
| // <tr> context, then we will close out the current |
| // <tr>. |
| // |
| // This insertion strategy is handled later in the method. |
| // The reason for checking this now, is that in other cases |
| // we would like to apply other error recovery strategies for example |
| // ignoring tags. |
| // |
| // In certain cases it is better to ignore a tag than try to |
| // fix the situation. So the first test is to see if this |
| // is what we need to do. |
| // |
| String stackElemName = stack.elem.getName(); |
| String elemName = elem.getName(); |
| |
| |
| if (!strict && |
| ((stackElemName.equals("table") && elemName.equals("td")) || |
| (stackElemName.equals("table") && elemName.equals("th")) || |
| (stackElemName.equals("tr") && !elemName.equals("tr")))){ |
| insertTag = true; |
| } |
| |
| |
| if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || |
| elem.getName().equals("body"))) { |
| if (skipTag = ignoreElement(elem)) { |
| error("tag.ignore", elem.getName()); |
| return skipTag; |
| } |
| } |
| |
| // Check for anything after the start of the table besides tr, td, th |
| // or caption, and if those aren't there, insert the <tr> and call |
| // legalElementContext again. |
| if (!strict && stackElemName.equals("table") && |
| !elemName.equals("tr") && !elemName.equals("td") && |
| !elemName.equals("th") && !elemName.equals("caption")) { |
| Element e = dtd.getElement("tr"); |
| TagElement t = makeTag(e, true); |
| legalTagContext(t); |
| startTag(t); |
| error("start.missing", elem.getName()); |
| return legalElementContext(elem); |
| } |
| |
| // They try to find a legal context by checking if the current |
| // tag is valid in an enclosing context. If so |
| // close out the tags by outputing end tags and then |
| // insert the current tag. If the tags that are |
| // being closed out do not have an optional end tag |
| // specification in the DTD then an html error is |
| // reported. |
| // |
| if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { |
| for (TagStack s = stack.next ; s != null ; s = s.next) { |
| if (s.advance(elem)) { |
| while (stack != s) { |
| endTag(true); |
| } |
| return true; |
| } |
| if (!s.terminate() || (strict && !s.elem.omitEnd())) { |
| break; |
| } |
| } |
| } |
| |
| // Check if we know what tag is expected next. |
| // If so insert the tag. Report an error if the |
| // tag does not have its start tag spec in the DTD as optional. |
| // |
| Element next = stack.first(); |
| if (next != null && (!strict || next.omitStart()) && |
| !(next==dtd.head && elem==dtd.pcdata) ) { |
| // System.out.println("-- omitting start tag: " + next); |
| TagElement t = makeTag(next, true); |
| legalTagContext(t); |
| startTag(t); |
| if (!next.omitStart()) { |
| error("start.missing", elem.getName()); |
| } |
| return legalElementContext(elem); |
| } |
| |
| |
| // Traverse the list of expected elements and determine if adding |
| // any of these elements would make for a legal context. |
| // |
| |
| if (!strict) { |
| ContentModel content = stack.contentModel(); |
| Vector<Element> elemVec = new Vector<Element>(); |
| if (content != null) { |
| content.getElements(elemVec); |
| for (Element e : elemVec) { |
| // Ensure that this element has not been included as |
| // part of the exclusions in the DTD. |
| // |
| if (stack.excluded(e.getIndex())) { |
| continue; |
| } |
| |
| boolean reqAtts = false; |
| |
| for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { |
| if (a.modifier == REQUIRED) { |
| reqAtts = true; |
| break; |
| } |
| } |
| // Ensure that no tag that has required attributes |
| // gets inserted. |
| // |
| if (reqAtts) { |
| continue; |
| } |
| |
| ContentModel m = e.getContent(); |
| if (m != null && m.first(elem)) { |
| // System.out.println("-- adding a legal tag: " + e); |
| TagElement t = makeTag(e, true); |
| legalTagContext(t); |
| startTag(t); |
| error("start.missing", e.getName()); |
| return legalElementContext(elem); |
| } |
| } |
| } |
| } |
| |
| // Check if the stack can be terminated. If so add the appropriate |
| // end tag. Report an error if the tag being ended does not have its |
| // end tag spec in the DTD as optional. |
| // |
| if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { |
| // System.out.println("-- omitting end tag: " + stack.elem); |
| if (!stack.elem.omitEnd()) { |
| error("end.missing", elem.getName()); |
| } |
| |
| endTag(true); |
| return legalElementContext(elem); |
| } |
| |
| // At this point we know that something is screwed up. |
| return false; |
| } |
| |
| /** |
| * Create a legal context for a tag. |
| */ |
| void legalTagContext(TagElement tag) throws ChangedCharSetException { |
| if (legalElementContext(tag.getElement())) { |
| markFirstTime(tag.getElement()); |
| return; |
| } |
| |
| // Avoid putting a block tag in a flow tag. |
| if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { |
| endTag(true); |
| legalTagContext(tag); |
| return; |
| } |
| |
| // Avoid putting something wierd in the head of the document. |
| for (TagStack s = stack ; s != null ; s = s.next) { |
| if (s.tag.getElement() == dtd.head) { |
| while (stack != s) { |
| endTag(true); |
| } |
| endTag(true); |
| legalTagContext(tag); |
| return; |
| } |
| } |
| |
| // Everything failed |
| error("tag.unexpected", tag.getElement().getName()); |
| } |
| |
| /** |
| * Error context. Something went wrong, make sure we are in |
| * the document's body context |
| */ |
| void errorContext() throws ChangedCharSetException { |
| for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { |
| handleEndTag(stack.tag); |
| } |
| if (stack == null) { |
| legalElementContext(dtd.body); |
| startTag(makeTag(dtd.body, true)); |
| } |
| } |
| |
| /** |
| * Add a char to the string buffer. |
| */ |
| void addString(int c) { |
| if (strpos == str.length) { |
| char newstr[] = new char[str.length + 128]; |
| System.arraycopy(str, 0, newstr, 0, str.length); |
| str = newstr; |
| } |
| str[strpos++] = (char)c; |
| } |
| |
| /** |
| * Get the string that's been accumulated. |
| */ |
| String getString(int pos) { |
| char newStr[] = new char[strpos - pos]; |
| System.arraycopy(str, pos, newStr, 0, strpos - pos); |
| strpos = pos; |
| return new String(newStr); |
| } |
| |
| char[] getChars(int pos) { |
| char newStr[] = new char[strpos - pos]; |
| System.arraycopy(str, pos, newStr, 0, strpos - pos); |
| strpos = pos; |
| return newStr; |
| } |
| |
| char[] getChars(int pos, int endPos) { |
| char newStr[] = new char[endPos - pos]; |
| System.arraycopy(str, pos, newStr, 0, endPos - pos); |
| // REMIND: it's not clear whether this version should set strpos or not |
| // strpos = pos; |
| return newStr; |
| } |
| |
/**
 * Empties the string buffer by resetting its position to zero; the
 * backing array is kept.
 */
void resetStrBuffer() {
    strpos = 0;
}
| |
| int strIndexOf(char target) { |
| for (int i = 0; i < strpos; i++) { |
| if (str[i] == target) { |
| return i; |
| } |
| } |
| |
| return -1; |
| } |
| |
| /** |
| * Skip space. |
| * [5] 297:5 |
| */ |
| void skipSpace() throws IOException { |
| while (true) { |
| switch (ch) { |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| break; |
| |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| break; |
| case ' ': |
| case '\t': |
| ch = readCh(); |
| break; |
| |
| default: |
| return; |
| } |
| } |
| } |
| |
| /** |
| * Parse identifier. Uppercase characters are folded |
| * to lowercase when lower is true. Returns falsed if |
| * no identifier is found. [55] 346:17 |
| */ |
| boolean parseIdentifier(boolean lower) throws IOException { |
| switch (ch) { |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| if (lower) { |
| ch = 'a' + (ch - 'A'); |
| } |
| break; |
| |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| break; |
| |
| default: |
| return false; |
| } |
| |
| while (true) { |
| addString(ch); |
| |
| switch (ch = readCh()) { |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| if (lower) { |
| ch = 'a' + (ch - 'A'); |
| } |
| break; |
| |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| |
| case '.': case '-': |
| |
| case '_': // not officially allowed |
| break; |
| |
| default: |
| return true; |
| } |
| } |
| } |
| |
| /** |
| * Parse an entity reference. [59] 350:17 |
| */ |
| private char[] parseEntityReference() throws IOException { |
| int pos = strpos; |
| |
| if ((ch = readCh()) == '#') { |
| int n = 0; |
| ch = readCh(); |
| if ((ch >= '0') && (ch <= '9') || |
| ch == 'x' || ch == 'X') { |
| |
| if ((ch >= '0') && (ch <= '9')) { |
| // parse decimal reference |
| while ((ch >= '0') && (ch <= '9')) { |
| n = (n * 10) + ch - '0'; |
| ch = readCh(); |
| } |
| } else { |
| // parse hexadecimal reference |
| ch = readCh(); |
| char lch = (char) Character.toLowerCase(ch); |
| while ((lch >= '0') && (lch <= '9') || |
| (lch >= 'a') && (lch <= 'f')) { |
| if (lch >= '0' && lch <= '9') { |
| n = (n * 16) + lch - '0'; |
| } else { |
| n = (n * 16) + lch - 'a' + 10; |
| } |
| ch = readCh(); |
| lch = (char) Character.toLowerCase(ch); |
| } |
| } |
| switch (ch) { |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| break; |
| |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| break; |
| |
| case ';': |
| ch = readCh(); |
| break; |
| } |
| char data[] = mapNumericReference(n); |
| return data; |
| } |
| addString('#'); |
| if (!parseIdentifier(false)) { |
| error("ident.expected"); |
| strpos = pos; |
| char data[] = {'&', '#'}; |
| return data; |
| } |
| } else if (!parseIdentifier(false)) { |
| char data[] = {'&'}; |
| return data; |
| } |
| |
| boolean semicolon = false; |
| |
| switch (ch) { |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| break; |
| |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| break; |
| |
| case ';': |
| semicolon = true; |
| |
| ch = readCh(); |
| break; |
| } |
| |
| String nm = getString(pos); |
| Entity ent = dtd.getEntity(nm); |
| |
| // entities are case sensitive - however if strict |
| // is false then we will try to make a match by |
| // converting the string to all lowercase. |
| // |
| if (!strict && (ent == null)) { |
| ent = dtd.getEntity(nm.toLowerCase()); |
| } |
| if ((ent == null) || !ent.isGeneral()) { |
| |
| if (nm.length() == 0) { |
| error("invalid.entref", nm); |
| return new char[0]; |
| } |
| /* given that there is not a match restore the entity reference */ |
| String str = "&" + nm + (semicolon ? ";" : ""); |
| |
| char b[] = new char[str.length()]; |
| str.getChars(0, b.length, b, 0); |
| return b; |
| } |
| return ent.getData(); |
| } |
| |
| /** |
| * Converts numeric character reference to char array. |
| * |
| * Normally the code in a reference should be always converted |
| * to the Unicode character with the same code, but due to |
| * wide usage of Cp1252 charset most browsers map numeric references |
| * in the range 130-159 (which are control chars in Unicode set) |
| * to displayable characters with other codes. |
| * |
| * @param c the code of numeric character reference. |
| * @return a char array corresponding to the reference code. |
| */ |
| private char[] mapNumericReference(int c) { |
| char[] data; |
| if (c >= 0xffff) { // outside unicode BMP. |
| try { |
| data = Character.toChars(c); |
| } catch (IllegalArgumentException e) { |
| data = new char[0]; |
| } |
| } else { |
| data = new char[1]; |
| data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130]; |
| } |
| return data; |
| } |
| |
/**
 * Parse a comment. [92] 391:7
 * <p>
 * Reads characters until the comment terminator is recognised,
 * appending the comment text to the string buffer as it goes. \r and
 * \r\n are stored in the buffer as a single \n. On end of input
 * {@code handleEOFInComment} is invoked and parsing of the comment
 * stops.
 *
 * @throws IOException if reading the next character fails
 */
void parseComment() throws IOException {

    while (true) {
        // c is the char that will be appended to the buffer at the
        // bottom of the loop; ch moves on to the next input char.
        int c = ch;
        switch (c) {
          case '-':
              /** Presuming that the start string of a comment "<!--" has
                  already been parsed, the '-' character is valid only as
                  part of a comment termination and further more it must
                  be present in even numbers. Hence if strict is true, we
                  presume the comment has been terminated and return.
                  However if strict is false, then there is no even number
                  requirement and this character can appear anywhere in the
                  comment.  The parser reads on until it sees the following
                  pattern: "-->" or "--!>".
               **/
            // Lenient mode, and the previously buffered char was also a
            // '-': this '-' may complete a "--" that precedes ">" or "!>".
            if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                if ((ch = readCh()) == '>') {
                    return;
                }
                if (ch == '!') {
                    if ((ch = readCh()) == '>') {
                        return;
                    } else {
                        /* to account for extra read()'s that happened */
                        addString('-');
                        addString('!');
                        continue;
                    }
                }
                break;
            }

            // First '-' of a potential "--" sequence.
            if ((ch = readCh()) == '-') {
                ch = readCh();
                // In strict mode "--" always ends the comment.
                if (strict || ch == '>') {
                    return;
                }
                if (ch == '!') {
                    if ((ch = readCh()) == '>') {
                        return;
                    } else {
                        /* to account for extra read()'s that happened */
                        addString('-');
                        addString('!');
                        continue;
                    }
                }
                /* to account for the extra read() */
                addString('-');
            }
            break;

          case -1:
              // End of input inside the comment.
              handleEOFInComment();
              return;

          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            break;

          case '>':
            // A '>' that does not follow "--" is ordinary comment text.
            ch = readCh();
            break;

          case '\r':
            ln++;
            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            // Store the line terminator as '\n'.
            c = '\n';
            break;
          default:
            ch = readCh();
            break;
        }

        addString(c);
    }
}
| |
| /** |
| * Parse literal content. [46] 343:1 and [47] 344:1 |
| * <p> |
| * Copies characters into the {@code text} buffer until the buffered text |
| * ends with the end tag of the element on top of the stack, or until end |
| * of input. Entity references are expanded via |
| * {@code parseEntityReference()}, and a CR or CRLF is stored as '\n'. |
| * |
| * @param replace not read by this method body |
| * @throws IOException if reading the next character fails |
| */ |
| void parseLiteral(boolean replace) throws IOException { |
| while (true) { |
| int c = ch; |
| switch (c) { |
| case -1: |
| // end of input before the closing tag: report it and close the element |
| error("eof.literal", stack.elem.getName()); |
| endTag(true); |
| return; |
| |
| case '>': |
| ch = readCh(); |
| // i: where "</name" would have to start in the buffered text |
| int i = textpos - (stack.elem.name.length() + 2), j = 0; |
| |
| // match end tag |
| if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { |
| // compare the buffered characters (lower-cased) with the element name |
| while ((++i < textpos) && |
| (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); |
| if (i == textpos) { |
| // whole name matched: drop "</name" from the buffer |
| textpos -= (stack.elem.name.length() + 2); |
| // also drop a line break immediately before the end tag |
| if ((textpos > 0) && (text[textpos-1] == '\n')) { |
| textpos--; |
| } |
| endTag(false); |
| return; |
| } |
| } |
| // not the end tag: the '>' is stored as ordinary text below |
| break; |
| |
| case '&': |
| char data[] = parseEntityReference(); |
| // grow the buffer so the expanded entity fits |
| if (textpos + data.length > text.length) { |
| char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; |
| System.arraycopy(text, 0, newtext, 0, text.length); |
| text = newtext; |
| } |
| System.arraycopy(data, 0, text, textpos, data.length); |
| textpos += data.length; |
| // the data is already stored; skip the single-character output below |
| continue; |
| |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| break; |
| |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| // CR and CRLF are both stored as '\n' |
| c = '\n'; |
| break; |
| default: |
| ch = readCh(); |
| break; |
| } |
| |
| // output character |
| if (textpos == text.length) { |
| char newtext[] = new char[text.length + 128]; |
| System.arraycopy(text, 0, newtext, 0, text.length); |
| text = newtext; |
| } |
| text[textpos++] = (char)c; |
| } |
| } |
| |
| /** |
| * Parse attribute value. [33] 331:1 |
| * <p> |
| * The value may be delimited by single or double quotes, or be unquoted. |
| * An unquoted value ends at white space, a line break, '<', '>' or end of |
| * input; a quoted value ends at the matching quote or end of input. |
| * |
| * @param lower if {@code true}, the letters 'A'-'Z' are converted to |
| *        lower case as they are stored |
| * @return the collected value, as returned by {@code getString(0)} |
| * @throws IOException if reading the next character fails |
| */ |
| @SuppressWarnings("fallthrough") |
| String parseAttributeValue(boolean lower) throws IOException { |
| // -1 means the value is not quoted |
| int delim = -1; |
| |
| // Check for a delimiter |
| switch(ch) { |
| case '\'': |
| case '"': |
| delim = ch; |
| ch = readCh(); |
| break; |
| } |
| |
| // Parse the rest of the value |
| while (true) { |
| int c = ch; |
| |
| switch (c) { |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| // a line break terminates an unquoted value |
| if (delim < 0) { |
| return getString(0); |
| } |
| break; |
| |
| case '\r': |
| ln++; |
| |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| if (delim < 0) { |
| return getString(0); |
| } |
| // inside quotes c is still '\r' here, so '\r' is what gets stored |
| break; |
| |
| case '\t': |
| if (delim < 0) |
| c = ' '; |
| // Fall through |
| case ' ': |
| ch = readCh(); |
| // white space terminates an unquoted value |
| if (delim < 0) { |
| return getString(0); |
| } |
| break; |
| |
| case '>': |
| case '<': |
| // ends an unquoted value; the character is left for the caller |
| if (delim < 0) { |
| return getString(0); |
| } |
| ch = readCh(); |
| break; |
| |
| case '\'': |
| case '"': |
| ch = readCh(); |
| if (c == delim) { |
| // closing quote |
| return getString(0); |
| } else if (delim == -1) { |
| // a quote inside an unquoted value |
| error("attvalerr"); |
| if (strict || ch == ' ') { |
| return getString(0); |
| } else { |
| // keep parsing; the quote itself is not stored |
| continue; |
| } |
| } |
| // the other kind of quote inside a quoted value is stored as text |
| break; |
| |
| case '=': |
| if (delim < 0) { |
| /* In SGML a construct like <img src=/cgi-bin/foo?x=1> |
| is considered invalid since an = sign can only be contained |
| in an attributes value if the string is quoted. |
| */ |
| error("attvalerr"); |
| /* If strict is true then we return with the string we have thus far. |
| Otherwise we accept the = sign as part of the attribute's value and |
| process the rest of the img tag. */ |
| if (strict) { |
| return getString(0); |
| } |
| } |
| ch = readCh(); |
| break; |
| |
| case '&': |
| // strict mode, unquoted value: '&' is stored literally |
| if (strict && delim < 0) { |
| ch = readCh(); |
| break; |
| } |
| |
| // otherwise expand the entity and store its characters |
| char data[] = parseEntityReference(); |
| for (int i = 0 ; i < data.length ; i++) { |
| c = data[i]; |
| addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); |
| } |
| continue; |
| |
| case -1: |
| // end of input: return what has been collected so far |
| return getString(0); |
| |
| default: |
| if (lower && (c >= 'A') && (c <= 'Z')) { |
| c = 'a' + c - 'A'; |
| } |
| ch = readCh(); |
| break; |
| } |
| addString(c); |
| } |
| } |
| |
| |
| /** |
| * Parse attribute specification List. [31] 327:17 |
| * <p> |
| * Reads attribute specifications until '/', '>', '<' or end of input is |
| * seen and stores each one in {@code attributes}. Errors are reported |
| * through {@code error(...)}; when {@code strict} is false several |
| * malformed constructs are tolerated. |
| * |
| * @param elem the element whose attribute list is being parsed |
| * @throws IOException if reading the next character fails |
| */ |
| void parseAttributeSpecificationList(Element elem) throws IOException { |
| |
| while (true) { |
| skipSpace(); |
| |
| switch (ch) { |
| case '/': |
| case '>': |
| case '<': |
| case -1: |
| // end of the attribute list; the character is left for the caller |
| return; |
| |
| case '-': |
| // "--" starts a comment inside the tag; its text is discarded |
| if ((ch = readCh()) == '-') { |
| ch = readCh(); |
| parseComment(); |
| strpos = 0; |
| } else { |
| error("invalid.tagchar", "-", elem.getName()); |
| ch = readCh(); |
| } |
| continue; |
| } |
| |
| AttributeList att; |
| String attname; |
| String attvalue; |
| |
| if (parseIdentifier(true)) { |
| attname = getString(0); |
| skipSpace(); |
| if (ch == '=') { |
| // name=value form |
| ch = readCh(); |
| skipSpace(); |
| att = elem.getAttribute(attname); |
| // Bug ID 4102750 |
| // Load the NAME of an Attribute Case Sensitive |
| // The case of the NAME must be intact |
| // MG 021898 |
| attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); |
| // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); |
| } else { |
| // bare identifier: first try it as a value, then as an attribute name |
| attvalue = attname; |
| att = elem.getAttributeByValue(attvalue); |
| if (att == null) { |
| att = elem.getAttribute(attname); |
| if (att != null) { |
| attvalue = att.getValue(); |
| } |
| else { |
| // Make it null so that NULL_ATTRIBUTE_VALUE is |
| // used |
| attvalue = null; |
| } |
| } |
| } |
| } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs |
| ch = readCh(); |
| continue; |
| } else if (!strict && ch == '"') { // allows for quoted attributes |
| ch = readCh(); |
| skipSpace(); |
| if (parseIdentifier(true)) { |
| attname = getString(0); |
| // consume the closing quote of the attribute name, if present |
| if (ch == '"') { |
| ch = readCh(); |
| } |
| skipSpace(); |
| if (ch == '=') { |
| ch = readCh(); |
| skipSpace(); |
| att = elem.getAttribute(attname); |
| attvalue = parseAttributeValue((att != null) && |
| (att.type != CDATA) && |
| (att.type != NOTATION)); |
| } else { |
| // unlike the unquoted branch above, attvalue is not reset to null |
| // when no attribute matches |
| attvalue = attname; |
| att = elem.getAttributeByValue(attvalue); |
| if (att == null) { |
| att = elem.getAttribute(attname); |
| if (att != null) { |
| attvalue = att.getValue(); |
| } |
| } |
| } |
| } else { |
| char str[] = {(char)ch}; |
| error("invalid.tagchar", new String(str), elem.getName()); |
| ch = readCh(); |
| continue; |
| } |
| } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { |
| // '=' before any attribute: the element name is used as the attribute name |
| ch = readCh(); |
| skipSpace(); |
| attname = elem.getName(); |
| att = elem.getAttribute(attname); |
| attvalue = parseAttributeValue((att != null) && |
| (att.type != CDATA) && |
| (att.type != NOTATION)); |
| } else if (!strict && (ch == '=')) { |
| // stray '=' after other attributes: the value is parsed and dropped |
| ch = readCh(); |
| skipSpace(); |
| attvalue = parseAttributeValue(true); |
| error("attvalerr"); |
| return; |
| } else { |
| char str[] = {(char)ch}; |
| error("invalid.tagchar", new String(str), elem.getName()); |
| if (!strict) { |
| ch = readCh(); |
| continue; |
| } else { |
| return; |
| } |
| } |
| |
| // prefer the declared attribute's name when one was found |
| if (att != null) { |
| attname = att.getName(); |
| } else { |
| error("invalid.tagatt", attname, elem.getName()); |
| } |
| |
| // Check out the value |
| if (attributes.isDefined(attname)) { |
| error("multi.tagatt", attname, elem.getName()); |
| } |
| if (attvalue == null) { |
| attvalue = ((att != null) && (att.value != null)) ? att.value : |
| HTML.NULL_ATTRIBUTE_VALUE; |
| } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { |
| error("invalid.tagattval", attname, elem.getName()); |
| } |
| // store under the HTML.Attribute key when one exists, else under the raw name |
| HTML.Attribute attkey = HTML.getAttributeKey(attname); |
| if (attkey == null) { |
| attributes.addAttribute(attname, attvalue); |
| } else { |
| attributes.addAttribute(attkey, attvalue); |
| } |
| } |
| } |
| |
| /** |
| * Parses the Document Type Declaration markup declaration. |
| * Currently ignores it. |
| * |
| * @return the string representation of the markup declaration |
| * @throws IOException if an I/O error occurs |
| */ |
| public String parseDTDMarkup() throws IOException { |
| |
| StringBuilder strBuff = new StringBuilder(); |
| ch = readCh(); |
| while(true) { |
| switch (ch) { |
| case '>': |
| ch = readCh(); |
| return strBuff.toString(); |
| case -1: |
| error("invalid.markup"); |
| return strBuff.toString(); |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| break; |
| case '"': |
| ch = readCh(); |
| break; |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| break; |
| default: |
| strBuff.append((char)(ch & 0xFF)); |
| ch = readCh(); |
| break; |
| } |
| } |
| } |
| |
| /** |
| * Parse markup declarations. |
| * Currently only handles the Document Type Declaration markup. |
| * Returns true if it is a markup declaration false otherwise. |
| * |
| * @param strBuff the markup declaration |
| * @return {@code true} if this is a valid markup declaration; |
| * otherwise {@code false} |
| * @throws IOException if an I/O error occurs |
| */ |
| protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { |
| |
| /* Currently handles only the DOCTYPE */ |
| if ((strBuff.length() == "DOCTYPE".length()) && |
| (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { |
| parseDTDMarkup(); |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Parse an invalid tag. |
| */ |
| void parseInvalidTag() throws IOException { |
| // ignore all data upto the close bracket '>' |
| while (true) { |
| skipSpace(); |
| switch (ch) { |
| case '>': |
| case -1: |
| ch = readCh(); |
| return; |
| case '<': |
| return; |
| default: |
| ch = readCh(); |
| |
| } |
| } |
| } |
| |
| /** |
| * Parse a start or end tag. |
| * <p> |
| * The first statement reads the character that follows the current one, |
| * and dispatches on it: '!' introduces a comment or markup declaration, |
| * '/' an end tag, end of input is reported as "eof", and anything else is |
| * treated as the beginning of a start tag. |
| * |
| * @throws IOException if reading the next character fails |
| */ |
| @SuppressWarnings("fallthrough") |
| void parseTag() throws IOException { |
| Element elem; |
| // net: the start tag was closed with '/' |
| boolean net = false; |
| // warned: an "invalid.commentchar" error has already been reported |
| boolean warned = false; |
| // unknown: the element name is not defined in the DTD |
| boolean unknown = false; |
| |
| switch (ch = readCh()) { |
| case '!': |
| switch (ch = readCh()) { |
| case '-': |
| // Parse comment. [92] 391:7 |
| while (true) { |
| if (ch == '-') { |
| // strict mode requires a second '-'; lenient mode accepts one |
| if (!strict || ((ch = readCh()) == '-')) { |
| ch = readCh(); |
| // lenient mode: skip the second '-' of "--" if it is there |
| if (!strict && ch == '-') { |
| ch = readCh(); |
| } |
| // send over any text you might see |
| // before parsing and sending the |
| // comment |
| if (textpos != 0) { |
| char newtext[] = new char[textpos]; |
| System.arraycopy(text, 0, newtext, 0, textpos); |
| handleText(newtext); |
| lastBlockStartPos = currentBlockStartPos; |
| textpos = 0; |
| } |
| parseComment(); |
| last = makeTag(dtd.getElement("comment"), true); |
| handleComment(getChars(0)); |
| continue; |
| } else if (!warned) { |
| warned = true; |
| error("invalid.commentchar", "-"); |
| } |
| } |
| skipSpace(); |
| switch (ch) { |
| case '-': |
| // another comment may follow inside the same declaration |
| continue; |
| case '>': |
| ch = readCh(); |
| return; |
| case -1: |
| return; |
| default: |
| ch = readCh(); |
| if (!warned) { |
| warned = true; |
| // NOTE(review): ch was advanced just above, so the character |
| // reported here is the one after the offending one - confirm |
| error("invalid.commentchar", |
| String.valueOf((char)ch)); |
| } |
| break; |
| } |
| } |
| |
| default: |
| // deal with marked sections |
| StringBuffer strBuff = new StringBuffer(); |
| while (true) { |
| // accumulate the keyword one character at a time until it is recognized |
| strBuff.append((char)ch); |
| if (parseMarkupDeclarations(strBuff)) { |
| return; |
| } |
| switch(ch) { |
| case '>': |
| ch = readCh(); |
| // Fall through |
| case -1: |
| error("invalid.markup"); |
| return; |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| break; |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| break; |
| |
| default: |
| ch = readCh(); |
| break; |
| } |
| } |
| } |
| |
| case '/': |
| // parse end tag [19] 317:4 |
| switch (ch = readCh()) { |
| case '>': |
| ch = readCh(); |
| // Fall through |
| case '<': |
| // empty end tag. either </> or </< |
| if (recent == null) { |
| error("invalid.shortend"); |
| return; |
| } |
| elem = recent; |
| break; |
| |
| default: |
| if (!parseIdentifier(true)) { |
| error("expected.endtagname"); |
| return; |
| } |
| skipSpace(); |
| switch (ch) { |
| case '>': |
| ch = readCh(); |
| break; |
| case '<': |
| break; |
| |
| default: |
| error("expected", "'>'"); |
| // recover by skipping to the end of the tag, the line or the input |
| while ((ch != -1) && (ch != '\n') && (ch != '>')) { |
| ch = readCh(); |
| } |
| if (ch == '>') { |
| ch = readCh(); |
| } |
| break; |
| } |
| String elemStr = getString(0); |
| if (!dtd.elementExists(elemStr)) { |
| error("end.unrecognized", elemStr); |
| // Ignore RE before end tag |
| if ((textpos > 0) && (text[textpos-1] == '\n')) { |
| textpos--; |
| } |
| // the DTD's "unknown" element is reused and renamed for this tag |
| elem = dtd.getElement("unknown"); |
| elem.name = elemStr; |
| unknown = true; |
| } else { |
| elem = dtd.getElement(elemStr); |
| } |
| break; |
| } |
| |
| |
| // If the stack is null, we're seeing end tags without any begin |
| // tags. Ignore them. |
| |
| if (stack == null) { |
| error("end.extra.tag", elem.getName()); |
| return; |
| } |
| |
| // Ignore RE before end tag |
| if ((textpos > 0) && (text[textpos-1] == '\n')) { |
| // In a pre tag, if there are blank lines |
| // we do not want to remove the newline |
| // before the end tag. Hence this code. |
| // |
| if (stack.pre) { |
| if ((textpos > 1) && (text[textpos-2] != '\n')) { |
| textpos--; |
| } |
| } else { |
| textpos--; |
| } |
| } |
| |
| // If the end tag is a form, since we did not put it |
| // on the tag stack, there is no corresponding start |
| // start tag to find. Hence do not touch the tag stack. |
| // |
| |
| /* |
| if (!strict && elem.getName().equals("form")) { |
| if (lastFormSent != null) { |
| handleEndTag(lastFormSent); |
| return; |
| } else { |
| // do nothing. |
| return; |
| } |
| } |
| */ |
| |
| if (unknown) { |
| // we will not see a corresponding start tag |
| // on the stack. If we are seeing an |
| // end tag, lets send this on as an empty |
| // tag with the end tag attribute set to |
| // true. |
| TagElement t = makeTag(elem); |
| handleText(t); |
| attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); |
| handleEmptyTag(makeTag(elem)); |
| unknown = false; |
| return; |
| } |
| |
| // find the corresponding start tag |
| |
| // A commonly occurring error appears to be the insertion |
| // of extra end tags in a table. The intent here is ignore |
| // such extra end tags. |
| // |
| if (!strict) { |
| String stackElem = stack.elem.getName(); |
| |
| if (stackElem.equals("table")) { |
| // If it is not a valid end tag ignore it and return |
| // |
| if (!elem.getName().equals(stackElem)) { |
| error("tag.ignore", elem.getName()); |
| return; |
| } |
| } |
| |
| |
| |
| // inside a row or cell only </table> or the matching end tag is accepted |
| if (stackElem.equals("tr") || |
| stackElem.equals("td")) { |
| if ((!elem.getName().equals("table")) && |
| (!elem.getName().equals(stackElem))) { |
| error("tag.ignore", elem.getName()); |
| return; |
| } |
| } |
| } |
| // walk down the stack looking for the entry opened by this element |
| TagStack sp = stack; |
| |
| while ((sp != null) && (elem != sp.elem)) { |
| sp = sp.next; |
| } |
| if (sp == null) { |
| error("unmatched.endtag", elem.getName()); |
| return; |
| } |
| |
| // People put font ending tags in the darndest places. |
| // Don't close other contexts based on them being between |
| // a font tag and the corresponding end tag. Instead, |
| // ignore the end tag like it doesn't exist and allow the end |
| // of the document to close us out. |
| String elemName = elem.getName(); |
| if (stack != sp && |
| (elemName.equals("font") || |
| elemName.equals("center"))) { |
| |
| // Since closing out a center tag can have real wierd |
| // effects on the formatting, make sure that tags |
| // for which omitting an end tag is legimitate |
| // get closed out. |
| // |
| if (elemName.equals("center")) { |
| while(stack.elem.omitEnd() && stack != sp) { |
| endTag(true); |
| } |
| if (stack.elem == elem) { |
| endTag(false); |
| } |
| } |
| return; |
| } |
| // People do the same thing with center tags. In this |
| // case we would like to close off the center tag but |
| // not necessarily all enclosing tags. |
| |
| |
| |
| // end tags |
| while (stack != sp) { |
| endTag(true); |
| } |
| |
| endTag(false); |
| return; |
| |
| case -1: |
| error("eof"); |
| return; |
| } |
| |
| // start tag [14] 314:1 |
| if (!parseIdentifier(true)) { |
| // no name: only "<>" with a known recent element is accepted |
| elem = recent; |
| if ((ch != '>') || (elem == null)) { |
| error("expected.tagname"); |
| return; |
| } |
| } else { |
| String elemStr = getString(0); |
| |
| // "image" is treated as a synonym for "img" |
| if (elemStr.equals("image")) { |
| elemStr = "img"; |
| } |
| |
| /* determine if this element is part of the dtd. */ |
| |
| if (!dtd.elementExists(elemStr)) { |
| // parseInvalidTag(); |
| // NOTE(review): this error key ends with a space - confirm it is intended |
| error("tag.unrecognized ", elemStr); |
| elem = dtd.getElement("unknown"); |
| elem.name = elemStr; |
| unknown = true; |
| } else { |
| elem = dtd.getElement(elemStr); |
| } |
| } |
| |
| // Parse attributes |
| parseAttributeSpecificationList(elem); |
| |
| switch (ch) { |
| case '/': |
| net = true; |
| // Fall through |
| case '>': |
| ch = readCh(); |
| // for "/>" also consume the '>' |
| if (ch == '>' && net) { |
| ch = readCh(); |
| } |
| case '<': |
| break; |
| |
| default: |
| error("expected", "'>'"); |
| break; |
| } |
| |
| if (!strict) { |
| if (elem.getName().equals("script")) { |
| error("javascript.unsupported"); |
| } |
| } |
| |
| // ignore RE after start tag |
| // |
| if (!elem.isEmpty()) { |
| if (ch == '\n') { |
| ln++; |
| lfCount++; |
| ch = readCh(); |
| } else if (ch == '\r') { |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| } |
| } |
| |
| // ensure a legal context for the tag |
| TagElement tag = makeTag(elem, false); |
| |
| |
| /** In dealing with forms, we have decided to treat |
| them as legal in any context. Also, even though |
| they do have a start and an end tag, we will |
| not put this tag on the stack. This is to deal |
| several pages in the web oasis that choose to |
| start and end forms in any possible location. **/ |
| |
| /* |
| if (!strict && elem.getName().equals("form")) { |
| if (lastFormSent == null) { |
| lastFormSent = tag; |
| } else { |
| handleEndTag(lastFormSent); |
| lastFormSent = tag; |
| } |
| } else { |
| */ |
| // Smlly, if a tag is unknown, we will apply |
| // no legalTagContext logic to it. |
| // |
| if (!unknown) { |
| legalTagContext(tag); |
| |
| // If skip tag is true, this implies that |
| // the tag was illegal and that the error |
| // recovery strategy adopted is to ignore |
| // the tag. |
| if (!strict && skipTag) { |
| skipTag = false; |
| return; |
| } |
| } |
| /* |
| } |
| */ |
| |
| startTag(tag); |
| |
| if (!elem.isEmpty()) { |
| switch (elem.getType()) { |
| case CDATA: |
| parseLiteral(false); |
| break; |
| case RCDATA: |
| parseLiteral(true); |
| break; |
| default: |
| // remember on the stack entry whether the tag was closed with '/' |
| if (stack != null) { |
| stack.net = net; |
| } |
| break; |
| } |
| } |
| } |
| |
| // Delimiters recognized while scanning script content and when trimming |
| // the comment wrapper from the collected script text. |
| private static final String START_COMMENT = "<!--"; |
| private static final String END_COMMENT = "-->"; |
| // The script end tag in all-lower and all-upper case; the scanner accepts |
| // either form at each character position. |
| private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); |
| private static final char[] SCRIPT_END_TAG_UPPER_CASE = |
| "</SCRIPT>".toCharArray(); |
| |
| /** |
| * Collects the content of a script element via {@code addString} until |
| * the script end tag is matched outside of an HTML comment, or until end |
| * of input (reported as "eof.script"). A CR or CRLF is stored as '\n'. |
| * |
| * @throws IOException if reading the next character fails |
| */ |
| void parseScript() throws IOException { |
| // holds characters consumed while tentatively matching a delimiter |
| char[] charsToAdd = new char[SCRIPT_END_TAG.length]; |
| boolean insideComment = false; |
| |
| /* Here, ch should be the first character after <script> */ |
| while (true) { |
| // i counts the characters consumed by the tentative matches below |
| int i = 0; |
| // try to match the end tag; each position accepts lower or upper case |
| while (!insideComment && i < SCRIPT_END_TAG.length |
| && (SCRIPT_END_TAG[i] == ch |
| || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { |
| charsToAdd[i] = (char) ch; |
| ch = readCh(); |
| i++; |
| } |
| if (i == SCRIPT_END_TAG.length) { |
| // complete end tag seen; it is not added to the collected text |
| return; |
| } |
| |
| if (!insideComment && i == 1 && charsToAdd[0] == START_COMMENT.charAt(0)) { |
| // it isn't end script tag, but may be it's start comment tag? |
| while (i < START_COMMENT.length() |
| && START_COMMENT.charAt(i) == ch) { |
| charsToAdd[i] = (char) ch; |
| ch = readCh(); |
| i++; |
| } |
| if (i == START_COMMENT.length()) { |
| insideComment = true; |
| } |
| } |
| if (insideComment) { |
| // while inside a comment, look for its terminator |
| while (i < END_COMMENT.length() |
| && END_COMMENT.charAt(i) == ch) { |
| charsToAdd[i] = (char) ch; |
| ch = readCh(); |
| i++; |
| } |
| if (i == END_COMMENT.length()) { |
| insideComment = false; |
| } |
| } |
| |
| /* To account for extra read()'s that happened */ |
| if (i > 0) { |
| for (int j = 0; j < i; j++) { |
| addString(charsToAdd[j]); |
| } |
| continue; |
| } |
| // nothing was tentatively consumed: handle the current character |
| switch (ch) { |
| case -1: |
| error("eof.script"); |
| return; |
| case '\n': |
| ln++; |
| ch = readCh(); |
| lfCount++; |
| addString('\n'); |
| break; |
| case '\r': |
| ln++; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } else { |
| crCount++; |
| } |
| addString('\n'); |
| break; |
| default: |
| addString(ch); |
| ch = readCh(); |
| break; |
| } // switch |
| } // while |
| } |
| |
| /** |
| * Parse Content. [24] 320:1 |
| * <p> |
| * Main loop of the parser: dispatches on the current character until end |
| * of input or until the current thread is found to be interrupted. Tags |
| * are handed to {@code parseTag()}, script content to |
| * {@code parseScript()}, and character data is accumulated in the |
| * {@code text} buffer. Outside of a {@code pre} context a run of white |
| * space is recorded as a single pending space. |
| * |
| * @throws IOException if reading the next character fails |
| */ |
| void parseContent() throws IOException { |
| Thread curThread = Thread.currentThread(); |
| |
| for (;;) { |
| if (curThread.isInterrupted()) { |
| curThread.interrupt(); // resignal the interrupt |
| break; |
| } |
| |
| int c = ch; |
| currentBlockStartPos = currentPosition; |
| |
| if (recent == dtd.script) { // means: if after starting <script> tag |
| |
| /* Here, ch has to be the first character after <script> */ |
| parseScript(); |
| last = makeTag(dtd.getElement("comment"), true); |
| |
| /* Remove leading and trailing HTML comment declarations */ |
| String str = new String(getChars(0)).trim(); |
| int minLength = START_COMMENT.length() + END_COMMENT.length(); |
| if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) |
| && str.length() >= (minLength)) { |
| str = str.substring(START_COMMENT.length(), |
| str.length() - END_COMMENT.length()); |
| } |
| |
| /* Handle resulting chars as comment */ |
| handleComment(str.toCharArray()); |
| endTag(false); |
| lastBlockStartPos = currentPosition; |
| |
| continue; |
| } else { |
| switch (c) { |
| case '<': |
| parseTag(); |
| lastBlockStartPos = currentPosition; |
| continue; |
| |
| case '/': |
| ch = readCh(); |
| if ((stack != null) && stack.net) { |
| // null end tag. |
| endTag(false); |
| continue; |
| } else if (textpos == 0) { |
| // first character of a new text run |
| if (!legalElementContext(dtd.pcdata)) { |
| error("unexpected.pcdata"); |
| } |
| if (last.breaksFlow()) { |
| space = false; |
| } |
| } |
| break; |
| |
| case -1: |
| // end of input |
| return; |
| |
| case '&': |
| if (textpos == 0) { |
| if (!legalElementContext(dtd.pcdata)) { |
| error("unexpected.pcdata"); |
| } |
| if (last.breaksFlow()) { |
| space = false; |
| } |
| } |
| char data[] = parseEntityReference(); |
| // the extra 1 leaves room for a pending space |
| if (textpos + data.length + 1 > text.length) { |
| char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; |
| System.arraycopy(text, 0, newtext, 0, text.length); |
| text = newtext; |
| } |
| if (space) { |
| space = false; |
| text[textpos++] = ' '; |
| } |
| System.arraycopy(data, 0, text, textpos, data.length); |
| textpos += data.length; |
| ignoreSpace = false; |
| continue; |
| |
| case '\n': |
| ln++; |
| lfCount++; |
| ch = readCh(); |
| // in a pre context the line break is kept as text |
| if ((stack != null) && stack.pre) { |
| break; |
| } |
| if (textpos == 0) { |
| lastBlockStartPos = currentPosition; |
| } |
| // otherwise it only marks a pending space |
| if (!ignoreSpace) { |
| space = true; |
| } |
| continue; |
| |
| case '\r': |
| ln++; |
| // CR and CRLF are both treated as '\n' |
| c = '\n'; |
| if ((ch = readCh()) == '\n') { |
| ch = readCh(); |
| crlfCount++; |
| } |
| else { |
| crCount++; |
| } |
| if ((stack != null) && stack.pre) { |
| break; |
| } |
| if (textpos == 0) { |
| lastBlockStartPos = currentPosition; |
| } |
| if (!ignoreSpace) { |
| space = true; |
| } |
| continue; |
| |
| |
| case '\t': |
| case ' ': |
| ch = readCh(); |
| if ((stack != null) && stack.pre) { |
| break; |
| } |
| if (textpos == 0) { |
| lastBlockStartPos = currentPosition; |
| } |
| if (!ignoreSpace) { |
| space = true; |
| } |
| continue; |
| |
| default: |
| if (textpos == 0) { |
| if (!legalElementContext(dtd.pcdata)) { |
| error("unexpected.pcdata"); |
| } |
| if (last.breaksFlow()) { |
| space = false; |
| } |
| } |
| ch = readCh(); |
| break; |
| } |
| } |
| |
| // enlarge buffer if needed |
| // (room for a pending space plus the character itself) |
| if (textpos + 2 > text.length) { |
| char newtext[] = new char[text.length + 128]; |
| System.arraycopy(text, 0, newtext, 0, text.length); |
| text = newtext; |
| } |
| |
| // output pending space |
| if (space) { |
| // presumably moves the block start back to cover the space - TODO confirm |
| if (textpos == 0) { |
| lastBlockStartPos--; |
| } |
| text[textpos++] = ' '; |
| space = false; |
| } |
| text[textpos++] = (char)c; |
| ignoreSpace = false; |
| } |
| } |
| |
| /** |
| * Returns the end of line string. This will return the end of line |
| * string that has been encountered the most, one of \r, \n or \r\n. |
| * When counts are tied, \n is chosen over either alternative. |
| * |
| * @return "\r", "\n" or "\r\n" |
| */ |
| String getEndOfLineString() { |
|     // Bare CR strictly outnumbers CRLF: the contest is between CR and LF. |
|     if (crCount > crlfCount) { |
|         return (crCount > lfCount) ? "\r" : "\n"; |
|     } |
|     // Otherwise the contest is between LF and CRLF. |
|     return (lfCount >= crlfCount) ? "\n" : "\r\n"; |
| } |
| |
| /** |
| * Parse an HTML stream, given a DTD. |
| * <p> |
| * Resets the line number, the seen-element flags and the line-ending |
| * counters, then parses the whole content of the reader. The reader is |
| * closed only when parsing completes without an exception. An |
| * {@code IOException} is reported and rethrown; any other |
| * {@code Exception} is reported and its stack trace printed, but it is |
| * not rethrown. |
| * |
| * @param in the reader to read the source from |
| * @throws IOException if an I/O error occurs |
| */ |
| public synchronized void parse(Reader in) throws IOException { |
| this.in = in; |
| |
| this.ln = 1; |
| |
| seenHtml = false; |
| seenHead = false; |
| seenBody = false; |
| |
| crCount = lfCount = crlfCount = 0; |
| |
| try { |
| // prime the one-character lookahead before allocating the work buffers |
| ch = readCh(); |
| text = new char[1024]; |
| str = new char[128]; |
| |
| parseContent(); |
| // NOTE: interruption may have occurred. Control flows out |
| // of here normally. |
| // close every element that is still open |
| while (stack != null) { |
| endTag(true); |
| } |
| in.close(); |
| } catch (IOException e) { |
| errorContext(); |
| error("ioexception"); |
| throw e; |
| } catch (Exception e) { |
| errorContext(); |
| error("exception", e.getClass().getName(), e.getMessage()); |
| e.printStackTrace(); |
| } catch (ThreadDeath e) { |
| errorContext(); |
| error("terminated"); |
| e.printStackTrace(); |
| throw e; |
| } finally { |
| // after a failure, still deliver an end tag for each element left on the stack |
| for (; stack != null ; stack = stack.next) { |
| handleEndTag(stack.tag); |
| } |
| |
| // release the work buffers |
| text = null; |
| str = null; |
| } |
| |
| } |
| |
| |
| /* |
| * Input cache. This is much faster than calling down to a synchronized |
| * method of BufferedReader for each byte. Measurements done 5/30/97 |
| * show that there's no point in having a bigger buffer: Increasing |
| * the buffer to 8192 had no measurable impact for a program discarding |
| * one character at a time (reading from an http URL to a local machine). |
| * NOTE: If the current encoding is bogus, and we read too much |
| * (past the content-type) we may suffer a MalformedInputException. For |
| * this reason the initial size is 1 and when the body is encountered the |
| * size is adjusted to 256. |
| */ |
| private char buf[] = new char[1]; |
| // index in buf of the next character readCh() will return |
| private int pos; |
| // number of characters placed in buf by the most recent read |
| private int len; |
| /* |
| tracks position relative to the beginning of the |
| document. |
| */ |
| private int currentPosition; |
| |
| |
| /** |
| * Returns the next character of the input, refilling the input cache |
| * from the reader when it has been used up. Every character returned |
| * advances {@code currentPosition} by one. |
| * |
| * @return the next character, or -1 when the reader has no more data |
| * @throws IOException if the underlying read fails; an |
| *         {@code InterruptedIOException} is propagated unchanged |
| */ |
| private final int readCh() throws IOException { |
|     if (pos >= len) { |
|         // Cache exhausted: refill it with a single read. |
|         len = in.read(buf); |
|         if (len <= 0) { |
|             return -1; // eof |
|         } |
|         pos = 0; |
|     } |
|     ++currentPosition; |
|  |
|     return buf[pos++]; |
| } |
| |
| |
| /** |
| * Returns the current position. |
| * <p> |
| * This is the value of the counter that {@code readCh()} increments for |
| * every character it returns. |
| * |
| * @return the current position |
| */ |
| protected int getCurrentPos() { |
| return currentPosition; |
| } |
| } |