| /* |
| * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. |
| */ |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.sun.org.apache.xerces.internal.impl.xpath.regex; |
| |
| import java.text.CharacterIterator; |
| import java.util.Locale; |
| import java.util.Stack; |
| |
| import com.sun.org.apache.xerces.internal.util.IntStack; |
| |
| /** |
| * A regular expression matching engine using Non-deterministic Finite Automaton (NFA). |
| * This engine does not conform to the POSIX regular expression. |
| * |
| * <hr width="50%"> |
| * <h3>How to use</h3> |
| * |
| * <dl> |
| * <dt>A. Standard way |
| * <dd> |
| * <pre> |
| * RegularExpression re = new RegularExpression(<var>regex</var>); |
| * if (re.matches(text)) { ... } |
| * </pre> |
| * |
| * <dt>B. Capturing groups |
| * <dd> |
| * <pre> |
| * RegularExpression re = new RegularExpression(<var>regex</var>); |
| * Match match = new Match(); |
| * if (re.matches(text, match)) { |
| * ... // You can refer captured texts with methods of the <code>Match</code> class. |
| * } |
| * </pre> |
| * |
| * </dl> |
| * |
| * <h4>Case-insensitive matching</h4> |
| * <pre> |
| * RegularExpression re = new RegularExpression(<var>regex</var>, "i"); |
| * if (re.matches(text) >= 0) { ...} |
| * </pre> |
| * |
| * <h4>Options</h4> |
| * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a> |
| * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>. |
| * This <var>options</var> parameter consists of the following characters. |
| * </p> |
| * <dl> |
| * <dt><a name="I_OPTION"><code>"i"</code></a> |
| * <dd>This option indicates case-insensitive matching. |
| * <dt><a name="M_OPTION"><code>"m"</code></a> |
| * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text. |
| * <dt><a name="S_OPTION"><code>"s"</code></a> |
| * <dd class="REGEX"><kbd>.</kbd> matches any one character. |
| * <dt><a name="U_OPTION"><code>"u"</code></a> |
| * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode. |
| * <dt><a name="W_OPTION"><code>"w"</code></a> |
| * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of |
| * 'Unicode Regular Expression Guidelines' Revision 4. |
| * When "w" and "u" are specified at the same time, |
| * <kbd>\b \B \< \></kbd> are processed for the "w" option. |
| * <dt><a name="COMMA_OPTION"><code>","</code></a> |
| * <dd>The parser treats a comma in a character class as a range separator. |
| * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option. |
| * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option. |
| * |
| * <dt><a name="X_OPTION"><code>"X"</code></a> |
| * <dd class="REGEX"> |
| * By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>. |
| * The <code>match()</code> method does not do subsring matching |
| * but entire string matching. |
| * |
| * </dl> |
| * |
| * <hr width="50%"> |
| * <h3>Syntax</h3> |
| * <table border="1" bgcolor="#ddeeff"> |
| * <tr> |
| * <td> |
| * <h4>Differences from the Perl 5 regular expression</h4> |
| * <ul> |
| * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.) |
| * <li>Supports subtraction, union, and intersection operations for character classes. |
| * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations), |
| * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>, |
| * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>, |
| * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>, |
| * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd> |
| * </ul> |
| * </td> |
| * </tr> |
| * </table> |
| * |
| * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P> |
| * <ul> |
| * <li>Character |
| * <dl> |
| * <dt class="REGEX"><kbd>.</kbd> (A period) |
| * <dd>Matches any one character except the following characters. |
| * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), |
| * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028) |
| * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates. |
| * <dd>When <a href="#S_OPTION">the "s" option</a> is specified, |
| * it matches any character including the above four characters. |
| * |
| * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd> |
| * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A), |
| * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009) |
| * |
| * <dt class="REGEX"><kbd>\c</kbd><var>C</var> |
| * <dd>Matches a control character. |
| * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>', |
| * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'. |
| * It matches a control character of which the character code is less than |
| * the character code of the <var>C</var> by 0x0040. |
| * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A), |
| * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B). |
| * |
| * <dt class="REGEX">a non-meta character |
| * <dd>Matches the character. |
| * |
| * <dt class="REGEX"><KBD>\</KBD> + a meta character |
| * <dd>Matches the meta character. |
| * |
| * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> |
| * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode. |
| * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and |
| * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>. |
| * |
| * <!-- |
| * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var> |
| * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode. |
| * --> |
| * |
| * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var> |
| * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode. |
| * |
| * <dt class="REGEX"><kbd>\g</kbd> |
| * <dd>Matches a grapheme. |
| * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd> |
| * |
| * <dt class="REGEX"><kbd>\X</kbd> |
| * <dd class="REGEX">Matches a combining character sequence. |
| * It is equivalent to <kbd>(?:\PM\pM*)</kbd> |
| * </dl> |
| * </li> |
| * |
| * <li>Character class |
| * <dl> |
| + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>) |
| + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>) |
| * <dd>Positive character class. It matches a character in ranges. |
| * <dd><var>R<sub>n</sub></var>: |
| * <ul> |
| * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>) |
| * <p>This range matches the character. |
| * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var> |
| * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point. |
| + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>, |
| + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd> |
| * <p>... |
| * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd> |
| * <p>These expressions specifies the same ranges as the following expressions. |
| * </ul> |
| * <p class="REGEX">Enumerated ranges are merged (union operation). |
| * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd> |
| * |
| * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>) |
| * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>) |
| * <dd>Negative character class. It matches a character not in ranges. |
| * |
| * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd> |
| * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.) |
| * <dd>Subtraction or union or intersection for character classes. |
| * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>. |
| * <dd>The result of this operations is a <u>positive character class</u> |
| * even if an expression includes any negative character classes. |
| * You have to take care on this in case-insensitive matching. |
| * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>, |
| * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching. |
| * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because |
| * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>' |
| * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>. |
| * |
| * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt> |
| * <dd>Character class subtraction for the XML Schema. |
| * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>. |
| * |
| * <dt class="REGEX"><kbd>\d</kbd> |
| * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>. |
| * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| * <span class="REGEX"><kbd>\p{Nd}</kbd></span>. |
| * |
| * <dt class="REGEX"><kbd>\D</kbd> |
| * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd> |
| * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| * <span class="REGEX"><kbd>\P{Nd}</kbd></span>. |
| * |
| * <dt class="REGEX"><kbd>\s</kbd> |
| * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd> |
| * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>. |
| * |
| * <dt class="REGEX"><kbd>\S</kbd> |
| * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd> |
| * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>. |
| * |
| * <dt class="REGEX"><kbd>\w</kbd> |
| * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd> |
| * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. |
| * |
| * <dt class="REGEX"><kbd>\W</kbd> |
| * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd> |
| * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. |
| * |
| * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd> |
| * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>. |
| * The following names are available: |
| * <dl> |
| * <dt>Unicode General Categories: |
| * <dd><kbd> |
| * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp, |
| * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So, |
| * </kbd> |
| * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters) |
| * <dt>Unicode Blocks: |
| * <dd><kbd> |
| * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B, |
| * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek, |
| * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati, |
| * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian, |
| * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation, |
| * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols, |
| * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators, |
| * Miscellaneous Technical, Control Pictures, Optical Character Recognition, |
| * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes, |
| * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana, |
| * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, |
| * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs, |
| * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates, |
| * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms, |
| * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms, |
| * Small Form Variants, Arabic Presentation Forms-B, Specials, |
| * Halfwidth and Fullwidth Forms |
| * </kbd> |
| * <dt>Others: |
| * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>) |
| * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>) |
| * <dd><kbd>UNASSGINED</kbd> |
| * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>) |
| * </dl> |
| * |
| * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd> |
| * <dd>Matches one character not in the specified General Category or the specified Block. |
| * </dl> |
| * </li> |
| * |
| * <li>Selection and Quantifier |
| * <dl> |
| * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR> |
| * <dd>... |
| * |
| * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD> |
| * <dd>Matches 0 or more <var>X</var>. |
| * |
| * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD> |
| * <dd>Matches 1 or more <var>X</var>. |
| * |
| * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD> |
| * <dd>Matches 0 or 1 <var>X</var>. |
| * |
| * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd> |
| * <dd>Matches <var>number</var> times. |
| * |
| * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd> |
| * <dd>... |
| * |
| * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd> |
| * <dd>... |
| * |
| * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd> |
| * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd> |
| * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd> |
| * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd> |
| * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd> |
| * <dd>Non-greedy matching. |
| * </dl> |
| * </li> |
| * |
| * <li>Grouping, Capturing, and Back-reference |
| * <dl> |
| * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD> |
| * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>". |
| * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>", |
| * you have to write "<KBD>(?:foo)+</KBD>". |
| * |
| * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD> |
| * <dd>Grouping with capturing. |
| * It make a group and applications can know |
| * where in target text a group matched with methods of a <code>Match</code> instance |
| * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.utils.regex.Match)">matches(String,Match)</a></code>. |
| * The 0th group means whole of this regular expression. |
| * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis. |
| * |
| * <p>For instance, a regular expression is |
| * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>" |
| * and target text is |
| * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>": |
| * <ul> |
| * <li><code>Match.getCapturedText(0)</code>: |
| * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>" |
| * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>" |
| * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>" |
| * </ul> |
| * |
| * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd> |
| * <dd> |
| * |
| * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd> |
| * <dd>Independent expression group. ................ |
| * |
| * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd> |
| * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd> |
| * <dd>............................ |
| * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'. |
| * Note that it can not contain 'u'. |
| * |
| * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd> |
| * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd> |
| * <dd>...... |
| * <dd>These expressions must be at the beginning of a group. |
| * </dl> |
| * </li> |
| * |
| * <li>Anchor |
| * <dl> |
| * <dt class="REGEX"><kbd>\A</kbd> |
| * <dd>Matches the beginnig of the text. |
| * |
| * <dt class="REGEX"><kbd>\Z</kbd> |
| * <dd>Matches the end of the text, or before an EOL character at the end of the text, |
| * or CARRIAGE RETURN + LINE FEED at the end of the text. |
| * |
| * <dt class="REGEX"><kbd>\z</kbd> |
| * <dd>Matches the end of the text. |
| * |
| * <dt class="REGEX"><kbd>^</kbd> |
| * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>. |
| * <dd>When <a href="#M_OPTION">a "m" option</a> is set, |
| * it matches the beginning of the text, or after one of EOL characters ( |
| * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028), |
| * PARAGRAPH SEPARATOR (U+2029).) |
| * |
| * <dt class="REGEX"><kbd>$</kbd> |
| * <dd>Matches the end of the text, or before an EOL character at the end of the text, |
| * or CARRIAGE RETURN + LINE FEED at the end of the text. |
| * <dd>When <a href="#M_OPTION">a "m" option</a> is set, |
| * it matches the end of the text, or before an EOL character. |
| * |
| * <dt class="REGEX"><kbd>\b</kbd> |
| * <dd>Matches word boundary. |
| * (See <a href="#W_OPTION">a "w" option</a>) |
| * |
| * <dt class="REGEX"><kbd>\B</kbd> |
| * <dd>Matches non word boundary. |
| * (See <a href="#W_OPTION">a "w" option</a>) |
| * |
| * <dt class="REGEX"><kbd>\<</kbd> |
| * <dd>Matches the beginning of a word. |
| * (See <a href="#W_OPTION">a "w" option</a>) |
| * |
| * <dt class="REGEX"><kbd>\></kbd> |
| * <dd>Matches the end of a word. |
| * (See <a href="#W_OPTION">a "w" option</a>) |
| * </dl> |
| * </li> |
| * <li>Lookahead and lookbehind |
| * <dl> |
| * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd> |
| * <dd>Lookahead. |
| * |
| * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd> |
| * <dd>Negative lookahead. |
| * |
| * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd> |
| * <dd>Lookbehind. |
| * <dd>(Note for text capturing......) |
| * |
| * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd> |
| * <dd>Negative lookbehind. |
| * </dl> |
| * </li> |
| * |
| * <li>Misc. |
| * <dl> |
| * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>, |
| * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd> |
| * <dd>...... |
| * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd> |
| * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'. |
| * You can not write comments in character classes and before quantifiers. |
| * </dl> |
| * </li> |
| * </ul> |
| * |
| * |
| * <hr width="50%"> |
| * <h3>BNF for the regular expression</h3> |
| * <pre> |
| * regex ::= ('(?' options ')')? term ('|' term)* |
| * term ::= factor+ |
| * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? |
| * | '(?#' [^)]* ')' |
| * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' |
| * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] |
| * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' |
| * | '(?>' regex ')' | '(?' options ':' regex ')' |
| * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' |
| * options ::= [imsw]* ('-' [imsw]+)? |
| * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' |
| * looks ::= '(?=' regex ')' | '(?!' regex ')' |
| * | '(?<=' regex ')' | '(?<!' regex ')' |
| * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 |
| * category-block ::= '\' [pP] category-symbol-1 |
| * | ('\p{' | '\P{') (category-symbol | block-name |
| * | other-properties) '}' |
| * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' |
| * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo' |
| * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' |
| * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' |
| * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' |
| * | 'Sm' | 'Sc' | 'Sk' | 'So' |
| * block-name ::= (See above) |
| * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' |
| * character-1 ::= (any character except meta-characters) |
| * |
| * char-class ::= '[' ranges ']' |
| * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' |
| * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+ |
| * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block |
| * | range-char | range-char '-' range-char |
| * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 |
| * code-point ::= '\x' hex-char hex-char |
| * | '\x{' hex-char+ '}' |
| * <!-- | '\u005c u' hex-char hex-char hex-char hex-char |
| * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char |
| * hex-char ::= [0-9a-fA-F] |
| * character-2 ::= (any character except \[]-,) |
| * </pre> |
| * |
| * <hr width="50%"> |
| * <h3>TODO</h3> |
| * <ul> |
| * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a> |
| * <ul> |
| * <li>2.4 Canonical Equivalents |
| * <li>Level 3 |
| * </ul> |
| * <li>Parsing performance |
| * </ul> |
| * |
| * <hr width="50%"> |
| * |
| * @xerces.internal |
| * |
| * @author TAMURA Kent <kent@trl.ibm.co.jp> |
| */ |
| public class RegularExpression implements java.io.Serializable { |
| |
| private static final long serialVersionUID = 6242499334195006401L; |
| |
| static final boolean DEBUG = false; |
| |
| /** |
| * Compiles a token tree into an operation flow. |
| */ |
| private synchronized void compile(Token tok) { |
| if (this.operations != null) |
| return; |
| this.numberOfClosures = 0; |
| this.operations = this.compile(tok, null, false); |
| } |
| |
| /** |
| * Converts a token to an operation. |
| */ |
| private Op compile(Token tok, Op next, boolean reverse) { |
| Op ret; |
| switch (tok.type) { |
| case Token.DOT: |
| ret = Op.createDot(); |
| ret.next = next; |
| break; |
| |
| case Token.CHAR: |
| ret = Op.createChar(tok.getChar()); |
| ret.next = next; |
| break; |
| |
| case Token.ANCHOR: |
| ret = Op.createAnchor(tok.getChar()); |
| ret.next = next; |
| break; |
| |
| case Token.RANGE: |
| case Token.NRANGE: |
| ret = Op.createRange(tok); |
| ret.next = next; |
| break; |
| |
| case Token.CONCAT: |
| ret = next; |
| if (!reverse) { |
| for (int i = tok.size()-1; i >= 0; i --) { |
| ret = compile(tok.getChild(i), ret, false); |
| } |
| } else { |
| for (int i = 0; i < tok.size(); i ++) { |
| ret = compile(tok.getChild(i), ret, true); |
| } |
| } |
| break; |
| |
| case Token.UNION: |
| Op.UnionOp uni = Op.createUnion(tok.size()); |
| for (int i = 0; i < tok.size(); i ++) { |
| uni.addElement(compile(tok.getChild(i), next, reverse)); |
| } |
| ret = uni; // ret.next is null. |
| break; |
| |
| case Token.CLOSURE: |
| case Token.NONGREEDYCLOSURE: |
| Token child = tok.getChild(0); |
| int min = tok.getMin(); |
| int max = tok.getMax(); |
| if (min >= 0 && min == max) { // {n} |
| ret = next; |
| for (int i = 0; i < min; i ++) { |
| ret = compile(child, ret, reverse); |
| } |
| break; |
| } |
| if (min > 0 && max > 0) |
| max -= min; |
| if (max > 0) { |
| // X{2,6} -> XX(X(X(XX?)?)?)? |
| ret = next; |
| for (int i = 0; i < max; i ++) { |
| Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); |
| q.next = next; |
| q.setChild(compile(child, ret, reverse)); |
| ret = q; |
| } |
| } else { |
| Op.ChildOp op; |
| if (tok.type == Token.NONGREEDYCLOSURE) { |
| op = Op.createNonGreedyClosure(); |
| } else { // Token.CLOSURE |
| op = Op.createClosure(this.numberOfClosures++); |
| } |
| op.next = next; |
| op.setChild(compile(child, op, reverse)); |
| ret = op; |
| } |
| if (min > 0) { |
| for (int i = 0; i < min; i ++) { |
| ret = compile(child, ret, reverse); |
| } |
| } |
| break; |
| |
| case Token.EMPTY: |
| ret = next; |
| break; |
| |
| case Token.STRING: |
| ret = Op.createString(tok.getString()); |
| ret.next = next; |
| break; |
| |
| case Token.BACKREFERENCE: |
| ret = Op.createBackReference(tok.getReferenceNumber()); |
| ret.next = next; |
| break; |
| |
| case Token.PAREN: |
| if (tok.getParenNumber() == 0) { |
| ret = compile(tok.getChild(0), next, reverse); |
| } else if (reverse) { |
| next = Op.createCapture(tok.getParenNumber(), next); |
| next = compile(tok.getChild(0), next, reverse); |
| ret = Op.createCapture(-tok.getParenNumber(), next); |
| } else { |
| next = Op.createCapture(-tok.getParenNumber(), next); |
| next = compile(tok.getChild(0), next, reverse); |
| ret = Op.createCapture(tok.getParenNumber(), next); |
| } |
| break; |
| |
| case Token.LOOKAHEAD: |
| ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); |
| break; |
| case Token.NEGATIVELOOKAHEAD: |
| ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); |
| break; |
| case Token.LOOKBEHIND: |
| ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); |
| break; |
| case Token.NEGATIVELOOKBEHIND: |
| ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); |
| break; |
| |
| case Token.INDEPENDENT: |
| ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); |
| break; |
| |
| case Token.MODIFIERGROUP: |
| ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), |
| ((Token.ModifierToken)tok).getOptions(), |
| ((Token.ModifierToken)tok).getOptionsMask()); |
| break; |
| |
| case Token.CONDITION: |
| Token.ConditionToken ctok = (Token.ConditionToken)tok; |
| int ref = ctok.refNumber; |
| Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); |
| Op yes = compile(ctok.yes, next, reverse); |
| Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); |
| ret = Op.createCondition(next, ref, condition, yes, no); |
| break; |
| |
| default: |
| throw new RuntimeException("Unknown token type: "+tok.type); |
| } // switch (tok.type) |
| return ret; |
| } |
| |
| |
| //Public |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| * |
| * @return true if the target is matched to this regular expression. |
| */ |
| public boolean matches(char[] target) { |
| return this.matches(target, 0, target .length , (Match)null); |
| } |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| * in specified range or not. |
| * |
| * @param start Start offset of the range. |
| * @param end End offset +1 of the range. |
| * @return true if the target is matched to this regular expression. |
| */ |
| public boolean matches(char[] target, int start, int end) { |
| return this.matches(target, start, end, (Match)null); |
| } |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| * |
| * @param match A Match instance for storing matching result. |
| * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| */ |
| public boolean matches(char[] target, Match match) { |
| return this.matches(target, 0, target .length , match); |
| } |
| |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| * in specified range or not. |
| * |
| * @param start Start offset of the range. |
| * @param end End offset +1 of the range. |
| * @param match A Match instance for storing matching result. |
| * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| */ |
| public boolean matches(char[] target, int start, int end, Match match) { |
| |
| synchronized (this) { |
| if (this.operations == null) |
| this.prepare(); |
| if (this.context == null) |
| this.context = new Context(); |
| } |
| Context con = null; |
| synchronized (this.context) { |
| con = this.context.inuse ? new Context() : this.context; |
| con.reset(target, start, end, this.numberOfClosures); |
| } |
| if (match != null) { |
| match.setNumberOfGroups(this.nofparen); |
| match.setSource(target); |
| } else if (this.hasBackReferences) { |
| match = new Match(); |
| match.setNumberOfGroups(this.nofparen); |
| // Need not to call setSource() because |
| // a caller can not access this match instance. |
| } |
| con.match = match; |
| |
| if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { |
| int matchEnd = this. match(con, this.operations, con.start, 1, this.options); |
| //System.err.println("DEBUG: matchEnd="+matchEnd); |
| if (matchEnd == con.limit) { |
| if (con.match != null) { |
| con.match.setBeginning(0, con.start); |
| con.match.setEnd(0, matchEnd); |
| } |
| con.setInUse(false); |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * The pattern has only fixed string. |
| * The engine uses Boyer-Moore. |
| */ |
| if (this.fixedStringOnly) { |
| //System.err.println("DEBUG: fixed-only: "+this.fixedString); |
| int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| if (o >= 0) { |
| if (con.match != null) { |
| con.match.setBeginning(0, o); |
| con.match.setEnd(0, o+this.fixedString.length()); |
| } |
| con.setInUse(false); |
| return true; |
| } |
| con.setInUse(false); |
| return false; |
| } |
| |
| /* |
| * The pattern contains a fixed string. |
| * The engine checks with Boyer-Moore whether the text contains the fixed string or not. |
| * If not, it return with false. |
| */ |
| if (this.fixedString != null) { |
| int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| if (o < 0) { |
| //System.err.println("Non-match in fixed-string search."); |
| con.setInUse(false); |
| return false; |
| } |
| } |
| |
| int limit = con.limit-this.minlength; |
| int matchStart; |
| int matchEnd = -1; |
| |
| /* |
| * Checks whether the expression starts with ".*". |
| */ |
| if (this.operations != null |
| && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { |
| if (isSet(this.options, SINGLE_LINE)) { |
| matchStart = con.start; |
| matchEnd = this. match(con, this.operations, con.start, 1, this.options); |
| } else { |
| boolean previousIsEOL = true; |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| int ch = target [ matchStart ] ; |
| if (isEOLChar(ch)) { |
| previousIsEOL = true; |
| } else { |
| if (previousIsEOL) { |
| if (0 <= (matchEnd = this. match(con, this.operations, |
| matchStart, 1, this.options))) |
| break; |
| } |
| previousIsEOL = false; |
| } |
| } |
| } |
| } |
| |
| /* |
| * Optimization against the first character. |
| */ |
| else if (this.firstChar != null) { |
| //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); |
| RangeToken range = this.firstChar; |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| int ch = target [matchStart] ; |
| if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { |
| ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]); |
| } |
| if (!range.match(ch)) { |
| continue; |
| } |
| if (0 <= (matchEnd = this. match(con, this.operations, |
| matchStart, 1, this.options))) { |
| break; |
| } |
| } |
| } |
| |
| /* |
| * Straightforward matching. |
| */ |
| else { |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) |
| break; |
| } |
| } |
| |
| if (matchEnd >= 0) { |
| if (con.match != null) { |
| con.match.setBeginning(0, matchStart); |
| con.match.setEnd(0, matchEnd); |
| } |
| con.setInUse(false); |
| return true; |
| } else { |
| con.setInUse(false); |
| return false; |
| } |
| } |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| * |
| * @return true if the target is matched to this regular expression. |
| */ |
| public boolean matches(String target) { |
| return this.matches(target, 0, target .length() , (Match)null); |
| } |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| * in specified range or not. |
| * |
| * @param start Start offset of the range. |
| * @param end End offset +1 of the range. |
| * @return true if the target is matched to this regular expression. |
| */ |
| public boolean matches(String target, int start, int end) { |
| return this.matches(target, start, end, (Match)null); |
| } |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| * |
| * @param match A Match instance for storing matching result. |
| * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| */ |
| public boolean matches(String target, Match match) { |
| return this.matches(target, 0, target .length() , match); |
| } |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| * in specified range or not. |
| * |
| * @param start Start offset of the range. |
| * @param end End offset +1 of the range. |
| * @param match A Match instance for storing matching result. |
| * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| */ |
| public boolean matches(String target, int start, int end, Match match) { |
| |
| synchronized (this) { |
| if (this.operations == null) |
| this.prepare(); |
| if (this.context == null) |
| this.context = new Context(); |
| } |
| Context con = null; |
| synchronized (this.context) { |
| con = this.context.inuse ? new Context() : this.context; |
| con.reset(target, start, end, this.numberOfClosures); |
| } |
| if (match != null) { |
| match.setNumberOfGroups(this.nofparen); |
| match.setSource(target); |
| } else if (this.hasBackReferences) { |
| match = new Match(); |
| match.setNumberOfGroups(this.nofparen); |
| // Need not to call setSource() because |
| // a caller can not access this match instance. |
| } |
| con.match = match; |
| |
| if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { |
| if (DEBUG) { |
| System.err.println("target string="+target); |
| } |
| int matchEnd = this. match(con, this.operations, con.start, 1, this.options); |
| if (DEBUG) { |
| System.err.println("matchEnd="+matchEnd); |
| System.err.println("con.limit="+con.limit); |
| } |
| if (matchEnd == con.limit) { |
| if (con.match != null) { |
| con.match.setBeginning(0, con.start); |
| con.match.setEnd(0, matchEnd); |
| } |
| con.setInUse(false); |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * The pattern has only fixed string. |
| * The engine uses Boyer-Moore. |
| */ |
| if (this.fixedStringOnly) { |
| //System.err.println("DEBUG: fixed-only: "+this.fixedString); |
| int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| if (o >= 0) { |
| if (con.match != null) { |
| con.match.setBeginning(0, o); |
| con.match.setEnd(0, o+this.fixedString.length()); |
| } |
| con.setInUse(false); |
| return true; |
| } |
| con.setInUse(false); |
| return false; |
| } |
| |
| /* |
| * The pattern contains a fixed string. |
| * The engine checks with Boyer-Moore whether the text contains the fixed string or not. |
| * If not, it return with false. |
| */ |
| if (this.fixedString != null) { |
| int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| if (o < 0) { |
| //System.err.println("Non-match in fixed-string search."); |
| con.setInUse(false); |
| return false; |
| } |
| } |
| |
| int limit = con.limit-this.minlength; |
| int matchStart; |
| int matchEnd = -1; |
| |
| /* |
| * Checks whether the expression starts with ".*". |
| */ |
| if (this.operations != null |
| && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { |
| if (isSet(this.options, SINGLE_LINE)) { |
| matchStart = con.start; |
| matchEnd = this.match(con, this.operations, con.start, 1, this.options); |
| } else { |
| boolean previousIsEOL = true; |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| int ch = target .charAt( matchStart ) ; |
| if (isEOLChar(ch)) { |
| previousIsEOL = true; |
| } else { |
| if (previousIsEOL) { |
| if (0 <= (matchEnd = this.match(con, this.operations, |
| matchStart, 1, this.options))) |
| break; |
| } |
| previousIsEOL = false; |
| } |
| } |
| } |
| } |
| |
| /* |
| * Optimization against the first character. |
| */ |
| else if (this.firstChar != null) { |
| //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); |
| RangeToken range = this.firstChar; |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| int ch = target .charAt( matchStart ) ; |
| if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { |
| ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1)); |
| } |
| if (!range.match(ch)) { |
| continue; |
| } |
| if (0 <= (matchEnd = this.match(con, this.operations, |
| matchStart, 1, this.options))) { |
| break; |
| } |
| } |
| } |
| |
| /* |
| * Straightforward matching. |
| */ |
| else { |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options))) |
| break; |
| } |
| } |
| |
| if (matchEnd >= 0) { |
| if (con.match != null) { |
| con.match.setBeginning(0, matchStart); |
| con.match.setEnd(0, matchEnd); |
| } |
| con.setInUse(false); |
| return true; |
| } else { |
| con.setInUse(false); |
| return false; |
| } |
| } |
| |
| /** |
| * @return -1 when not match; offset of the end of matched string when match. |
| */ |
| @SuppressWarnings("fallthrough") |
| private int match(Context con, Op op, int offset, int dx, int opts) { |
| final ExpressionTarget target = con.target; |
| final Stack<Op> opStack = new Stack<>(); |
| final IntStack dataStack = new IntStack(); |
| final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE); |
| int retValue = -1; |
| boolean returned = false; |
| |
| for (;;) { |
| if (op == null || offset > con.limit || offset < con.start) { |
| if (op == null) { |
| retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; |
| } |
| else { |
| retValue = -1; |
| } |
| returned = true; |
| } |
| else { |
| retValue = -1; |
| // dx value is either 1 or -1 |
| switch (op.type) { |
| case Op.CHAR: |
| { |
| final int o1 = (dx > 0) ? offset : offset -1; |
| if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) { |
| returned = true; |
| break; |
| } |
| offset += dx; |
| op = op.next; |
| } |
| break; |
| |
| case Op.DOT: |
| { |
| int o1 = (dx > 0) ? offset : offset - 1; |
| if (o1 >= con.limit || o1 < 0) { |
| returned = true; |
| break; |
| } |
| if (isSet(opts, SINGLE_LINE)) { |
| if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) { |
| o1 += dx; |
| } |
| } |
| else { |
| int ch = target.charAt(o1); |
| if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) { |
| o1 += dx; |
| ch = REUtil.composeFromSurrogates(ch, target.charAt(o1)); |
| } |
| if (isEOLChar(ch)) { |
| returned = true; |
| break; |
| } |
| } |
| offset = (dx > 0) ? o1 + 1 : o1; |
| op = op.next; |
| } |
| break; |
| |
| case Op.RANGE: |
| case Op.NRANGE: |
| { |
| int o1 = (dx > 0) ? offset : offset -1; |
| if (o1 >= con.limit || o1 < 0) { |
| returned = true; |
| break; |
| } |
| int ch = target.charAt(offset); |
| if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) { |
| o1 += dx; |
| ch = REUtil.composeFromSurrogates(ch, target.charAt(o1)); |
| } |
| final RangeToken tok = op.getToken(); |
| if (!tok.match(ch)) { |
| returned = true; |
| break; |
| } |
| offset = (dx > 0) ? o1+1 : o1; |
| op = op.next; |
| } |
| break; |
| |
| case Op.ANCHOR: |
| { |
| if (!matchAnchor(target, op, con, offset, opts)) { |
| returned = true; |
| break; |
| } |
| op = op.next; |
| } |
| break; |
| |
| case Op.BACKREFERENCE: |
| { |
| int refno = op.getData(); |
| if (refno <= 0 || refno >= this.nofparen) { |
| throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); |
| } |
| if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) { |
| returned = true; |
| break; |
| } |
| int o2 = con.match.getBeginning(refno); |
| int literallen = con.match.getEnd(refno)-o2; |
| if (dx > 0) { |
| if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) { |
| returned = true; |
| break; |
| } |
| offset += literallen; |
| } |
| else { |
| if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) { |
| returned = true; |
| break; |
| } |
| offset -= literallen; |
| } |
| op = op.next; |
| } |
| break; |
| |
| case Op.STRING: |
| { |
| String literal = op.getString(); |
| int literallen = literal.length(); |
| if (dx > 0) { |
| if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) { |
| returned = true; |
| break; |
| } |
| offset += literallen; |
| } |
| else { |
| if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) { |
| returned = true; |
| break; |
| } |
| offset -= literallen; |
| } |
| op = op.next; |
| } |
| break; |
| |
| case Op.CLOSURE: |
| { |
| // Saves current position to avoid zero-width repeats. |
| final int id = op.getData(); |
| if (con.closureContexts[id].contains(offset)) { |
| returned = true; |
| break; |
| } |
| |
| con.closureContexts[id].addOffset(offset); |
| } |
| // fall through |
| |
| case Op.QUESTION: |
| { |
| opStack.push(op); |
| dataStack.push(offset); |
| op = op.getChild(); |
| } |
| break; |
| |
| case Op.NONGREEDYCLOSURE: |
| case Op.NONGREEDYQUESTION: |
| { |
| opStack.push(op); |
| dataStack.push(offset); |
| op = op.next; |
| } |
| break; |
| |
| case Op.UNION: |
| if (op.size() == 0) { |
| returned = true; |
| } |
| else { |
| opStack.push(op); |
| dataStack.push(0); |
| dataStack.push(offset); |
| op = op.elementAt(0); |
| } |
| break; |
| |
| case Op.CAPTURE: |
| { |
| final int refno = op.getData(); |
| if (con.match != null) { |
| if (refno > 0) { |
| dataStack.push(con.match.getBeginning(refno)); |
| con.match.setBeginning(refno, offset); |
| } |
| else { |
| final int index = -refno; |
| dataStack.push(con.match.getEnd(index)); |
| con.match.setEnd(index, offset); |
| } |
| opStack.push(op); |
| dataStack.push(offset); |
| } |
| op = op.next; |
| } |
| break; |
| |
| case Op.LOOKAHEAD: |
| case Op.NEGATIVELOOKAHEAD: |
| case Op.LOOKBEHIND: |
| case Op.NEGATIVELOOKBEHIND: |
| { |
| opStack.push(op); |
| dataStack.push(dx); |
| dataStack.push(offset); |
| dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1; |
| op = op.getChild(); |
| } |
| break; |
| |
| case Op.INDEPENDENT: |
| { |
| opStack.push(op); |
| dataStack.push(offset); |
| op = op.getChild(); |
| } |
| break; |
| |
| case Op.MODIFIER: |
| { |
| int localopts = opts; |
| localopts |= op.getData(); |
| localopts &= ~op.getData2(); |
| opStack.push(op); |
| dataStack.push(opts); |
| dataStack.push(offset); |
| opts = localopts; |
| op = op.getChild(); |
| } |
| break; |
| |
| case Op.CONDITION: |
| { |
| Op.ConditionOp cop = (Op.ConditionOp)op; |
| if (cop.refNumber > 0) { |
| if (cop.refNumber >= this.nofparen) { |
| throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); |
| } |
| if (con.match.getBeginning(cop.refNumber) >= 0 |
| && con.match.getEnd(cop.refNumber) >= 0) { |
| op = cop.yes; |
| } |
| else if (cop.no != null) { |
| op = cop.no; |
| } |
| else { |
| op = cop.next; |
| } |
| } |
| else { |
| opStack.push(op); |
| dataStack.push(offset); |
| op = cop.condition; |
| } |
| } |
| break; |
| |
| default: |
| throw new RuntimeException("Unknown operation type: " + op.type); |
| } |
| } |
| |
| // handle recursive operations |
| while (returned) { |
| // exhausted all the operations |
| if (opStack.isEmpty()) { |
| return retValue; |
| } |
| |
| op = opStack.pop(); |
| offset = dataStack.pop(); |
| |
| switch (op.type) { |
| case Op.CLOSURE: |
| case Op.QUESTION: |
| if (retValue < 0) { |
| op = op.next; |
| returned = false; |
| } |
| break; |
| |
| case Op.NONGREEDYCLOSURE: |
| case Op.NONGREEDYQUESTION: |
| if (retValue < 0) { |
| op = op.getChild(); |
| returned = false; |
| } |
| break; |
| |
| case Op.UNION: |
| { |
| int unionIndex = dataStack.pop(); |
| if (DEBUG) { |
| System.err.println("UNION: "+unionIndex+", ret="+retValue); |
| } |
| |
| if (retValue < 0) { |
| if (++unionIndex < op.size()) { |
| opStack.push(op); |
| dataStack.push(unionIndex); |
| dataStack.push(offset); |
| op = op.elementAt(unionIndex); |
| returned = false; |
| } |
| else { |
| retValue = -1; |
| } |
| } |
| } |
| break; |
| |
| case Op.CAPTURE: |
| final int refno = op.getData(); |
| final int saved = dataStack.pop(); |
| if (retValue < 0) { |
| if (refno > 0) { |
| con.match.setBeginning(refno, saved); |
| } |
| else { |
| con.match.setEnd(-refno, saved); |
| } |
| } |
| break; |
| |
| case Op.LOOKAHEAD: |
| case Op.LOOKBEHIND: |
| { |
| dx = dataStack.pop(); |
| if (0 <= retValue) { |
| op = op.next; |
| returned = false; |
| } |
| retValue = -1; |
| } |
| break; |
| |
| case Op.NEGATIVELOOKAHEAD: |
| case Op.NEGATIVELOOKBEHIND: |
| { |
| dx = dataStack.pop(); |
| if (0 > retValue) { |
| op = op.next; |
| returned = false; |
| } |
| retValue = -1; |
| } |
| break; |
| |
| case Op.MODIFIER: |
| opts = dataStack.pop(); |
| // fall through |
| |
| case Op.INDEPENDENT: |
| if (retValue >= 0) { |
| offset = retValue; |
| op = op.next; |
| returned = false; |
| } |
| break; |
| |
| case Op.CONDITION: |
| { |
| final Op.ConditionOp cop = (Op.ConditionOp)op; |
| if (0 <= retValue) { |
| op = cop.yes; |
| } |
| else if (cop.no != null) { |
| op = cop.no; |
| } |
| else { |
| op = cop.next; |
| } |
| } |
| returned = false; |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| } |
| |
| private boolean matchChar(int ch, int other, boolean ignoreCase) { |
| return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other; |
| } |
| |
| boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) { |
| boolean go = false; |
| switch (op.getData()) { |
| case '^': |
| if (isSet(opts, MULTIPLE_LINES)) { |
| if (!(offset == con.start |
| || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1)))) |
| return false; |
| } else { |
| if (offset != con.start) |
| return false; |
| } |
| break; |
| |
| case '@': // Internal use only. |
| // The @ always matches line beginnings. |
| if (!(offset == con.start |
| || offset > con.start && isEOLChar(target.charAt(offset-1)))) |
| return false; |
| break; |
| |
| case '$': |
| if (isSet(opts, MULTIPLE_LINES)) { |
| if (!(offset == con.limit |
| || offset < con.limit && isEOLChar(target.charAt(offset)))) |
| return false; |
| } else { |
| if (!(offset == con.limit |
| || offset+1 == con.limit && isEOLChar(target.charAt(offset)) |
| || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN |
| && target.charAt(offset+1) == LINE_FEED)) |
| return false; |
| } |
| break; |
| |
| case 'A': |
| if (offset != con.start) return false; |
| break; |
| |
| case 'Z': |
| if (!(offset == con.limit |
| || offset+1 == con.limit && isEOLChar(target.charAt(offset)) |
| || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN |
| && target.charAt(offset+1) == LINE_FEED)) |
| return false; |
| break; |
| |
| case 'z': |
| if (offset != con.limit) return false; |
| break; |
| |
| case 'b': |
| if (con.length == 0) |
| return false; |
| { |
| int after = getWordType(target, con.start, con.limit, offset, opts); |
| if (after == WT_IGNORE) return false; |
| int before = getPreviousWordType(target, con.start, con.limit, offset, opts); |
| if (after == before) return false; |
| } |
| break; |
| |
| case 'B': |
| if (con.length == 0) |
| go = true; |
| else { |
| int after = getWordType(target, con.start, con.limit, offset, opts); |
| go = after == WT_IGNORE |
| || after == getPreviousWordType(target, con.start, con.limit, offset, opts); |
| } |
| if (!go) return false; |
| break; |
| |
| case '<': |
| if (con.length == 0 || offset == con.limit) return false; |
| if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER |
| || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) |
| return false; |
| break; |
| |
| case '>': |
| if (con.length == 0 || offset == con.start) return false; |
| if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER |
| || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) |
| return false; |
| break; |
| } // switch anchor type |
| |
| return true; |
| } |
| |
| private static final int getPreviousWordType(ExpressionTarget target, int begin, int end, |
| int offset, int opts) { |
| int ret = getWordType(target, begin, end, --offset, opts); |
| while (ret == WT_IGNORE) |
| ret = getWordType(target, begin, end, --offset, opts); |
| return ret; |
| } |
| |
| private static final int getWordType(ExpressionTarget target, int begin, int end, |
| int offset, int opts) { |
| if (offset < begin || offset >= end) return WT_OTHER; |
| return getWordType0(target.charAt(offset) , opts); |
| } |
| |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| * |
| * @return true if the target is matched to this regular expression. |
| */ |
| public boolean matches(CharacterIterator target) { |
| return this.matches(target, (Match)null); |
| } |
| |
| |
| /** |
| * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| * |
| * @param match A Match instance for storing matching result. |
| * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| */ |
| public boolean matches(CharacterIterator target, Match match) { |
| int start = target.getBeginIndex(); |
| int end = target.getEndIndex(); |
| |
| |
| |
| synchronized (this) { |
| if (this.operations == null) |
| this.prepare(); |
| if (this.context == null) |
| this.context = new Context(); |
| } |
| Context con = null; |
| synchronized (this.context) { |
| con = this.context.inuse ? new Context() : this.context; |
| con.reset(target, start, end, this.numberOfClosures); |
| } |
| if (match != null) { |
| match.setNumberOfGroups(this.nofparen); |
| match.setSource(target); |
| } else if (this.hasBackReferences) { |
| match = new Match(); |
| match.setNumberOfGroups(this.nofparen); |
| // Need not to call setSource() because |
| // a caller can not access this match instance. |
| } |
| con.match = match; |
| |
| if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { |
| int matchEnd = this.match(con, this.operations, con.start, 1, this.options); |
| //System.err.println("DEBUG: matchEnd="+matchEnd); |
| if (matchEnd == con.limit) { |
| if (con.match != null) { |
| con.match.setBeginning(0, con.start); |
| con.match.setEnd(0, matchEnd); |
| } |
| con.setInUse(false); |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * The pattern has only fixed string. |
| * The engine uses Boyer-Moore. |
| */ |
| if (this.fixedStringOnly) { |
| //System.err.println("DEBUG: fixed-only: "+this.fixedString); |
| int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| if (o >= 0) { |
| if (con.match != null) { |
| con.match.setBeginning(0, o); |
| con.match.setEnd(0, o+this.fixedString.length()); |
| } |
| con.setInUse(false); |
| return true; |
| } |
| con.setInUse(false); |
| return false; |
| } |
| |
| /* |
| * The pattern contains a fixed string. |
| * The engine checks with Boyer-Moore whether the text contains the fixed string or not. |
| * If not, it return with false. |
| */ |
| if (this.fixedString != null) { |
| int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| if (o < 0) { |
| //System.err.println("Non-match in fixed-string search."); |
| con.setInUse(false); |
| return false; |
| } |
| } |
| |
| int limit = con.limit-this.minlength; |
| int matchStart; |
| int matchEnd = -1; |
| |
| /* |
| * Checks whether the expression starts with ".*". |
| */ |
| if (this.operations != null |
| && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { |
| if (isSet(this.options, SINGLE_LINE)) { |
| matchStart = con.start; |
| matchEnd = this.match(con, this.operations, con.start, 1, this.options); |
| } else { |
| boolean previousIsEOL = true; |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| int ch = target .setIndex( matchStart ) ; |
| if (isEOLChar(ch)) { |
| previousIsEOL = true; |
| } else { |
| if (previousIsEOL) { |
| if (0 <= (matchEnd = this.match(con, this.operations, |
| matchStart, 1, this.options))) |
| break; |
| } |
| previousIsEOL = false; |
| } |
| } |
| } |
| } |
| |
| /* |
| * Optimization against the first character. |
| */ |
| else if (this.firstChar != null) { |
| //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); |
| RangeToken range = this.firstChar; |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| int ch = target .setIndex( matchStart ) ; |
| if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { |
| ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1)); |
| } |
| if (!range.match(ch)) { |
| continue; |
| } |
| if (0 <= (matchEnd = this.match(con, this.operations, |
| matchStart, 1, this.options))) { |
| break; |
| } |
| } |
| } |
| |
| /* |
| * Straightforward matching. |
| */ |
| else { |
| for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) |
| break; |
| } |
| } |
| |
| if (matchEnd >= 0) { |
| if (con.match != null) { |
| con.match.setBeginning(0, matchStart); |
| con.match.setEnd(0, matchEnd); |
| } |
| con.setInUse(false); |
| return true; |
| } else { |
| con.setInUse(false); |
| return false; |
| } |
| } |
| |
| // ================================================================ |
| |
| /** |
| * A regular expression. |
| * @serial |
| */ |
| String regex; |
| /** |
| * @serial |
| */ |
| int options; |
| |
| /** |
| * The number of parenthesis in the regular expression. |
| * @serial |
| */ |
| int nofparen; |
| /** |
| * Internal representation of the regular expression. |
| * @serial |
| */ |
| Token tokentree; |
| |
| boolean hasBackReferences = false; |
| |
| transient int minlength; |
| transient Op operations = null; |
| transient int numberOfClosures; |
| transient Context context = null; |
| transient RangeToken firstChar = null; |
| |
| transient String fixedString = null; |
| transient int fixedStringOptions; |
| transient BMPattern fixedStringTable = null; |
| transient boolean fixedStringOnly = false; |
| |
| static abstract class ExpressionTarget { |
| abstract char charAt(int index); |
| abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen); |
| abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen); |
| } |
| |
| static final class StringTarget extends ExpressionTarget { |
| |
| private String target; |
| |
| StringTarget(String target) { |
| this.target = target; |
| } |
| |
| final void resetTarget(String target) { |
| this.target = target; |
| } |
| |
| final char charAt(int index) { |
| return target.charAt(index); |
| } |
| |
| final boolean regionMatches(boolean ignoreCase, int offset, int limit, |
| String part, int partlen) { |
| if (limit-offset < partlen) { |
| return false; |
| } |
| return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen); |
| } |
| |
| final boolean regionMatches(boolean ignoreCase, int offset, int limit, |
| int offset2, int partlen) { |
| if (limit-offset < partlen) { |
| return false; |
| } |
| return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen) |
| : target.regionMatches(offset, target, offset2, partlen); |
| } |
| } |
| |
| static final class CharArrayTarget extends ExpressionTarget { |
| |
| char[] target; |
| |
| CharArrayTarget(char[] target) { |
| this.target = target; |
| } |
| |
| final void resetTarget(char[] target) { |
| this.target = target; |
| } |
| |
| char charAt(int index) { |
| return target[index]; |
| } |
| |
| final boolean regionMatches(boolean ignoreCase, int offset, int limit, |
| String part, int partlen) { |
| if (offset < 0 || limit-offset < partlen) { |
| return false; |
| } |
| return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen) |
| : regionMatches(offset, limit, part, partlen); |
| } |
| |
| private final boolean regionMatches(int offset, int limit, String part, int partlen) { |
| int i = 0; |
| while (partlen-- > 0) { |
| if (target[offset++] != part.charAt(i++)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) { |
| int i = 0; |
| while (partlen-- > 0) { |
| final char ch1 = target[offset++] ; |
| final char ch2 = part.charAt(i++); |
| if (ch1 == ch2) { |
| continue; |
| } |
| final char uch1 = Character.toUpperCase(ch1); |
| final char uch2 = Character.toUpperCase(ch2); |
| if (uch1 == uch2) { |
| continue; |
| } |
| if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) { |
| if (offset < 0 || limit-offset < partlen) { |
| return false; |
| } |
| return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen) |
| : regionMatches(offset, limit, offset2, partlen); |
| } |
| |
| private final boolean regionMatches(int offset, int limit, int offset2, int partlen) { |
| int i = offset2; |
| while (partlen-- > 0) { |
| if ( target [ offset++ ] != target [ i++ ] ) |
| return false; |
| } |
| return true; |
| } |
| |
| private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) { |
| int i = offset2; |
| while (partlen-- > 0) { |
| final char ch1 = target[offset++] ; |
| final char ch2 = target[i++] ; |
| if (ch1 == ch2) { |
| continue; |
| } |
| final char uch1 = Character.toUpperCase(ch1); |
| final char uch2 = Character.toUpperCase(ch2); |
| if (uch1 == uch2) { |
| continue; |
| } |
| if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| } |
| |
| static final class CharacterIteratorTarget extends ExpressionTarget { |
| CharacterIterator target; |
| |
| CharacterIteratorTarget(CharacterIterator target) { |
| this.target = target; |
| } |
| |
| final void resetTarget(CharacterIterator target) { |
| this.target = target; |
| } |
| |
| final char charAt(int index) { |
| return target.setIndex(index); |
| } |
| |
| final boolean regionMatches(boolean ignoreCase, int offset, int limit, |
| String part, int partlen) { |
| if (offset < 0 || limit-offset < partlen) { |
| return false; |
| } |
| return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen) |
| : regionMatches(offset, limit, part, partlen); |
| } |
| |
| private final boolean regionMatches(int offset, int limit, String part, int partlen) { |
| int i = 0; |
| while (partlen-- > 0) { |
| if (target.setIndex(offset++) != part.charAt(i++)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) { |
| int i = 0; |
| while (partlen-- > 0) { |
| final char ch1 = target.setIndex(offset++) ; |
| final char ch2 = part.charAt(i++); |
| if (ch1 == ch2) { |
| continue; |
| } |
| final char uch1 = Character.toUpperCase(ch1); |
| final char uch2 = Character.toUpperCase(ch2); |
| if (uch1 == uch2) { |
| continue; |
| } |
| if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) { |
| if (offset < 0 || limit-offset < partlen) { |
| return false; |
| } |
| return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen) |
| : regionMatches(offset, limit, offset2, partlen); |
| } |
| |
| private final boolean regionMatches(int offset, int limit, int offset2, int partlen) { |
| int i = offset2; |
| while (partlen-- > 0) { |
| if (target.setIndex(offset++) != target.setIndex(i++)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) { |
| int i = offset2; |
| while (partlen-- > 0) { |
| final char ch1 = target.setIndex(offset++) ; |
| final char ch2 = target.setIndex(i++) ; |
| if (ch1 == ch2) { |
| continue; |
| } |
| final char uch1 = Character.toUpperCase(ch1); |
| final char uch2 = Character.toUpperCase(ch2); |
| if (uch1 == uch2) { |
| continue; |
| } |
| if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| } |
| |
| static final class ClosureContext { |
| |
| int[] offsets = new int[4]; |
| int currentIndex = 0; |
| |
| boolean contains(int offset) { |
| for (int i=0; i<currentIndex;++i) { |
| if (offsets[i] == offset) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| void reset() { |
| currentIndex = 0; |
| } |
| |
| void addOffset(int offset) { |
| // We do not check for duplicates, caller is responsible for that |
| if (currentIndex == offsets.length) { |
| offsets = expandOffsets(); |
| } |
| offsets[currentIndex++] = offset; |
| } |
| |
| private int[] expandOffsets() { |
| final int len = offsets.length; |
| final int newLen = len << 1; |
| int[] newOffsets = new int[newLen]; |
| |
| System.arraycopy(offsets, 0, newOffsets, 0, currentIndex); |
| return newOffsets; |
| } |
| } |
| |
| static final class Context { |
| int start; |
| int limit; |
| int length; |
| Match match; |
| boolean inuse = false; |
| ClosureContext[] closureContexts; |
| |
| private StringTarget stringTarget; |
| private CharArrayTarget charArrayTarget; |
| private CharacterIteratorTarget characterIteratorTarget; |
| |
| ExpressionTarget target; |
| |
| Context() { |
| } |
| |
| private void resetCommon(int nofclosures) { |
| this.length = this.limit-this.start; |
| setInUse(true); |
| this.match = null; |
| if (this.closureContexts == null || this.closureContexts.length != nofclosures) { |
| this.closureContexts = new ClosureContext[nofclosures]; |
| } |
| for (int i = 0; i < nofclosures; i ++) { |
| if (this.closureContexts[i] == null) { |
| this.closureContexts[i] = new ClosureContext(); |
| } |
| else { |
| this.closureContexts[i].reset(); |
| } |
| } |
| } |
| |
| void reset(CharacterIterator target, int start, int limit, int nofclosures) { |
| if (characterIteratorTarget == null) { |
| characterIteratorTarget = new CharacterIteratorTarget(target); |
| } |
| else { |
| characterIteratorTarget.resetTarget(target); |
| } |
| this.target = characterIteratorTarget; |
| this.start = start; |
| this.limit = limit; |
| this.resetCommon(nofclosures); |
| } |
| |
| void reset(String target, int start, int limit, int nofclosures) { |
| if (stringTarget == null) { |
| stringTarget = new StringTarget(target); |
| } |
| else { |
| stringTarget.resetTarget(target); |
| } |
| this.target = stringTarget; |
| this.start = start; |
| this.limit = limit; |
| this.resetCommon(nofclosures); |
| } |
| |
| void reset(char[] target, int start, int limit, int nofclosures) { |
| if (charArrayTarget == null) { |
| charArrayTarget = new CharArrayTarget(target); |
| } |
| else { |
| charArrayTarget.resetTarget(target); |
| } |
| this.target = charArrayTarget; |
| this.start = start; |
| this.limit = limit; |
| this.resetCommon(nofclosures); |
| } |
| synchronized void setInUse(boolean inUse) { |
| this.inuse = inUse; |
| } |
| } |
| |
| /** |
| * Prepares for matching. This method is called just before starting matching. |
| */ |
| void prepare() { |
| if (Op.COUNT) Op.nofinstances = 0; |
| this.compile(this.tokentree); |
| /* |
| if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .* |
| Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@'); |
| anchor.next = this.operations; |
| this.operations = anchor; |
| } |
| */ |
| if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); |
| |
| this.minlength = this.tokentree.getMinLength(); |
| |
| this.firstChar = null; |
| if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) |
| && !isSet(this.options, XMLSCHEMA_MODE)) { |
| RangeToken firstChar = Token.createRange(); |
| int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); |
| if (fresult == Token.FC_TERMINAL) { |
| firstChar.compactRanges(); |
| this.firstChar = firstChar; |
| if (DEBUG) |
| System.err.println("DEBUG: Use the first character optimization: "+firstChar); |
| } |
| } |
| |
| if (this.operations != null |
| && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) |
| && this.operations.next == null) { |
| if (DEBUG) |
| System.err.print(" *** Only fixed string! *** "); |
| this.fixedStringOnly = true; |
| if (this.operations.type == Op.STRING) |
| this.fixedString = this.operations.getString(); |
| else if (this.operations.getData() >= 0x10000) { // Op.CHAR |
| this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); |
| } else { |
| char[] ac = new char[1]; |
| ac[0] = (char)this.operations.getData(); |
| this.fixedString = new String(ac); |
| } |
| this.fixedStringOptions = this.options; |
| this.fixedStringTable = new BMPattern(this.fixedString, 256, |
| isSet(this.fixedStringOptions, IGNORE_CASE)); |
| } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) |
| && !isSet(this.options, XMLSCHEMA_MODE)) { |
| Token.FixedStringContainer container = new Token.FixedStringContainer(); |
| this.tokentree.findFixedString(container, this.options); |
| this.fixedString = container.token == null ? null : container.token.getString(); |
| this.fixedStringOptions = container.options; |
| if (this.fixedString != null && this.fixedString.length() < 2) |
| this.fixedString = null; |
| // This pattern has a fixed string of which length is more than one. |
| if (this.fixedString != null) { |
| this.fixedStringTable = new BMPattern(this.fixedString, 256, |
| isSet(this.fixedStringOptions, IGNORE_CASE)); |
| if (DEBUG) { |
| System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() |
| +"/" //+this.fixedString |
| +"/"+REUtil.createOptionString(this.fixedStringOptions)); |
| System.err.print("String: "); |
| REUtil.dumpString(this.fixedString); |
| } |
| } |
| } |
| } |
| |
| /** |
| * An option. |
| * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span> |
| * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span> |
| * does not capture. |
| * |
| * @see #RegularExpression(java.lang.String,int) |
| * @see #setPattern(java.lang.String,int) |
| static final int MARK_PARENS = 1<<0; |
| */ |
| |
| /** |
| * "i" |
| */ |
| static final int IGNORE_CASE = 1<<1; |
| |
| /** |
| * "s" |
| */ |
| static final int SINGLE_LINE = 1<<2; |
| |
| /** |
| * "m" |
| */ |
| static final int MULTIPLE_LINES = 1<<3; |
| |
| /** |
| * "x" |
| */ |
| static final int EXTENDED_COMMENT = 1<<4; |
| |
| /** |
| * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>. |
| * |
| * @see #RegularExpression(java.lang.String,int) |
| * @see #setPattern(java.lang.String,int) |
| * @see #UNICODE_WORD_BOUNDARY |
| */ |
| static final int USE_UNICODE_CATEGORY = 1<<5; // "u" |
| |
| /** |
| * An option. |
| * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>. |
| * <p>By default, the engine considers a position between a word character |
| * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character |
| * is a word boundary. |
| * <p>By this option, the engine checks word boundaries with the method of |
| * 'Unicode Regular Expression Guidelines' Revision 4. |
| * |
| * @see #RegularExpression(java.lang.String,int) |
| * @see #setPattern(java.lang.String,int) |
| */ |
| static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" |
| |
| /** |
| * "H" |
| */ |
| static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; |
| /** |
| * "F" |
| */ |
| static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; |
| /** |
| * "X". XML Schema mode. |
| */ |
| static final int XMLSCHEMA_MODE = 1<<9; |
| /** |
| * ",". |
| */ |
| static final int SPECIAL_COMMA = 1<<10; |
| |
| |
| private static final boolean isSet(int options, int flag) { |
| return (options & flag) == flag; |
| } |
| |
| /** |
| * Creates a new RegularExpression instance. |
| * |
| * @param regex A regular expression |
| * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. |
| */ |
| public RegularExpression(String regex) throws ParseException { |
| this(regex, null); |
| } |
| |
| /** |
| * Creates a new RegularExpression instance with options. |
| * |
| * @param regex A regular expression |
| * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" |
| * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. |
| */ |
| public RegularExpression(String regex, String options) throws ParseException { |
| this.setPattern(regex, options); |
| } |
| |
| /** |
| * Creates a new RegularExpression instance with options. |
| * |
| * @param regex A regular expression |
| * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" |
| * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. |
| */ |
| public RegularExpression(String regex, String options, Locale locale) throws ParseException { |
| this.setPattern(regex, options, locale); |
| } |
| |
| RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) { |
| this.regex = regex; |
| this.tokentree = tok; |
| this.nofparen = parens; |
| this.options = options; |
| this.hasBackReferences = hasBackReferences; |
| } |
| |
| /** |
| * |
| */ |
| public void setPattern(String newPattern) throws ParseException { |
| this.setPattern(newPattern, Locale.getDefault()); |
| } |
| |
| public void setPattern(String newPattern, Locale locale) throws ParseException { |
| this.setPattern(newPattern, this.options, locale); |
| } |
| |
| private void setPattern(String newPattern, int options, Locale locale) throws ParseException { |
| this.regex = newPattern; |
| this.options = options; |
| RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) |
| ? new ParserForXMLSchema(locale) : new RegexParser(locale); |
| this.tokentree = rp.parse(this.regex, this.options); |
| this.nofparen = rp.parennumber; |
| this.hasBackReferences = rp.hasBackReferences; |
| |
| this.operations = null; |
| this.context = null; |
| } |
| /** |
| * |
| */ |
| public void setPattern(String newPattern, String options) throws ParseException { |
| this.setPattern(newPattern, options, Locale.getDefault()); |
| } |
| |
| public void setPattern(String newPattern, String options, Locale locale) throws ParseException { |
| this.setPattern(newPattern, REUtil.parseOptions(options), locale); |
| } |
| |
| /** |
| * |
| */ |
| public String getPattern() { |
| return this.regex; |
| } |
| |
| /** |
| * Represents this instence in String. |
| */ |
| public String toString() { |
| return this.tokentree.toString(this.options); |
| } |
| |
| /** |
| * Returns a option string. |
| * The order of letters in it may be different from a string specified |
| * in a constructor or <code>setPattern()</code>. |
| * |
| * @see #RegularExpression(java.lang.String,java.lang.String) |
| * @see #setPattern(java.lang.String,java.lang.String) |
| */ |
| public String getOptions() { |
| return REUtil.createOptionString(this.options); |
| } |
| |
| /** |
| * Return true if patterns are the same and the options are equivalent. |
| */ |
| public boolean equals(Object obj) { |
| if (obj == null) return false; |
| if (!(obj instanceof RegularExpression)) |
| return false; |
| RegularExpression r = (RegularExpression)obj; |
| return this.regex.equals(r.regex) && this.options == r.options; |
| } |
| |
| boolean equals(String pattern, int options) { |
| return this.regex.equals(pattern) && this.options == options; |
| } |
| |
| /** |
| * |
| */ |
| public int hashCode() { |
| return (this.regex+"/"+this.getOptions()).hashCode(); |
| } |
| |
| /** |
| * Return the number of regular expression groups. |
| * This method returns 1 when the regular expression has no capturing-parenthesis. |
| * |
| */ |
| public int getNumberOfGroups() { |
| return this.nofparen; |
| } |
| |
| // ================================================================ |
| |
| private static final int WT_IGNORE = 0; |
| private static final int WT_LETTER = 1; |
| private static final int WT_OTHER = 2; |
| private static final int getWordType0(char ch, int opts) { |
| if (!isSet(opts, UNICODE_WORD_BOUNDARY)) { |
| if (isSet(opts, USE_UNICODE_CATEGORY)) { |
| return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER; |
| } |
| return isWordChar(ch) ? WT_LETTER : WT_OTHER; |
| } |
| |
| switch (Character.getType(ch)) { |
| case Character.UPPERCASE_LETTER: // L |
| case Character.LOWERCASE_LETTER: // L |
| case Character.TITLECASE_LETTER: // L |
| case Character.MODIFIER_LETTER: // L |
| case Character.OTHER_LETTER: // L |
| case Character.LETTER_NUMBER: // N |
| case Character.DECIMAL_DIGIT_NUMBER: // N |
| case Character.OTHER_NUMBER: // N |
| case Character.COMBINING_SPACING_MARK: // Mc |
| return WT_LETTER; |
| |
| case Character.FORMAT: // Cf |
| case Character.NON_SPACING_MARK: // Mn |
| case Character.ENCLOSING_MARK: // Mc |
| return WT_IGNORE; |
| |
| case Character.CONTROL: // Cc |
| switch (ch) { |
| case '\t': |
| case '\n': |
| case '\u000B': |
| case '\f': |
| case '\r': |
| return WT_OTHER; |
| default: |
| return WT_IGNORE; |
| } |
| |
| default: |
| return WT_OTHER; |
| } |
| } |
| |
| // ================================================================ |
| |
| static final int LINE_FEED = 0x000A; |
| static final int CARRIAGE_RETURN = 0x000D; |
| static final int LINE_SEPARATOR = 0x2028; |
| static final int PARAGRAPH_SEPARATOR = 0x2029; |
| |
| private static final boolean isEOLChar(int ch) { |
| return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR |
| || ch == PARAGRAPH_SEPARATOR; |
| } |
| |
| private static final boolean isWordChar(int ch) { // Legacy word characters |
| if (ch == '_') return true; |
| if (ch < '0') return false; |
| if (ch > 'z') return false; |
| if (ch <= '9') return true; |
| if (ch < 'A') return false; |
| if (ch <= 'Z') return true; |
| if (ch < 'a') return false; |
| return true; |
| } |
| |
| private static final boolean matchIgnoreCase(int chardata, int ch) { |
| if (chardata == ch) return true; |
| if (chardata > 0xffff || ch > 0xffff) return false; |
| char uch1 = Character.toUpperCase((char)chardata); |
| char uch2 = Character.toUpperCase((char)ch); |
| if (uch1 == uch2) return true; |
| return Character.toLowerCase(uch1) == Character.toLowerCase(uch2); |
| } |
| } |