blob: 5dc6caa19d2f2c422e0edf7dfe6cfe2c84bc1c0f [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 1999-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26package java.util.regex;
27
28
29/**
30 * An engine that performs match operations on a {@linkplain java.lang.CharSequence
31 * character sequence} by interpreting a {@link Pattern}.
32 *
33 * <p> A matcher is created from a pattern by invoking the pattern's {@link
34 * Pattern#matcher matcher} method. Once created, a matcher can be used to
35 * perform three different kinds of match operations:
36 *
37 * <ul>
38 *
39 * <li><p> The {@link #matches matches} method attempts to match the entire
40 * input sequence against the pattern. </p></li>
41 *
42 * <li><p> The {@link #lookingAt lookingAt} method attempts to match the
43 * input sequence, starting at the beginning, against the pattern. </p></li>
44 *
45 * <li><p> The {@link #find find} method scans the input sequence looking for
46 * the next subsequence that matches the pattern. </p></li>
47 *
48 * </ul>
49 *
50 * <p> Each of these methods returns a boolean indicating success or failure.
51 * More information about a successful match can be obtained by querying the
52 * state of the matcher.
53 *
54 * <p> A matcher finds matches in a subset of its input called the
55 * <i>region</i>. By default, the region contains all of the matcher's input.
56 * The region can be modified via the {@link #region region} method and queried
57 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
58 * methods. The way that the region boundaries interact with some pattern
59 * constructs can be changed. See {@link #useAnchoringBounds
60 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
61 * for more details.
62 *
63 * <p> This class also defines methods for replacing matched subsequences with
64 * new strings whose contents can, if desired, be computed from the match
65 * result. The {@link #appendReplacement appendReplacement} and {@link
66 * #appendTail appendTail} methods can be used in tandem in order to collect
67 * the result into an existing string buffer, or the more convenient {@link
68 * #replaceAll replaceAll} method can be used to create a string in which every
69 * matching subsequence in the input sequence is replaced.
70 *
71 * <p> The explicit state of a matcher includes the start and end indices of
72 * the most recent successful match. It also includes the start and end
73 * indices of the input subsequence captured by each <a
74 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
75 * count of such subsequences. As a convenience, methods are also provided for
76 * returning these captured subsequences in string form.
77 *
78 * <p> The explicit state of a matcher is initially undefined; attempting to
79 * query any part of it before a successful match will cause an {@link
80 * IllegalStateException} to be thrown. The explicit state of a matcher is
81 * recomputed by every match operation.
82 *
83 * <p> The implicit state of a matcher includes the input character sequence as
84 * well as the <i>append position</i>, which is initially zero and is updated
85 * by the {@link #appendReplacement appendReplacement} method.
86 *
87 * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
88 * method or, if a new input sequence is desired, its {@link
89 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
90 * matcher discards its explicit state information and sets the append position
91 * to zero.
92 *
93 * <p> Instances of this class are not safe for use by multiple concurrent
94 * threads. </p>
95 *
96 *
97 * @author Mike McCloskey
98 * @author Mark Reinhold
99 * @author JSR-51 Expert Group
100 * @since 1.4
101 * @spec JSR-51
102 */
103
104public final class Matcher implements MatchResult {
105
106 /**
107 * The Pattern object that created this Matcher.
108 */
109 Pattern parentPattern;
110
111 /**
112 * The storage used by groups. They may contain invalid values if
113 * a group was skipped during the matching.
114 */
115 int[] groups;
116
117 /**
118 * The range within the sequence that is to be matched. Anchors
119 * will match at these "hard" boundaries. Changing the region
120 * changes these values.
121 */
122 int from, to;
123
124 /**
125 * Lookbehind uses this value to ensure that the subexpression
126 * match ends at the point where the lookbehind was encountered.
127 */
128 int lookbehindTo;
129
130 /**
131 * The original string being matched.
132 */
133 CharSequence text;
134
135 /**
136 * Matcher state used by the last node. NOANCHOR is used when a
137 * match does not have to consume all of the input. ENDANCHOR is
138 * the mode used for matching all the input.
139 */
140 static final int ENDANCHOR = 1;
141 static final int NOANCHOR = 0;
142 int acceptMode = NOANCHOR;
143
144 /**
145 * The range of string that last matched the pattern. If the last
146 * match failed then first is -1; last initially holds 0 then it
147 * holds the index of the end of the last match (which is where the
148 * next search starts).
149 */
150 int first = -1, last = 0;
151
152 /**
153 * The end index of what matched in the last match operation.
154 */
155 int oldLast = -1;
156
157 /**
158 * The index of the last position appended in a substitution.
159 */
160 int lastAppendPosition = 0;
161
162 /**
163 * Storage used by nodes to tell what repetition they are on in
164 * a pattern, and where groups begin. The nodes themselves are stateless,
165 * so they rely on this field to hold state during a match.
166 */
167 int[] locals;
168
169 /**
170 * Boolean indicating whether or not more input could change
171 * the results of the last match.
172 *
173 * If hitEnd is true, and a match was found, then more input
174 * might cause a different match to be found.
175 * If hitEnd is true and a match was not found, then more
176 * input could cause a match to be found.
177 * If hitEnd is false and a match was found, then more input
178 * will not change the match.
179 * If hitEnd is false and a match was not found, then more
180 * input will not cause a match to be found.
181 */
182 boolean hitEnd;
183
184 /**
185 * Boolean indicating whether or not more input could change
186 * a positive match into a negative one.
187 *
188 * If requireEnd is true, and a match was found, then more
189 * input could cause the match to be lost.
190 * If requireEnd is false and a match was found, then more
191 * input might change the match but the match won't be lost.
192 * If a match was not found, then requireEnd has no meaning.
193 */
194 boolean requireEnd;
195
196 /**
197 * If transparentBounds is true then the boundaries of this
198 * matcher's region are transparent to lookahead, lookbehind,
199 * and boundary matching constructs that try to see beyond them.
200 */
201 boolean transparentBounds = false;
202
203 /**
204 * If anchoringBounds is true then the boundaries of this
205 * matcher's region match anchors such as ^ and $.
206 */
207 boolean anchoringBounds = true;
208
209 /**
210 * No default constructor.
211 */
Matcher() {
    // Nothing is allocated here: every field keeps its declared initial
    // value, so groups, locals, text and parentPattern remain null until
    // the caller assigns them.
    super();
}
214
215 /**
216 * All matchers have the state used by Pattern during a match.
217 */
218 Matcher(Pattern parent, CharSequence text) {
219 this.parentPattern = parent;
220 this.text = text;
221
222 // Allocate state storage
223 int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
224 groups = new int[parentGroupCount * 2];
225 locals = new int[parent.localCount];
226
227 // Put fields into initial states
228 reset();
229 }
230
231 /**
232 * Returns the pattern that is interpreted by this matcher.
233 *
234 * @return The pattern for which this matcher was created
235 */
236 public Pattern pattern() {
237 return parentPattern;
238 }
239
240 /**
241 * Returns the match state of this matcher as a {@link MatchResult}.
242 * The result is unaffected by subsequent operations performed upon this
243 * matcher.
244 *
245 * @return a <code>MatchResult</code> with the state of this matcher
246 * @since 1.5
247 */
248 public MatchResult toMatchResult() {
249 Matcher result = new Matcher(this.parentPattern, text.toString());
250 result.first = this.first;
251 result.last = this.last;
252 result.groups = (int[])(this.groups.clone());
253 return result;
254 }
255
256 /**
257 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
258 * find matches with.
259 *
260 * <p> This method causes this matcher to lose information
261 * about the groups of the last match that occurred. The
262 * matcher's position in the input is maintained and its
263 * last append position is unaffected.</p>
264 *
265 * @param newPattern
266 * The new pattern used by this matcher
267 * @return This matcher
268 * @throws IllegalArgumentException
269 * If newPattern is <tt>null</tt>
270 * @since 1.5
271 */
272 public Matcher usePattern(Pattern newPattern) {
273 if (newPattern == null)
274 throw new IllegalArgumentException("Pattern cannot be null");
275 parentPattern = newPattern;
276
277 // Reallocate state storage
278 int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
279 groups = new int[parentGroupCount * 2];
280 locals = new int[newPattern.localCount];
281 for (int i = 0; i < groups.length; i++)
282 groups[i] = -1;
283 for (int i = 0; i < locals.length; i++)
284 locals[i] = -1;
285 return this;
286 }
287
288 /**
289 * Resets this matcher.
290 *
291 * <p> Resetting a matcher discards all of its explicit state information
292 * and sets its append position to zero. The matcher's region is set to the
293 * default region, which is its entire character sequence. The anchoring
294 * and transparency of this matcher's region boundaries are unaffected.
295 *
296 * @return This matcher
297 */
298 public Matcher reset() {
299 first = -1;
300 last = 0;
301 oldLast = -1;
302 for(int i=0; i<groups.length; i++)
303 groups[i] = -1;
304 for(int i=0; i<locals.length; i++)
305 locals[i] = -1;
306 lastAppendPosition = 0;
307 from = 0;
308 to = getTextLength();
309 return this;
310 }
311
312 /**
313 * Resets this matcher with a new input sequence.
314 *
315 * <p> Resetting a matcher discards all of its explicit state information
316 * and sets its append position to zero. The matcher's region is set to
317 * the default region, which is its entire character sequence. The
318 * anchoring and transparency of this matcher's region boundaries are
319 * unaffected.
320 *
321 * @param input
322 * The new input character sequence
323 *
324 * @return This matcher
325 */
326 public Matcher reset(CharSequence input) {
327 text = input;
328 return reset();
329 }
330
331 /**
332 * Returns the start index of the previous match.
333 *
334 * @return The index of the first character matched
335 *
336 * @throws IllegalStateException
337 * If no match has yet been attempted,
338 * or if the previous match operation failed
339 */
340 public int start() {
341 if (first < 0)
342 throw new IllegalStateException("No match available");
343 return first;
344 }
345
346 /**
347 * Returns the start index of the subsequence captured by the given group
348 * during the previous match operation.
349 *
350 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
351 * to right, starting at one. Group zero denotes the entire pattern, so
352 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
353 * <i>m.</i><tt>start()</tt>. </p>
354 *
355 * @param group
356 * The index of a capturing group in this matcher's pattern
357 *
358 * @return The index of the first character captured by the group,
359 * or <tt>-1</tt> if the match was successful but the group
360 * itself did not match anything
361 *
362 * @throws IllegalStateException
363 * If no match has yet been attempted,
364 * or if the previous match operation failed
365 *
366 * @throws IndexOutOfBoundsException
367 * If there is no capturing group in the pattern
368 * with the given index
369 */
370 public int start(int group) {
371 if (first < 0)
372 throw new IllegalStateException("No match available");
373 if (group > groupCount())
374 throw new IndexOutOfBoundsException("No group " + group);
375 return groups[group * 2];
376 }
377
378 /**
379 * Returns the offset after the last character matched.
380 *
381 * @return The offset after the last character matched
382 *
383 * @throws IllegalStateException
384 * If no match has yet been attempted,
385 * or if the previous match operation failed
386 */
387 public int end() {
388 if (first < 0)
389 throw new IllegalStateException("No match available");
390 return last;
391 }
392
393 /**
394 * Returns the offset after the last character of the subsequence
395 * captured by the given group during the previous match operation.
396 *
397 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
398 * to right, starting at one. Group zero denotes the entire pattern, so
399 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
400 * <i>m.</i><tt>end()</tt>. </p>
401 *
402 * @param group
403 * The index of a capturing group in this matcher's pattern
404 *
405 * @return The offset after the last character captured by the group,
406 * or <tt>-1</tt> if the match was successful
407 * but the group itself did not match anything
408 *
409 * @throws IllegalStateException
410 * If no match has yet been attempted,
411 * or if the previous match operation failed
412 *
413 * @throws IndexOutOfBoundsException
414 * If there is no capturing group in the pattern
415 * with the given index
416 */
417 public int end(int group) {
418 if (first < 0)
419 throw new IllegalStateException("No match available");
420 if (group > groupCount())
421 throw new IndexOutOfBoundsException("No group " + group);
422 return groups[group * 2 + 1];
423 }
424
425 /**
426 * Returns the input subsequence matched by the previous match.
427 *
428 * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
429 * the expressions <i>m.</i><tt>group()</tt> and
430 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt>&nbsp;<i>m.</i><tt>end())</tt>
431 * are equivalent. </p>
432 *
433 * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
434 * string. This method will return the empty string when the pattern
435 * successfully matches the empty string in the input. </p>
436 *
437 * @return The (possibly empty) subsequence matched by the previous match,
438 * in string form
439 *
440 * @throws IllegalStateException
441 * If no match has yet been attempted,
442 * or if the previous match operation failed
443 */
444 public String group() {
445 return group(0);
446 }
447
448 /**
449 * Returns the input subsequence captured by the given group during the
450 * previous match operation.
451 *
452 * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
453 * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
454 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt>&nbsp;<i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
455 * are equivalent. </p>
456 *
457 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
458 * to right, starting at one. Group zero denotes the entire pattern, so
459 * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
460 * </p>
461 *
462 * <p> If the match was successful but the group specified failed to match
463 * any part of the input sequence, then <tt>null</tt> is returned. Note
464 * that some groups, for example <tt>(a*)</tt>, match the empty string.
465 * This method will return the empty string when such a group successfully
466 * matches the empty string in the input. </p>
467 *
468 * @param group
469 * The index of a capturing group in this matcher's pattern
470 *
471 * @return The (possibly empty) subsequence captured by the group
472 * during the previous match, or <tt>null</tt> if the group
473 * failed to match part of the input
474 *
475 * @throws IllegalStateException
476 * If no match has yet been attempted,
477 * or if the previous match operation failed
478 *
479 * @throws IndexOutOfBoundsException
480 * If there is no capturing group in the pattern
481 * with the given index
482 */
483 public String group(int group) {
484 if (first < 0)
485 throw new IllegalStateException("No match found");
486 if (group < 0 || group > groupCount())
487 throw new IndexOutOfBoundsException("No group " + group);
488 if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
489 return null;
490 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
491 }
492
493 /**
494 * Returns the number of capturing groups in this matcher's pattern.
495 *
496 * <p> Group zero denotes the entire pattern by convention. It is not
497 * included in this count.
498 *
499 * <p> Any non-negative integer smaller than or equal to the value
500 * returned by this method is guaranteed to be a valid group index for
501 * this matcher. </p>
502 *
503 * @return The number of capturing groups in this matcher's pattern
504 */
505 public int groupCount() {
506 return parentPattern.capturingGroupCount - 1;
507 }
508
509 /**
510 * Attempts to match the entire region against the pattern.
511 *
512 * <p> If the match succeeds then more information can be obtained via the
513 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
514 *
515 * @return <tt>true</tt> if, and only if, the entire region sequence
516 * matches this matcher's pattern
517 */
518 public boolean matches() {
519 return match(from, ENDANCHOR);
520 }
521
522 /**
523 * Attempts to find the next subsequence of the input sequence that matches
524 * the pattern.
525 *
526 * <p> This method starts at the beginning of this matcher's region, or, if
527 * a previous invocation of the method was successful and the matcher has
528 * not since been reset, at the first character not matched by the previous
529 * match.
530 *
531 * <p> If the match succeeds then more information can be obtained via the
532 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
533 *
534 * @return <tt>true</tt> if, and only if, a subsequence of the input
535 * sequence matches this matcher's pattern
536 */
537 public boolean find() {
538 int nextSearchIndex = last;
539 if (nextSearchIndex == first)
540 nextSearchIndex++;
541
542 // If next search starts before region, start it at region
543 if (nextSearchIndex < from)
544 nextSearchIndex = from;
545
546 // If next search starts beyond region then it fails
547 if (nextSearchIndex > to) {
548 for (int i = 0; i < groups.length; i++)
549 groups[i] = -1;
550 return false;
551 }
552 return search(nextSearchIndex);
553 }
554
555 /**
556 * Resets this matcher and then attempts to find the next subsequence of
557 * the input sequence that matches the pattern, starting at the specified
558 * index.
559 *
560 * <p> If the match succeeds then more information can be obtained via the
561 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
562 * invocations of the {@link #find()} method will start at the first
563 * character not matched by this match. </p>
564 *
565 * @throws IndexOutOfBoundsException
566 * If start is less than zero or if start is greater than the
567 * length of the input sequence.
568 *
569 * @return <tt>true</tt> if, and only if, a subsequence of the input
570 * sequence starting at the given index matches this matcher's
571 * pattern
572 */
573 public boolean find(int start) {
574 int limit = getTextLength();
575 if ((start < 0) || (start > limit))
576 throw new IndexOutOfBoundsException("Illegal start index");
577 reset();
578 return search(start);
579 }
580
581 /**
582 * Attempts to match the input sequence, starting at the beginning of the
583 * region, against the pattern.
584 *
585 * <p> Like the {@link #matches matches} method, this method always starts
586 * at the beginning of the region; unlike that method, it does not
587 * require that the entire region be matched.
588 *
589 * <p> If the match succeeds then more information can be obtained via the
590 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
591 *
592 * @return <tt>true</tt> if, and only if, a prefix of the input
593 * sequence matches this matcher's pattern
594 */
595 public boolean lookingAt() {
596 return match(from, NOANCHOR);
597 }
598
599 /**
600 * Returns a literal replacement <code>String</code> for the specified
601 * <code>String</code>.
602 *
603 * This method produces a <code>String</code> that will work
604 * as a literal replacement <code>s</code> in the
605 * <code>appendReplacement</code> method of the {@link Matcher} class.
606 * The <code>String</code> produced will match the sequence of characters
607 * in <code>s</code> treated as a literal sequence. Slashes ('\') and
608 * dollar signs ('$') will be given no special meaning.
609 *
610 * @param s The string to be literalized
611 * @return A literal string replacement
612 * @since 1.5
613 */
614 public static String quoteReplacement(String s) {
615 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
616 return s;
617 StringBuilder sb = new StringBuilder();
618 for (int i=0; i<s.length(); i++) {
619 char c = s.charAt(i);
620 if (c == '\\' || c == '$') {
621 sb.append('\\');
622 }
623 sb.append(c);
624 }
625 return sb.toString();
626 }
627
628 /**
629 * Implements a non-terminal append-and-replace step.
630 *
631 * <p> This method performs the following actions: </p>
632 *
633 * <ol>
634 *
635 * <li><p> It reads characters from the input sequence, starting at the
636 * append position, and appends them to the given string buffer. It
637 * stops after reading the last character preceding the previous match,
638 * that is, the character at index {@link
639 * #start()}&nbsp;<tt>-</tt>&nbsp;<tt>1</tt>. </p></li>
640 *
641 * <li><p> It appends the given replacement string to the string buffer.
642 * </p></li>
643 *
644 * <li><p> It sets the append position of this matcher to the index of
645 * the last character matched, plus one, that is, to {@link #end()}.
646 * </p></li>
647 *
648 * </ol>
649 *
650 * <p> The replacement string may contain references to subsequences
651 * captured during the previous match: Each occurrence of
652 * <tt>$</tt><i>g</i> will be replaced by the result of
653 * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>.
654 * The first number after the <tt>$</tt> is always treated as part of
655 * the group reference. Subsequent numbers are incorporated into g if
656 * they would form a legal group reference. Only the numerals '0'
657 * through '9' are considered as potential components of the group
658 * reference. If the second group matched the string <tt>"foo"</tt>, for
659 * example, then passing the replacement string <tt>"$2bar"</tt> would
660 * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
661 * sign (<tt>$</tt>) may be included as a literal in the replacement
662 * string by preceding it with a backslash (<tt>\$</tt>).
663 *
664 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
665 * the replacement string may cause the results to be different than if it
666 * were being treated as a literal replacement string. Dollar signs may be
667 * treated as references to captured subsequences as described above, and
668 * backslashes are used to escape literal characters in the replacement
669 * string.
670 *
671 * <p> This method is intended to be used in a loop together with the
672 * {@link #appendTail appendTail} and {@link #find find} methods. The
673 * following code, for example, writes <tt>one dog two dogs in the
674 * yard</tt> to the standard-output stream: </p>
675 *
676 * <blockquote><pre>
677 * Pattern p = Pattern.compile("cat");
678 * Matcher m = p.matcher("one cat two cats in the yard");
679 * StringBuffer sb = new StringBuffer();
680 * while (m.find()) {
681 * m.appendReplacement(sb, "dog");
682 * }
683 * m.appendTail(sb);
684 * System.out.println(sb.toString());</pre></blockquote>
685 *
686 * @param sb
687 * The target string buffer
688 *
689 * @param replacement
690 * The replacement string
691 *
692 * @return This matcher
693 *
694 * @throws IllegalStateException
695 * If no match has yet been attempted,
696 * or if the previous match operation failed
697 *
698 * @throws IndexOutOfBoundsException
699 * If the replacement string refers to a capturing group
700 * that does not exist in the pattern
701 */
702 public Matcher appendReplacement(StringBuffer sb, String replacement) {
703
704 // If no match, return error
705 if (first < 0)
706 throw new IllegalStateException("No match available");
707
708 // Process substitution string to replace group references with groups
709 int cursor = 0;
710 StringBuilder result = new StringBuilder();
711
712 while (cursor < replacement.length()) {
713 char nextChar = replacement.charAt(cursor);
714 if (nextChar == '\\') {
715 cursor++;
716 nextChar = replacement.charAt(cursor);
717 result.append(nextChar);
718 cursor++;
719 } else if (nextChar == '$') {
720 // Skip past $
721 cursor++;
722 // The first number is always a group
723 int refNum = (int)replacement.charAt(cursor) - '0';
724 if ((refNum < 0)||(refNum > 9))
725 throw new IllegalArgumentException(
726 "Illegal group reference");
727 cursor++;
728
729 // Capture the largest legal group string
730 boolean done = false;
731 while (!done) {
732 if (cursor >= replacement.length()) {
733 break;
734 }
735 int nextDigit = replacement.charAt(cursor) - '0';
736 if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
737 break;
738 }
739 int newRefNum = (refNum * 10) + nextDigit;
740 if (groupCount() < newRefNum) {
741 done = true;
742 } else {
743 refNum = newRefNum;
744 cursor++;
745 }
746 }
747 // Append group
748 if (start(refNum) != -1 && end(refNum) != -1)
749 result.append(text, start(refNum), end(refNum));
750 } else {
751 result.append(nextChar);
752 cursor++;
753 }
754 }
755 // Append the intervening text
756 sb.append(text, lastAppendPosition, first);
757 // Append the match substitution
758 sb.append(result);
759
760 lastAppendPosition = last;
761 return this;
762 }
763
764 /**
765 * Implements a terminal append-and-replace step.
766 *
767 * <p> This method reads characters from the input sequence, starting at
768 * the append position, and appends them to the given string buffer. It is
769 * intended to be invoked after one or more invocations of the {@link
770 * #appendReplacement appendReplacement} method in order to copy the
771 * remainder of the input sequence. </p>
772 *
773 * @param sb
774 * The target string buffer
775 *
776 * @return The target string buffer
777 */
778 public StringBuffer appendTail(StringBuffer sb) {
779 sb.append(text, lastAppendPosition, getTextLength());
780 return sb;
781 }
782
783 /**
784 * Replaces every subsequence of the input sequence that matches the
785 * pattern with the given replacement string.
786 *
787 * <p> This method first resets this matcher. It then scans the input
788 * sequence looking for matches of the pattern. Characters that are not
789 * part of any match are appended directly to the result string; each match
790 * is replaced in the result by the replacement string. The replacement
791 * string may contain references to captured subsequences as in the {@link
792 * #appendReplacement appendReplacement} method.
793 *
794 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
795 * the replacement string may cause the results to be different than if it
796 * were being treated as a literal replacement string. Dollar signs may be
797 * treated as references to captured subsequences as described above, and
798 * backslashes are used to escape literal characters in the replacement
799 * string.
800 *
801 * <p> Given the regular expression <tt>a*b</tt>, the input
802 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
803 * <tt>"-"</tt>, an invocation of this method on a matcher for that
804 * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
805 *
806 * <p> Invoking this method changes this matcher's state. If the matcher
807 * is to be used in further matching operations then it should first be
808 * reset. </p>
809 *
810 * @param replacement
811 * The replacement string
812 *
813 * @return The string constructed by replacing each matching subsequence
814 * by the replacement string, substituting captured subsequences
815 * as needed
816 */
817 public String replaceAll(String replacement) {
818 reset();
819 boolean result = find();
820 if (result) {
821 StringBuffer sb = new StringBuffer();
822 do {
823 appendReplacement(sb, replacement);
824 result = find();
825 } while (result);
826 appendTail(sb);
827 return sb.toString();
828 }
829 return text.toString();
830 }
831
832 /**
833 * Replaces the first subsequence of the input sequence that matches the
834 * pattern with the given replacement string.
835 *
836 * <p> This method first resets this matcher. It then scans the input
837 * sequence looking for a match of the pattern. Characters that are not
838 * part of the match are appended directly to the result string; the match
839 * is replaced in the result by the replacement string. The replacement
840 * string may contain references to captured subsequences as in the {@link
841 * #appendReplacement appendReplacement} method.
842 *
843 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
844 * the replacement string may cause the results to be different than if it
845 * were being treated as a literal replacement string. Dollar signs may be
846 * treated as references to captured subsequences as described above, and
847 * backslashes are used to escape literal characters in the replacement
848 * string.
849 *
850 * <p> Given the regular expression <tt>dog</tt>, the input
851 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
852 * <tt>"cat"</tt>, an invocation of this method on a matcher for that
853 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
854 *
855 * <p> Invoking this method changes this matcher's state. If the matcher
856 * is to be used in further matching operations then it should first be
857 * reset. </p>
858 *
859 * @param replacement
860 * The replacement string
861 * @return The string constructed by replacing the first matching
862 * subsequence by the replacement string, substituting captured
863 * subsequences as needed
864 */
865 public String replaceFirst(String replacement) {
866 if (replacement == null)
867 throw new NullPointerException("replacement");
868 reset();
869 if (!find())
870 return text.toString();
871 StringBuffer sb = new StringBuffer();
872 appendReplacement(sb, replacement);
873 appendTail(sb);
874 return sb.toString();
875 }
876
877 /**
878 * Sets the limits of this matcher's region. The region is the part of the
879 * input sequence that will be searched to find a match. Invoking this
880 * method resets the matcher, and then sets the region to start at the
881 * index specified by the <code>start</code> parameter and end at the
882 * index specified by the <code>end</code> parameter.
883 *
884 * <p>Depending on the transparency and anchoring being used (see
885 * {@link #useTransparentBounds useTransparentBounds} and
886 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
887 * as anchors may behave differently at or around the boundaries of the
888 * region.
889 *
890 * @param start
891 * The index to start searching at (inclusive)
892 * @param end
893 * The index to end searching at (exclusive)
894 * @throws IndexOutOfBoundsException
895 * If start or end is less than zero, if
896 * start is greater than the length of the input sequence, if
897 * end is greater than the length of the input sequence, or if
898 * start is greater than end.
899 * @return this matcher
900 * @since 1.5
901 */
902 public Matcher region(int start, int end) {
903 if ((start < 0) || (start > getTextLength()))
904 throw new IndexOutOfBoundsException("start");
905 if ((end < 0) || (end > getTextLength()))
906 throw new IndexOutOfBoundsException("end");
907 if (start > end)
908 throw new IndexOutOfBoundsException("start > end");
909 reset();
910 from = start;
911 to = end;
912 return this;
913 }
914
915 /**
916 * Reports the start index of this matcher's region. The
917 * searches this matcher conducts are limited to finding matches
918 * within {@link #regionStart regionStart} (inclusive) and
919 * {@link #regionEnd regionEnd} (exclusive).
920 *
921 * @return The starting point of this matcher's region
922 * @since 1.5
923 */
924 public int regionStart() {
925 return from;
926 }
927
928 /**
929 * Reports the end index (exclusive) of this matcher's region.
930 * The searches this matcher conducts are limited to finding matches
931 * within {@link #regionStart regionStart} (inclusive) and
932 * {@link #regionEnd regionEnd} (exclusive).
933 *
934 * @return the ending point of this matcher's region
935 * @since 1.5
936 */
937 public int regionEnd() {
938 return to;
939 }
940
941 /**
942 * Queries the transparency of region bounds for this matcher.
943 *
944 * <p> This method returns <tt>true</tt> if this matcher uses
945 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
946 * bounds.
947 *
948 * <p> See {@link #useTransparentBounds useTransparentBounds} for a
949 * description of transparent and opaque bounds.
950 *
951 * <p> By default, a matcher uses opaque region boundaries.
952 *
953 * @return <tt>true</tt> iff this matcher is using transparent bounds,
954 * <tt>false</tt> otherwise.
955 * @see java.util.regex.Matcher#useTransparentBounds(boolean)
956 * @since 1.5
957 */
958 public boolean hasTransparentBounds() {
959 return transparentBounds;
960 }
961
962 /**
963 * Sets the transparency of region bounds for this matcher.
964 *
965 * <p> Invoking this method with an argument of <tt>true</tt> will set this
966 * matcher to use <i>transparent</i> bounds. If the boolean
967 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
968 *
969 * <p> Using transparent bounds, the boundaries of this
970 * matcher's region are transparent to lookahead, lookbehind,
971 * and boundary matching constructs. Those constructs can see beyond the
972 * boundaries of the region to see if a match is appropriate.
973 *
974 * <p> Using opaque bounds, the boundaries of this matcher's
975 * region are opaque to lookahead, lookbehind, and boundary matching
976 * constructs that may try to see beyond them. Those constructs cannot
977 * look past the boundaries so they will fail to match anything outside
978 * of the region.
979 *
980 * <p> By default, a matcher uses opaque bounds.
981 *
982 * @param b a boolean indicating whether to use opaque or transparent
983 * regions
984 * @return this matcher
985 * @see java.util.regex.Matcher#hasTransparentBounds
986 * @since 1.5
987 */
988 public Matcher useTransparentBounds(boolean b) {
989 transparentBounds = b;
990 return this;
991 }
992
993 /**
994 * Queries the anchoring of region bounds for this matcher.
995 *
996 * <p> This method returns <tt>true</tt> if this matcher uses
997 * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
998 *
999 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
1000 * description of anchoring bounds.
1001 *
1002 * <p> By default, a matcher uses anchoring region boundaries.
1003 *
1004 * @return <tt>true</tt> iff this matcher is using anchoring bounds,
1005 * <tt>false</tt> otherwise.
1006 * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
1007 * @since 1.5
1008 */
1009 public boolean hasAnchoringBounds() {
1010 return anchoringBounds;
1011 }
1012
1013 /**
1014 * Sets the anchoring of region bounds for this matcher.
1015 *
1016 * <p> Invoking this method with an argument of <tt>true</tt> will set this
1017 * matcher to use <i>anchoring</i> bounds. If the boolean
1018 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
1019 * used.
1020 *
1021 * <p> Using anchoring bounds, the boundaries of this
1022 * matcher's region match anchors such as ^ and $.
1023 *
1024 * <p> Without anchoring bounds, the boundaries of this
1025 * matcher's region will not match anchors such as ^ and $.
1026 *
1027 * <p> By default, a matcher uses anchoring region boundaries.
1028 *
1029 * @param b a boolean indicating whether or not to use anchoring bounds.
1030 * @return this matcher
1031 * @see java.util.regex.Matcher#hasAnchoringBounds
1032 * @since 1.5
1033 */
1034 public Matcher useAnchoringBounds(boolean b) {
1035 anchoringBounds = b;
1036 return this;
1037 }
1038
1039 /**
1040 * <p>Returns the string representation of this matcher. The
1041 * string representation of a <code>Matcher</code> contains information
1042 * that may be useful for debugging. The exact format is unspecified.
1043 *
1044 * @return The string representation of this matcher
1045 * @since 1.5
1046 */
1047 public String toString() {
1048 StringBuilder sb = new StringBuilder();
1049 sb.append("java.util.regex.Matcher");
1050 sb.append("[pattern=" + pattern());
1051 sb.append(" region=");
1052 sb.append(regionStart() + "," + regionEnd());
1053 sb.append(" lastmatch=");
1054 if ((first >= 0) && (group() != null)) {
1055 sb.append(group());
1056 }
1057 sb.append("]");
1058 return sb.toString();
1059 }
1060
1061 /**
1062 * <p>Returns true if the end of input was hit by the search engine in
1063 * the last match operation performed by this matcher.
1064 *
1065 * <p>When this method returns true, then it is possible that more input
1066 * would have changed the result of the last search.
1067 *
1068 * @return true iff the end of input was hit in the last match; false
1069 * otherwise
1070 * @since 1.5
1071 */
1072 public boolean hitEnd() {
1073 return hitEnd;
1074 }
1075
1076 /**
1077 * <p>Returns true if more input could change a positive match into a
1078 * negative one.
1079 *
1080 * <p>If this method returns true, and a match was found, then more
1081 * input could cause the match to be lost. If this method returns false
1082 * and a match was found, then more input might change the match but the
1083 * match won't be lost. If a match was not found, then requireEnd has no
1084 * meaning.
1085 *
1086 * @return true iff more input could change a positive match into a
1087 * negative one.
1088 * @since 1.5
1089 */
1090 public boolean requireEnd() {
1091 return requireEnd;
1092 }
1093
1094 /**
1095 * Initiates a search to find a Pattern within the given bounds.
1096 * The groups are filled with default values and the match of the root
1097 * of the state machine is called. The state machine will hold the state
1098 * of the match as it proceeds in this matcher.
1099 *
1100 * Matcher.from is not set here, because it is the "hard" boundary
1101 * of the start of the search which anchors will set to. The from param
1102 * is the "soft" boundary of the start of the search, meaning that the
1103 * regex tries to match at that index but ^ won't match there. Subsequent
1104 * calls to the search methods start at a new "soft" boundary which is
1105 * the end of the previous match.
1106 */
1107 boolean search(int from) {
1108 this.hitEnd = false;
1109 this.requireEnd = false;
1110 from = from < 0 ? 0 : from;
1111 this.first = from;
1112 this.oldLast = oldLast < 0 ? from : oldLast;
1113 for (int i = 0; i < groups.length; i++)
1114 groups[i] = -1;
1115 acceptMode = NOANCHOR;
1116 boolean result = parentPattern.root.match(this, from, text);
1117 if (!result)
1118 this.first = -1;
1119 this.oldLast = this.last;
1120 return result;
1121 }
1122
1123 /**
1124 * Initiates a search for an anchored match to a Pattern within the given
1125 * bounds. The groups are filled with default values and the match of the
1126 * root of the state machine is called. The state machine will hold the
1127 * state of the match as it proceeds in this matcher.
1128 */
1129 boolean match(int from, int anchor) {
1130 this.hitEnd = false;
1131 this.requireEnd = false;
1132 from = from < 0 ? 0 : from;
1133 this.first = from;
1134 this.oldLast = oldLast < 0 ? from : oldLast;
1135 for (int i = 0; i < groups.length; i++)
1136 groups[i] = -1;
1137 acceptMode = anchor;
1138 boolean result = parentPattern.matchRoot.match(this, from, text);
1139 if (!result)
1140 this.first = -1;
1141 this.oldLast = this.last;
1142 return result;
1143 }
1144
1145 /**
1146 * Returns the end index of the text.
1147 *
1148 * @return the index after the last character in the text
1149 */
1150 int getTextLength() {
1151 return text.length();
1152 }
1153
1154 /**
1155 * Generates a String from this Matcher's input in the specified range.
1156 *
1157 * @param beginIndex the beginning index, inclusive
1158 * @param endIndex the ending index, exclusive
1159 * @return A String generated from this Matcher's input
1160 */
1161 CharSequence getSubSequence(int beginIndex, int endIndex) {
1162 return text.subSequence(beginIndex, endIndex);
1163 }
1164
1165 /**
1166 * Returns this Matcher's input character at index i.
1167 *
1168 * @return A char from the specified index
1169 */
1170 char charAt(int i) {
1171 return text.charAt(i);
1172 }
1173
1174}