blob: ab9897aa69887738880e492675082f9c9766d41b [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26/*
27 *******************************************************************************
28 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
29 * *
30 * The original version of this source code and documentation is copyrighted *
31 * and owned by IBM, These materials are provided under terms of a License *
32 * Agreement between IBM and Sun. This technology is protected by multiple *
33 * US and International patents. This notice and attribution to IBM may not *
34 * to removed. *
35 *******************************************************************************
36 */
37
38/*
39 **********************************************************************
40 * Author: Alan Liu
41 * Created: September 23 2003
42 * Since: ICU 2.8
43 **********************************************************************
44 */
45
46package sun.text.normalizer;
47
48import java.text.ParsePosition;
49
50/**
51 * An iterator that returns 32-bit code points. This class is deliberately
52 * <em>not</em> related to any of the JDK or ICU4J character iterator classes
53 * in order to minimize complexity.
54 * @author Alan Liu
55 * @since ICU 2.8
56 */
57public class RuleCharacterIterator {
58
59 // TODO: Ideas for later. (Do not implement if not needed, lest the
60 // code coverage numbers go down due to unused methods.)
61 // 1. Add a copy constructor, equals() method, clone() method.
62 // 2. Rather than return DONE, throw an exception if the end
63 // is reached -- this is an alternate usage model, probably not useful.
64 // 3. Return isEscaped from next(). If this happens,
65 // don't keep an isEscaped member variable.
66
67 /**
68 * Text being iterated.
69 */
70 private String text;
71
72 /**
73 * Position of iterator.
74 */
75 private ParsePosition pos;
76
77 /**
78 * Symbol table used to parse and dereference variables. May be null.
79 */
80 private SymbolTable sym;
81
82 /**
83 * Current variable expansion, or null if none.
84 */
85 private char[] buf;
86
87 /**
88 * Position within buf[]. Meaningless if buf == null.
89 */
90 private int bufPos;
91
92 /**
93 * Flag indicating whether the last character was parsed from an escape.
94 */
95 private boolean isEscaped;
96
97 /**
98 * Value returned when there are no more characters to iterate.
99 */
100 public static final int DONE = -1;
101
102 /**
103 * Bitmask option to enable parsing of variable names. If (options &
104 * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
105 * its value. Variables are parsed using the SymbolTable API.
106 */
107 public static final int PARSE_VARIABLES = 1;
108
109 /**
110 * Bitmask option to enable parsing of escape sequences. If (options &
111 * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
112 * to its value. Escapes are parsed using Utility.unescapeAt().
113 */
114 public static final int PARSE_ESCAPES = 2;
115
116 /**
117 * Bitmask option to enable skipping of whitespace. If (options &
118 * SKIP_WHITESPACE) != 0, then whitespace characters will be silently
119 * skipped, as if they were not present in the input. Whitespace
120 * characters are defined by UCharacterProperty.isRuleWhiteSpace().
121 */
122 public static final int SKIP_WHITESPACE = 4;
123
124 /**
125 * Constructs an iterator over the given text, starting at the given
126 * position.
127 * @param text the text to be iterated
128 * @param sym the symbol table, or null if there is none. If sym is null,
129 * then variables will not be deferenced, even if the PARSE_VARIABLES
130 * option is set.
131 * @param pos upon input, the index of the next character to return. If a
132 * variable has been dereferenced, then pos will <em>not</em> increment as
133 * characters of the variable value are iterated.
134 */
135 public RuleCharacterIterator(String text, SymbolTable sym,
136 ParsePosition pos) {
137 if (text == null || pos.getIndex() > text.length()) {
138 throw new IllegalArgumentException();
139 }
140 this.text = text;
141 this.sym = sym;
142 this.pos = pos;
143 buf = null;
144 }
145
146 /**
147 * Returns true if this iterator has no more characters to return.
148 */
149 public boolean atEnd() {
150 return buf == null && pos.getIndex() == text.length();
151 }
152
153 /**
154 * Returns the next character using the given options, or DONE if there
155 * are no more characters, and advance the position to the next
156 * character.
157 * @param options one or more of the following options, bitwise-OR-ed
158 * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
159 * @return the current 32-bit code point, or DONE
160 */
161 public int next(int options) {
162 int c = DONE;
163 isEscaped = false;
164
165 for (;;) {
166 c = _current();
167 _advance(UTF16.getCharCount(c));
168
169 if (c == SymbolTable.SYMBOL_REF && buf == null &&
170 (options & PARSE_VARIABLES) != 0 && sym != null) {
171 String name = sym.parseReference(text, pos, text.length());
172 // If name == null there was an isolated SYMBOL_REF;
173 // return it. Caller must be prepared for this.
174 if (name == null) {
175 break;
176 }
177 bufPos = 0;
178 buf = sym.lookup(name);
179 if (buf == null) {
180 throw new IllegalArgumentException(
181 "Undefined variable: " + name);
182 }
183 // Handle empty variable value
184 if (buf.length == 0) {
185 buf = null;
186 }
187 continue;
188 }
189
190 if ((options & SKIP_WHITESPACE) != 0 &&
191 UCharacterProperty.isRuleWhiteSpace(c)) {
192 continue;
193 }
194
195 if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
196 int offset[] = new int[] { 0 };
197 c = Utility.unescapeAt(lookahead(), offset);
198 jumpahead(offset[0]);
199 isEscaped = true;
200 if (c < 0) {
201 throw new IllegalArgumentException("Invalid escape");
202 }
203 }
204
205 break;
206 }
207
208 return c;
209 }
210
211 /**
212 * Returns true if the last character returned by next() was
213 * escaped. This will only be the case if the option passed in to
214 * next() included PARSE_ESCAPED and the next character was an
215 * escape sequence.
216 */
217 public boolean isEscaped() {
218 return isEscaped;
219 }
220
221 /**
222 * Returns true if this iterator is currently within a variable expansion.
223 */
224 public boolean inVariable() {
225 return buf != null;
226 }
227
228 /**
229 * Returns an object which, when later passed to setPos(), will
230 * restore this iterator's position. Usage idiom:
231 *
232 * RuleCharacterIterator iterator = ...;
233 * Object pos = iterator.getPos(null); // allocate position object
234 * for (;;) {
235 * pos = iterator.getPos(pos); // reuse position object
236 * int c = iterator.next(...);
237 * ...
238 * }
239 * iterator.setPos(pos);
240 *
241 * @param p a position object previously returned by getPos(),
242 * or null. If not null, it will be updated and returned. If
243 * null, a new position object will be allocated and returned.
244 * @return a position object which may be passed to setPos(),
245 * either `p,' or if `p' == null, a newly-allocated object
246 */
247 public Object getPos(Object p) {
248 if (p == null) {
249 return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
250 }
251 Object[] a = (Object[]) p;
252 a[0] = buf;
253 int[] v = (int[]) a[1];
254 v[0] = pos.getIndex();
255 v[1] = bufPos;
256 return p;
257 }
258
259 /**
260 * Restores this iterator to the position it had when getPos()
261 * returned the given object.
262 * @param p a position object previously returned by getPos()
263 */
264 public void setPos(Object p) {
265 Object[] a = (Object[]) p;
266 buf = (char[]) a[0];
267 int[] v = (int[]) a[1];
268 pos.setIndex(v[0]);
269 bufPos = v[1];
270 }
271
272 /**
273 * Skips ahead past any ignored characters, as indicated by the given
274 * options. This is useful in conjunction with the lookahead() method.
275 *
276 * Currently, this only has an effect for SKIP_WHITESPACE.
277 * @param options one or more of the following options, bitwise-OR-ed
278 * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
279 */
280 public void skipIgnored(int options) {
281 if ((options & SKIP_WHITESPACE) != 0) {
282 for (;;) {
283 int a = _current();
284 if (!UCharacterProperty.isRuleWhiteSpace(a)) break;
285 _advance(UTF16.getCharCount(a));
286 }
287 }
288 }
289
290 /**
291 * Returns a string containing the remainder of the characters to be
292 * returned by this iterator, without any option processing. If the
293 * iterator is currently within a variable expansion, this will only
294 * extend to the end of the variable expansion. This method is provided
295 * so that iterators may interoperate with string-based APIs. The typical
296 * sequence of calls is to call skipIgnored(), then call lookahead(), then
297 * parse the string returned by lookahead(), then call jumpahead() to
298 * resynchronize the iterator.
299 * @return a string containing the characters to be returned by future
300 * calls to next()
301 */
302 public String lookahead() {
303 if (buf != null) {
304 return new String(buf, bufPos, buf.length - bufPos);
305 } else {
306 return text.substring(pos.getIndex());
307 }
308 }
309
310 /**
311 * Advances the position by the given number of 16-bit code units.
312 * This is useful in conjunction with the lookahead() method.
313 * @param count the number of 16-bit code units to jump over
314 */
315 public void jumpahead(int count) {
316 if (count < 0) {
317 throw new IllegalArgumentException();
318 }
319 if (buf != null) {
320 bufPos += count;
321 if (bufPos > buf.length) {
322 throw new IllegalArgumentException();
323 }
324 if (bufPos == buf.length) {
325 buf = null;
326 }
327 } else {
328 int i = pos.getIndex() + count;
329 pos.setIndex(i);
330 if (i > text.length()) {
331 throw new IllegalArgumentException();
332 }
333 }
334 }
335
336 /**
337 * Returns the current 32-bit code point without parsing escapes, parsing
338 * variables, or skipping whitespace.
339 * @return the current 32-bit code point
340 */
341 private int _current() {
342 if (buf != null) {
343 return UTF16.charAt(buf, 0, buf.length, bufPos);
344 } else {
345 int i = pos.getIndex();
346 return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
347 }
348 }
349
350 /**
351 * Advances the position by the given amount.
352 * @param count the number of 16-bit code units to advance past
353 */
354 private void _advance(int count) {
355 if (buf != null) {
356 bufPos += count;
357 if (bufPos == buf.length) {
358 buf = null;
359 }
360 } else {
361 pos.setIndex(pos.getIndex() + count);
362 if (pos.getIndex() > text.length()) {
363 pos.setIndex(text.length());
364 }
365 }
366 }
367}