blob: 31b242739882a8c8254b8cffe554508be9908d09 [file] [log] [blame]
Tobias Sargeantf4d85d82018-01-05 14:34:38 +00001/*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.webkit;
18
19import java.util.Locale;
20import java.util.regex.MatchResult;
21import java.util.regex.Matcher;
22import java.util.regex.Pattern;
23
24/**
25 * Java implementation of legacy WebView.findAddress algorithm.
26 *
27 * @hide
28 */
29class FindAddress {
30 static class ZipRange {
31 int mLow;
32 int mHigh;
33 int mException1;
34 int mException2;
35 ZipRange(int low, int high, int exception1, int exception2) {
36 mLow = low;
37 mHigh = high;
38 mException1 = exception1;
39 mException2 = exception1;
40 }
41 boolean matches(String zipCode) {
42 int prefix = Integer.parseInt(zipCode.substring(0, 2));
43 return (mLow <= prefix && prefix <= mHigh) || prefix == mException1
44 || prefix == mException2;
45 }
46 }
47
48 // Addresses consist of at least this many words, not including state and zip code.
49 private static final int MIN_ADDRESS_WORDS = 4;
50
51 // Adddresses consist of at most this many words, not including state and zip code.
52 private static final int MAX_ADDRESS_WORDS = 14;
53
54 // Addresses consist of at most this many lines.
55 private static final int MAX_ADDRESS_LINES = 5;
56
57 // No words in an address are longer than this many characters.
58 private static final int kMaxAddressNameWordLength = 25;
59
60 // Location name should be in the first MAX_LOCATION_NAME_DISTANCE words
61 private static final int MAX_LOCATION_NAME_DISTANCE = 5;
62
63 private static final ZipRange[] sStateZipCodeRanges = {
64 new ZipRange(99, 99, -1, -1), // AK Alaska.
65 new ZipRange(35, 36, -1, -1), // AL Alabama.
66 new ZipRange(71, 72, -1, -1), // AR Arkansas.
67 new ZipRange(96, 96, -1, -1), // AS American Samoa.
68 new ZipRange(85, 86, -1, -1), // AZ Arizona.
69 new ZipRange(90, 96, -1, -1), // CA California.
70 new ZipRange(80, 81, -1, -1), // CO Colorado.
71 new ZipRange(6, 6, -1, -1), // CT Connecticut.
72 new ZipRange(20, 20, -1, -1), // DC District of Columbia.
73 new ZipRange(19, 19, -1, -1), // DE Delaware.
74 new ZipRange(32, 34, -1, -1), // FL Florida.
75 new ZipRange(96, 96, -1, -1), // FM Federated States of Micronesia.
76 new ZipRange(30, 31, -1, -1), // GA Georgia.
77 new ZipRange(96, 96, -1, -1), // GU Guam.
78 new ZipRange(96, 96, -1, -1), // HI Hawaii.
79 new ZipRange(50, 52, -1, -1), // IA Iowa.
80 new ZipRange(83, 83, -1, -1), // ID Idaho.
81 new ZipRange(60, 62, -1, -1), // IL Illinois.
82 new ZipRange(46, 47, -1, -1), // IN Indiana.
83 new ZipRange(66, 67, 73, -1), // KS Kansas.
84 new ZipRange(40, 42, -1, -1), // KY Kentucky.
85 new ZipRange(70, 71, -1, -1), // LA Louisiana.
86 new ZipRange(1, 2, -1, -1), // MA Massachusetts.
87 new ZipRange(20, 21, -1, -1), // MD Maryland.
88 new ZipRange(3, 4, -1, -1), // ME Maine.
89 new ZipRange(96, 96, -1, -1), // MH Marshall Islands.
90 new ZipRange(48, 49, -1, -1), // MI Michigan.
91 new ZipRange(55, 56, -1, -1), // MN Minnesota.
92 new ZipRange(63, 65, -1, -1), // MO Missouri.
93 new ZipRange(96, 96, -1, -1), // MP Northern Mariana Islands.
94 new ZipRange(38, 39, -1, -1), // MS Mississippi.
95 new ZipRange(55, 56, -1, -1), // MT Montana.
96 new ZipRange(27, 28, -1, -1), // NC North Carolina.
97 new ZipRange(58, 58, -1, -1), // ND North Dakota.
98 new ZipRange(68, 69, -1, -1), // NE Nebraska.
99 new ZipRange(3, 4, -1, -1), // NH New Hampshire.
100 new ZipRange(7, 8, -1, -1), // NJ New Jersey.
101 new ZipRange(87, 88, 86, -1), // NM New Mexico.
102 new ZipRange(88, 89, 96, -1), // NV Nevada.
103 new ZipRange(10, 14, 0, 6), // NY New York.
104 new ZipRange(43, 45, -1, -1), // OH Ohio.
105 new ZipRange(73, 74, -1, -1), // OK Oklahoma.
106 new ZipRange(97, 97, -1, -1), // OR Oregon.
107 new ZipRange(15, 19, -1, -1), // PA Pennsylvania.
108 new ZipRange(6, 6, 0, 9), // PR Puerto Rico.
109 new ZipRange(96, 96, -1, -1), // PW Palau.
110 new ZipRange(2, 2, -1, -1), // RI Rhode Island.
111 new ZipRange(29, 29, -1, -1), // SC South Carolina.
112 new ZipRange(57, 57, -1, -1), // SD South Dakota.
113 new ZipRange(37, 38, -1, -1), // TN Tennessee.
114 new ZipRange(75, 79, 87, 88), // TX Texas.
115 new ZipRange(84, 84, -1, -1), // UT Utah.
116 new ZipRange(22, 24, 20, -1), // VA Virginia.
117 new ZipRange(6, 9, -1, -1), // VI Virgin Islands.
118 new ZipRange(5, 5, -1, -1), // VT Vermont.
119 new ZipRange(98, 99, -1, -1), // WA Washington.
120 new ZipRange(53, 54, -1, -1), // WI Wisconsin.
121 new ZipRange(24, 26, -1, -1), // WV West Virginia.
122 new ZipRange(82, 83, -1, -1) // WY Wyoming.
123 };
124
125 // Newlines
126 private static final String NL = "\n\u000B\u000C\r\u0085\u2028\u2029";
127
128 // Space characters
129 private static final String SP = "\u0009\u0020\u00A0\u1680\u2000\u2001"
130 + "\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F"
131 + "\u205F\u3000";
132
133 // Whitespace
134 private static final String WS = SP + NL;
135
136 // Characters that are considered word delimiters.
137 private static final String WORD_DELIM = ",*\u2022" + WS;
138
139 // Lookahead for word end.
140 private static final String WORD_END = "(?=[" + WORD_DELIM + "]|$)";
141
142 // Address words are a sequence of non-delimiter characters.
143 private static final Pattern sWordRe =
144 Pattern.compile("[^" + WORD_DELIM + "]+" + WORD_END, Pattern.CASE_INSENSITIVE);
145
146 // Characters that are considered suffix delimiters for house numbers.
147 private static final String HOUSE_POST_DELIM = ",\"'" + WS;
148
149 // Lookahead for house end.
150 private static final String HOUSE_END = "(?=[" + HOUSE_POST_DELIM + "]|$)";
151
152 // Characters that are considered prefix delimiters for house numbers.
153 private static final String HOUSE_PRE_DELIM = ":" + HOUSE_POST_DELIM;
154
155 // A house number component is "one" or a number, optionally
156 // followed by a single alphabetic character, or
157 private static final String HOUSE_COMPONENT = "(?:one|\\d+([a-z](?=[^a-z]|$)|st|nd|rd|th)?)";
158
159 // House numbers are a repetition of |HOUSE_COMPONENT|, separated by -, and followed by
160 // a delimiter character.
161 private static final Pattern sHouseNumberRe =
162 Pattern.compile(HOUSE_COMPONENT + "(?:-" + HOUSE_COMPONENT + ")*" + HOUSE_END,
163 Pattern.CASE_INSENSITIVE);
164
165 // XXX: do we want to accept whitespace other than 0x20 in state names?
166 private static final Pattern sStateRe = Pattern.compile("(?:"
167 + "(ak|alaska)|"
168 + "(al|alabama)|"
169 + "(ar|arkansas)|"
170 + "(as|american[" + SP + "]+samoa)|"
171 + "(az|arizona)|"
172 + "(ca|california)|"
173 + "(co|colorado)|"
174 + "(ct|connecticut)|"
175 + "(dc|district[" + SP + "]+of[" + SP + "]+columbia)|"
176 + "(de|delaware)|"
177 + "(fl|florida)|"
178 + "(fm|federated[" + SP + "]+states[" + SP + "]+of[" + SP + "]+micronesia)|"
179 + "(ga|georgia)|"
180 + "(gu|guam)|"
181 + "(hi|hawaii)|"
182 + "(ia|iowa)|"
183 + "(id|idaho)|"
184 + "(il|illinois)|"
185 + "(in|indiana)|"
186 + "(ks|kansas)|"
187 + "(ky|kentucky)|"
188 + "(la|louisiana)|"
189 + "(ma|massachusetts)|"
190 + "(md|maryland)|"
191 + "(me|maine)|"
192 + "(mh|marshall[" + SP + "]+islands)|"
193 + "(mi|michigan)|"
194 + "(mn|minnesota)|"
195 + "(mo|missouri)|"
196 + "(mp|northern[" + SP + "]+mariana[" + SP + "]+islands)|"
197 + "(ms|mississippi)|"
198 + "(mt|montana)|"
199 + "(nc|north[" + SP + "]+carolina)|"
200 + "(nd|north[" + SP + "]+dakota)|"
201 + "(ne|nebraska)|"
202 + "(nh|new[" + SP + "]+hampshire)|"
203 + "(nj|new[" + SP + "]+jersey)|"
204 + "(nm|new[" + SP + "]+mexico)|"
205 + "(nv|nevada)|"
206 + "(ny|new[" + SP + "]+york)|"
207 + "(oh|ohio)|"
208 + "(ok|oklahoma)|"
209 + "(or|oregon)|"
210 + "(pa|pennsylvania)|"
211 + "(pr|puerto[" + SP + "]+rico)|"
212 + "(pw|palau)|"
213 + "(ri|rhode[" + SP + "]+island)|"
214 + "(sc|south[" + SP + "]+carolina)|"
215 + "(sd|south[" + SP + "]+dakota)|"
216 + "(tn|tennessee)|"
217 + "(tx|texas)|"
218 + "(ut|utah)|"
219 + "(va|virginia)|"
220 + "(vi|virgin[" + SP + "]+islands)|"
221 + "(vt|vermont)|"
222 + "(wa|washington)|"
223 + "(wi|wisconsin)|"
224 + "(wv|west[" + SP + "]+virginia)|"
225 + "(wy|wyoming)"
226 + ")" + WORD_END,
227 Pattern.CASE_INSENSITIVE);
228
229 private static final Pattern sLocationNameRe = Pattern.compile("(?:"
230 + "alley|annex|arcade|ave[.]?|avenue|alameda|bayou|"
231 + "beach|bend|bluffs?|bottom|boulevard|branch|bridge|"
232 + "brooks?|burgs?|bypass|broadway|camino|camp|canyon|"
233 + "cape|causeway|centers?|circles?|cliffs?|club|common|"
234 + "corners?|course|courts?|coves?|creek|crescent|crest|"
235 + "crossing|crossroad|curve|circulo|dale|dam|divide|"
236 + "drives?|estates?|expressway|extensions?|falls?|ferry|"
237 + "fields?|flats?|fords?|forest|forges?|forks?|fort|"
238 + "freeway|gardens?|gateway|glens?|greens?|groves?|"
239 + "harbors?|haven|heights|highway|hills?|hollow|inlet|"
240 + "islands?|isle|junctions?|keys?|knolls?|lakes?|land|"
241 + "landing|lane|lights?|loaf|locks?|lodge|loop|mall|"
242 + "manors?|meadows?|mews|mills?|mission|motorway|mount|"
243 + "mountains?|neck|orchard|oval|overpass|parks?|"
244 + "parkways?|pass|passage|path|pike|pines?|plains?|"
245 + "plaza|points?|ports?|prairie|privada|radial|ramp|"
246 + "ranch|rapids?|rd[.]?|rest|ridges?|river|roads?|route|"
247 + "row|rue|run|shoals?|shores?|skyway|springs?|spurs?|"
248 + "squares?|station|stravenue|stream|st[.]?|streets?|"
249 + "summit|speedway|terrace|throughway|trace|track|"
250 + "trafficway|trail|tunnel|turnpike|underpass|unions?|"
251 + "valleys?|viaduct|views?|villages?|ville|vista|walks?|"
252 + "wall|ways?|wells?|xing|xrd)" + WORD_END,
253 Pattern.CASE_INSENSITIVE);
254
255 private static final Pattern sSuffixedNumberRe =
256 Pattern.compile("(\\d+)(st|nd|rd|th)", Pattern.CASE_INSENSITIVE);
257
258 private static final Pattern sZipCodeRe =
259 Pattern.compile("(?:\\d{5}(?:-\\d{4})?)" + WORD_END, Pattern.CASE_INSENSITIVE);
260
261 private static boolean checkHouseNumber(String houseNumber) {
262 // Make sure that there are at most 5 digits.
263 int digitCount = 0;
264 for (int i = 0; i < houseNumber.length(); ++i) {
265 if (Character.isDigit(houseNumber.charAt(i))) ++digitCount;
266 }
267 if (digitCount > 5) return false;
268
269 // Make sure that any ordinals are valid.
270 Matcher suffixMatcher = sSuffixedNumberRe.matcher(houseNumber);
271 while (suffixMatcher.find()) {
272 int num = Integer.parseInt(suffixMatcher.group(1));
273 if (num == 0) {
274 return false; // 0th is invalid.
275 }
276 String suffix = suffixMatcher.group(2).toLowerCase(Locale.getDefault());
277 switch (num % 10) {
278 case 1:
279 return suffix.equals(num % 100 == 11 ? "th" : "st");
280 case 2:
281 return suffix.equals(num % 100 == 12 ? "th" : "nd");
282 case 3:
283 return suffix.equals(num % 100 == 13 ? "th" : "rd");
284 default:
285 return suffix.equals("th");
286 }
287 }
288 return true;
289 }
290
291 /**
292 * Attempt to match a house number beginnning at position offset
293 * in content. The house number must be followed by a word
294 * delimiter or the end of the string, and if offset is non-zero,
295 * then it must also be preceded by a word delimiter.
296 *
297 * @return a MatchResult if a valid house number was found.
298 */
299 private static MatchResult matchHouseNumber(String content, int offset) {
300 if (offset > 0 && HOUSE_PRE_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null;
301 Matcher matcher = sHouseNumberRe.matcher(content).region(offset, content.length());
302 if (matcher.lookingAt()) {
303 MatchResult matchResult = matcher.toMatchResult();
304 if (checkHouseNumber(matchResult.group(0))) return matchResult;
305 }
306 return null;
307 }
308
309 /**
310 * Attempt to match a US state beginnning at position offset in
311 * content. The matching state must be followed by a word
312 * delimiter or the end of the string, and if offset is non-zero,
313 * then it must also be preceded by a word delimiter.
314 *
315 * @return a MatchResult if a valid US state (or two letter code)
316 * was found.
317 */
318 private static MatchResult matchState(String content, int offset) {
319 if (offset > 0 && WORD_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null;
320 Matcher stateMatcher = sStateRe.matcher(content).region(offset, content.length());
321 return stateMatcher.lookingAt() ? stateMatcher.toMatchResult() : null;
322 }
323
324 /**
325 * Test whether zipCode matches the U.S. zip code format (ddddd or
326 * ddddd-dddd) and is within the expected range, given that
327 * stateMatch is a match of sStateRe.
328 *
329 * @return true if zipCode is a valid zip code, is legal for the
330 * matched state, and is followed by a word delimiter or the end
331 * of the string.
332 */
333 private static boolean isValidZipCode(String zipCode, MatchResult stateMatch) {
334 if (stateMatch == null) return false;
335 // Work out the index of the state, based on which group matched.
336 int stateIndex = stateMatch.groupCount();
337 while (stateIndex > 0) {
338 if (stateMatch.group(stateIndex--) != null) break;
339 }
340 return sZipCodeRe.matcher(zipCode).matches()
341 && sStateZipCodeRanges[stateIndex].matches(zipCode);
342 }
343
344 /**
345 * Test whether location is one of the valid locations.
346 *
347 * @return true if location starts with a valid location name
348 * followed by a word delimiter or the end of the string.
349 */
350 private static boolean isValidLocationName(String location) {
351 return sLocationNameRe.matcher(location).matches();
352 }
353
354 /**
355 * Attempt to match a complete address in content, starting with
356 * houseNumberMatch.
357 *
358 * @param content The string to search.
359 * @param houseNumberMatch A matching house number to start extending.
360 * @return +ve: the end of the match
361 * +ve: the position to restart searching for house numbers, negated.
362 */
363 private static int attemptMatch(String content, MatchResult houseNumberMatch) {
364 int restartPos = -1;
365 int nonZipMatch = -1;
366 int it = houseNumberMatch.end();
367 int numLines = 1;
368 boolean consecutiveHouseNumbers = true;
369 boolean foundLocationName = false;
370 int wordCount = 1;
371 String lastWord = "";
372
373 Matcher matcher = sWordRe.matcher(content);
374
375 for (; it < content.length(); lastWord = matcher.group(0), it = matcher.end()) {
376 if (!matcher.find(it)) {
377 // No more words in the input sequence.
378 return -content.length();
379 }
380 if (matcher.end() - matcher.start() > kMaxAddressNameWordLength) {
381 // Word is too long to be part of an address. Fail.
382 return -matcher.end();
383 }
384
385 // Count the number of newlines we just consumed.
386 while (it < matcher.start()) {
387 if (NL.indexOf(content.charAt(it++)) != -1) ++numLines;
388 }
389
390 // Consumed too many lines. Fail.
391 if (numLines > MAX_ADDRESS_LINES) break;
392
393 // Consumed too many words. Fail.
394 if (++wordCount > MAX_ADDRESS_WORDS) break;
395
396 if (matchHouseNumber(content, it) != null) {
397 if (consecutiveHouseNumbers && numLines > 1) {
398 // Last line ended with a number, and this this line starts with one.
399 // Restart at this number.
400 return -it;
401 }
402 // Remember the position of this match as the restart position.
403 if (restartPos == -1) restartPos = it;
404 continue;
405 }
406
407 consecutiveHouseNumbers = false;
408
409 if (isValidLocationName(matcher.group(0))) {
410 foundLocationName = true;
411 continue;
412 }
413
414 if (wordCount == MAX_LOCATION_NAME_DISTANCE && !foundLocationName) {
415 // Didn't find a location name in time. Fail.
416 it = matcher.end();
417 break;
418 }
419
420 if (foundLocationName && wordCount > MIN_ADDRESS_WORDS) {
421 // We can now attempt to match a state.
422 MatchResult stateMatch = matchState(content, it);
423 if (stateMatch != null) {
424 if (lastWord.equals("et") && stateMatch.group(0).equals("al")) {
425 // Reject "et al" as a false postitive.
426 it = stateMatch.end();
427 break;
428 }
429
430 // At this point we've matched a state; try to match a zip code after it.
431 Matcher zipMatcher = sWordRe.matcher(content);
432 if (zipMatcher.find(stateMatch.end())
433 && isValidZipCode(zipMatcher.group(0), stateMatch)) {
434 return zipMatcher.end();
435 }
436 // The content ends with a state but no zip
437 // code. This is a legal match according to the
438 // documentation. N.B. This differs from the
439 // original c++ implementation, which only allowed
440 // the zip code to be optional at the end of the
441 // string, which presumably is a bug. Now we
442 // prefer to find a match with a zip code, but
443 // remember non-zip matches and return them if
444 // necessary.
445 nonZipMatch = stateMatch.end();
446 }
447 }
448 }
449
450 if (nonZipMatch > 0) return nonZipMatch;
451
452 return -(restartPos > 0 ? restartPos : it);
453 }
454
455 /**
456 * Return the first matching address in content.
457 *
458 * @param content The string to search.
459 * @return The first valid address, or null if no address was matched.
460 */
461 static String findAddress(String content) {
462 Matcher houseNumberMatcher = sHouseNumberRe.matcher(content);
463 int start = 0;
464 while (houseNumberMatcher.find(start)) {
465 if (checkHouseNumber(houseNumberMatcher.group(0))) {
466 start = houseNumberMatcher.start();
467 int end = attemptMatch(content, houseNumberMatcher);
468 if (end > 0) {
469 return content.substring(start, end);
470 }
471 start = -end;
472 } else {
473 start = houseNumberMatcher.end();
474 }
475 }
476 return null;
477 }
478}