blob: b41ac1f36a91e07118fd7735f6a24ef3bb3e830f [file] [log] [blame]
Kenny Rootf8d0f092010-03-12 08:15:28 -08001/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
The Android Open Source Project9066cfe2009-03-03 19:31:44 -080016
17package com.google.android.util;
18
Artur Satayev2a9f3b82019-12-10 17:47:55 +000019import android.compat.annotation.UnsupportedAppUsage;
Artur Satayevfc46be72019-11-04 17:50:59 +000020
The Android Open Source Project9066cfe2009-03-03 19:31:44 -080021import java.util.ArrayList;
22import java.util.HashMap;
23import java.util.LinkedHashMap;
Artur Satayevfc46be72019-11-04 17:50:59 +000024import java.util.List;
25import java.util.Set;
The Android Open Source Project9066cfe2009-03-03 19:31:44 -080026import java.util.regex.Matcher;
27import java.util.regex.Pattern;
The Android Open Source Project9066cfe2009-03-03 19:31:44 -080028
/**
 * Logic for parsing a text message typed by the user, looking for smileys,
 * urls, acronyms, formatting (e.g., '*'s for bold), me commands
 * (e.g., "/me is asleep"), and punctuation.
 *
 * It constructs a list of parts, which breaks the text up into its
 * constituent pieces, which we return to the client.
 */
39public abstract class AbstractMessageParser {
40/**
41 * Interface representing the set of resources needed by a message parser
42 *
43 * @author jessan (Jessan Hutchison-Quillian)
44 */
45 public static interface Resources {
46
47 /** Get the known set of URL schemes. */
48 public Set<String> getSchemes();
49
50 /** Get the possible values for the last part of a domain name.
51 * Values are expected to be reversed in the Trie.
52 */
53 public TrieNode getDomainSuffixes();
54
55 /** Get the smileys accepted by the parser. */
56 public TrieNode getSmileys();
57
58 /** Get the acronyms accepted by the parser. */
59 public TrieNode getAcronyms();
60 }
61
/**
 * Returns the resources used while parsing. Subclasses must define the
 * schemes, domains, smileys and acronyms that are necessary for parsing.
 *
 * @return the resources consulted by the smiley, acronym and URL parsing
 */
protected abstract Resources getResources();
67
/** Music note that indicates user is listening to a music track. */
public static final String musicNote = "\u266B ";

// The text being parsed; parse() strips a leading "/me" command from it.
private String text;
// Index in text of the next character that has not been consumed yet.
private int nextChar;
// Counter incremented by getCharClass() so that every punctuation lookup
// yields a class value different from all previous ones.
private int nextClass;
// Parts built from the tokens by buildParts().
private ArrayList<Part> parts;
// Tokens produced while scanning the text.
private ArrayList<Token> tokens;
// Format start tokens that have not been matched yet, keyed by format char.
private HashMap<Character,Format> formatStart;
// Switches selecting which kinds of content the parser looks for.
private boolean parseSmilies;
private boolean parseAcronyms;
private boolean parseFormatting;
private boolean parseUrls;
private boolean parseMeText;
private boolean parseMusic;
83
/**
 * Creates a message parser with every kind of parsing enabled: urls,
 * formatting, acronyms, smileys, /me text and music.
 *
 * @param text the text to parse
 */
public AbstractMessageParser(String text) {
  this(text, true, true, true, true, true, true);
}

/**
 * Creates a message parser that only looks for the selected kinds of text.
 *
 * @param text the text to parse
 * @param parseSmilies whether to look for smileys
 * @param parseAcronyms whether to look for acronyms
 * @param parseFormatting whether to look for formatting characters
 * @param parseUrls whether to look for URLs
 * @param parseMusic whether to look for a music track
 * @param parseMeText whether to look for a leading "/me" command
 */
public AbstractMessageParser(String text, boolean parseSmilies,
    boolean parseAcronyms, boolean parseFormatting, boolean parseUrls,
    boolean parseMusic, boolean parseMeText) {
  // Feature switches.
  this.parseSmilies = parseSmilies;
  this.parseAcronyms = parseAcronyms;
  this.parseFormatting = parseFormatting;
  this.parseUrls = parseUrls;
  this.parseMusic = parseMusic;
  this.parseMeText = parseMeText;

  // Parsing state.
  this.text = text;
  this.nextChar = 0;
  this.nextClass = 10;
  this.tokens = new ArrayList<Token>();
  this.parts = new ArrayList<Part>();
  this.formatStart = new HashMap<Character,Format>();
}
116
117 /** Returns the raw text being parsed. */
118 public final String getRawText() { return text; }
119
120 /** Return the number of parts. */
121 public final int getPartCount() { return parts.size(); }
122
123 /** Return the part at the given index. */
124 public final Part getPart(int index) { return parts.get(index); }
125
126 /** Return the list of parts from the parsed text */
127 public final List<Part> getParts() { return parts; }
128
129 /** Parses the text string into an internal representation. */
130 public void parse() {
131 // Look for music track (of which there would be only one and it'll be the
132 // first token)
133 if (parseMusicTrack()) {
134 buildParts(null);
135 return;
136 }
137
138 // Look for me commands.
139 String meText = null;
140 if (parseMeText && text.startsWith("/me") && (text.length() > 3) &&
141 Character.isWhitespace(text.charAt(3))) {
142 meText = text.substring(0, 4);
143 text = text.substring(4);
144 }
145
146 // Break the text into tokens.
147 boolean wasSmiley = false;
148 while (nextChar < text.length()) {
149 if (!isWordBreak(nextChar)) {
150 if (!wasSmiley || !isSmileyBreak(nextChar)) {
151 throw new AssertionError("last chunk did not end at word break");
152 }
153 }
154
155 if (parseSmiley()) {
156 wasSmiley = true;
157 } else {
158 wasSmiley = false;
159
160 if (!parseAcronym() && !parseURL() && !parseFormatting()) {
161 parseText();
162 }
163 }
164 }
165
166 // Trim the whitespace before and after media components.
167 for (int i = 0; i < tokens.size(); ++i) {
168 if (tokens.get(i).isMedia()) {
169 if ((i > 0) && (tokens.get(i - 1) instanceof Html)) {
170 ((Html)tokens.get(i - 1)).trimLeadingWhitespace();
171 }
172 if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) {
173 ((Html)tokens.get(i + 1)).trimTrailingWhitespace();
174 }
175 }
176 }
177
178 // Remove any empty html tokens.
179 for (int i = 0; i < tokens.size(); ++i) {
180 if (tokens.get(i).isHtml() &&
181 (tokens.get(i).toHtml(true).length() == 0)) {
182 tokens.remove(i);
183 --i; // visit this index again
184 }
185 }
186
187 buildParts(meText);
188 }
189
190 /**
191 * Get a the appropriate Token for a given URL
192 *
193 * @param text the anchor text
194 * @param url the url
195 *
196 */
197 public static Token tokenForUrl(String url, String text) {
198 if(url == null) {
199 return null;
200 }
201
202 //Look for video links
203 Video video = Video.matchURL(url, text);
204 if (video != null) {
205 return video;
206 }
207
208 // Look for video links.
209 YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text);
210 if (ytVideo != null) {
211 return ytVideo;
212 }
213
214 // Look for photo links.
215 Photo photo = Photo.matchURL(url, text);
216 if (photo != null) {
217 return photo;
218 }
219
220 // Look for photo links.
221 FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text);
222 if (flickrPhoto != null) {
223 return flickrPhoto;
224 }
225
226 //Not media, so must be a regular URL
227 return new Link(url, text);
228 }
229
230 /**
231 * Builds the parts list.
232 *
233 * @param meText any meText parsed from the message
234 */
235 private void buildParts(String meText) {
236 for (int i = 0; i < tokens.size(); ++i) {
237 Token token = tokens.get(i);
238 if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) {
239 parts.add(new Part());
240 }
241 lastPart().add(token);
242 }
243
244 // The first part inherits the meText of the line.
245 if (parts.size() > 0) {
246 parts.get(0).setMeText(meText);
247 }
248 }
249
250 /** Returns the last part in the list. */
251 private Part lastPart() { return parts.get(parts.size() - 1); }
252
253 /**
254 * Looks for a music track (\u266B is first character, everything else is
255 * track info).
256 */
257 private boolean parseMusicTrack() {
258
259 if (parseMusic && text.startsWith(musicNote)) {
260 addToken(new MusicTrack(text.substring(musicNote.length())));
261 nextChar = text.length();
262 return true;
263 }
264 return false;
265 }
266
267 /** Consumes all of the text in the next word . */
268 private void parseText() {
269 StringBuilder buf = new StringBuilder();
270 int start = nextChar;
271 do {
272 char ch = text.charAt(nextChar++);
273 switch (ch) {
274 case '<': buf.append("&lt;"); break;
275 case '>': buf.append("&gt;"); break;
276 case '&': buf.append("&amp;"); break;
277 case '"': buf.append("&quot;"); break;
278 case '\'': buf.append("&apos;"); break;
279 case '\n': buf.append("<br>"); break;
280 default: buf.append(ch); break;
281 }
282 } while (!isWordBreak(nextChar));
283
284 addToken(new Html(text.substring(start, nextChar), buf.toString()));
285 }
286
287 /**
288 * Looks for smileys (e.g., ":)") in the text. The set of known smileys is
289 * loaded from a file into a trie at server start.
290 */
291 private boolean parseSmiley() {
292 if(!parseSmilies) {
293 return false;
294 }
295 TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar,
296 true);
297 if (match == null) {
298 return false;
299 } else {
300 int previousCharClass = getCharClass(nextChar - 1);
301 int nextCharClass = getCharClass(nextChar + match.getText().length());
302 if ((previousCharClass == 2 || previousCharClass == 3)
303 && (nextCharClass == 2 || nextCharClass == 3)) {
304 return false;
305 }
306 addToken(new Smiley(match.getText()));
307 nextChar += match.getText().length();
308 return true;
309 }
310 }
311
312 /** Looks for acronyms (e.g., "lol") in the text.
313 */
314 private boolean parseAcronym() {
315 if(!parseAcronyms) {
316 return false;
317 }
318 TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar);
319 if (match == null) {
320 return false;
321 } else {
322 addToken(new Acronym(match.getText(), match.getValue()));
323 nextChar += match.getText().length();
324 return true;
325 }
326 }
327
328 /** Determines if this is an allowable domain character. */
329 private boolean isDomainChar(char c) {
330 return c == '-' || Character.isLetter(c) || Character.isDigit(c);
331 }
332
333 /** Determines if the given string is a valid domain. */
334 private boolean isValidDomain(String domain) {
335 // For hostnames, check that it ends with a known domain suffix
336 if (matches(getResources().getDomainSuffixes(), reverse(domain))) {
337 return true;
338 }
339 return false;
340 }
341
342 /**
343 * Looks for a URL in two possible forms: either a proper URL with a known
344 * scheme or a domain name optionally followed by a path, query, or query.
345 */
346 private boolean parseURL() {
347 // Make sure this is a valid place to start a URL.
348 if (!parseUrls || !isURLBreak(nextChar)) {
349 return false;
350 }
351
352 int start = nextChar;
353
354 // Search for the first block of letters.
355 int index = start;
356 while ((index < text.length()) && isDomainChar(text.charAt(index))) {
357 index += 1;
358 }
359
360 String url = "";
361 boolean done = false;
362
363 if (index == text.length()) {
364 return false;
365 } else if (text.charAt(index) == ':') {
366 // Make sure this is a known scheme.
367 String scheme = text.substring(nextChar, index);
368 if (!getResources().getSchemes().contains(scheme)) {
369 return false;
370 }
371 } else if (text.charAt(index) == '.') {
372 // Search for the end of the domain name.
373 while (index < text.length()) {
374 char ch = text.charAt(index);
375 if ((ch != '.') && !isDomainChar(ch)) {
376 break;
377 } else {
378 index += 1;
379 }
380 }
381
382 // Make sure the domain name has a valid suffix. Since tries look for
383 // prefix matches, we reverse all the strings to get suffix comparisons.
384 String domain = text.substring(nextChar, index);
385 if (!isValidDomain(domain)) {
386 return false;
387 }
388
389 // Search for a port. We deal with this specially because a colon can
390 // also be a punctuation character.
391 if ((index + 1 < text.length()) && (text.charAt(index) == ':')) {
392 char ch = text.charAt(index + 1);
393 if (Character.isDigit(ch)) {
394 index += 1;
395 while ((index < text.length()) &&
396 Character.isDigit(text.charAt(index))) {
397 index += 1;
398 }
399 }
400 }
401
402 // The domain name should be followed by end of line, whitespace,
403 // punctuation, or a colon, slash, question, or hash character. The
404 // tricky part here is that some URL characters are also punctuation, so
405 // we need to distinguish them. Since we looked for ports above, a colon
406 // is always punctuation here. To distinguish '?' cases, we look at the
407 // character that follows it.
408 if (index == text.length()) {
409 done = true;
410 } else {
411 char ch = text.charAt(index);
412 if (ch == '?') {
413 // If the next character is whitespace or punctuation (or missing),
414 // then this question mark looks like punctuation.
415 if (index + 1 == text.length()) {
416 done = true;
417 } else {
418 char ch2 = text.charAt(index + 1);
419 if (Character.isWhitespace(ch2) || isPunctuation(ch2)) {
420 done = true;
421 }
422 }
423 } else if (isPunctuation(ch)) {
424 done = true;
425 } else if (Character.isWhitespace(ch)) {
426 done = true;
427 } else if ((ch == '/') || (ch == '#')) {
428 // In this case, the URL is not done. We will search for the end of
429 // it below.
430 } else {
431 return false;
432 }
433 }
434
435 // We will assume the user meant HTTP. (One weird case is where they
436 // type a port of 443. That could mean HTTPS, but they might also want
437 // HTTP. We'll let them specify if they don't want HTTP.)
438 url = "http://";
439 } else {
440 return false;
441 }
442
443 // If the URL is not done, search for the end, which is just before the
444 // next whitespace character.
445 if (!done) {
446 while ((index < text.length()) &&
447 !Character.isWhitespace(text.charAt(index))) {
448 index += 1;
449 }
450 }
451
452 String urlText = text.substring(start, index);
453 url += urlText;
454
455 // Figure out the appropriate token type.
456 addURLToken(url, urlText);
457
458 nextChar = index;
459 return true;
460 }
461
462 /**
463 * Adds the appropriate token for the given URL. This might be a simple
464 * link or it might be a recognized media type.
465 */
466 private void addURLToken(String url, String text) {
467 addToken(tokenForUrl(url, text));
468 }
469
470 /**
471 * Deal with formatting characters.
472 *
473 * Parsing is as follows:
474 * - Treat all contiguous strings of formatting characters as one block.
475 * (This method processes one block.)
476 * - Only a single instance of a particular format character within a block
477 * is used to determine whether to turn on/off that type of formatting;
478 * other instances simply print the character itself.
479 * - If the format is to be turned on, we use the _first_ instance; if it
480 * is to be turned off, we use the _last_ instance (by appending the
481 * format.)
482 *
483 * Example:
484 * **string** turns into <b>*string*</b>
485 */
486 private boolean parseFormatting() {
487 if(!parseFormatting) {
488 return false;
489 }
490 int endChar = nextChar;
491 while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) {
492 endChar += 1;
493 }
494
495 if ((endChar == nextChar) || !isWordBreak(endChar)) {
496 return false;
497 }
498
499 // Keeps track of whether we've seen a character (in map if we've seen it)
500 // and whether we should append a closing format token (if value in
501 // map is TRUE). Linked hashmap for consistent ordering.
502 LinkedHashMap<Character, Boolean> seenCharacters =
503 new LinkedHashMap<Character, Boolean>();
504
505 for (int index = nextChar; index < endChar; ++index) {
506 char ch = text.charAt(index);
507 Character key = Character.valueOf(ch);
508 if (seenCharacters.containsKey(key)) {
509 // Already seen this character, just append an unmatched token, which
510 // will print plaintext character
511 addToken(new Format(ch, false));
512 } else {
513 Format start = formatStart.get(key);
514 if (start != null) {
515 // Match the start token, and ask an end token to be appended
516 start.setMatched(true);
517 formatStart.remove(key);
518 seenCharacters.put(key, Boolean.TRUE);
519 } else {
520 // Append start token
521 start = new Format(ch, true);
522 formatStart.put(key, start);
523 addToken(start);
524 seenCharacters.put(key, Boolean.FALSE);
525 }
526 }
527 }
528
529 // Append any necessary end tokens
530 for (Character key : seenCharacters.keySet()) {
531 if (seenCharacters.get(key) == Boolean.TRUE) {
532 Format end = new Format(key.charValue(), false);
533 end.setMatched(true);
534 addToken(end);
535 }
536 }
537
538 nextChar = endChar;
539 return true;
540 }
541
542 /** Determines whether the given index could be a possible word break. */
543 private boolean isWordBreak(int index) {
544 return getCharClass(index - 1) != getCharClass(index);
545 }
546
547 /** Determines whether the given index could be a possible smiley break. */
548 private boolean isSmileyBreak(int index) {
549 if (index > 0 && index < text.length()) {
550 if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) {
551 return true;
552 }
553 }
554
555 return false;
556 }
557
558 /**
559 * Verifies that the character before the given index is end of line,
560 * whitespace, or punctuation.
561 */
562 private boolean isURLBreak(int index) {
563 switch (getCharClass(index - 1)) {
564 case 2:
565 case 3:
566 case 4:
567 return false;
568
569 case 0:
570 case 1:
571 default:
572 return true;
573 }
574 }
575
576 /** Returns the class for the character at the given index. */
577 private int getCharClass(int index) {
578 if ((index < 0) || (text.length() <= index)) {
579 return 0;
580 }
581
582 char ch = text.charAt(index);
583 if (Character.isWhitespace(ch)) {
584 return 1;
585 } else if (Character.isLetter(ch)) {
586 return 2;
587 } else if (Character.isDigit(ch)) {
588 return 3;
589 } else if (isPunctuation(ch)) {
590 // For punctuation, we return a unique value every time so that they are
591 // always different from any other character. Punctuation should always
592 // be considered a possible word break.
593 return ++nextClass;
594 } else {
595 return 4;
596 }
597 }
598
599 /**
600 * Returns true if <code>c1</code> could be the last character of
601 * a smiley and <code>c2</code> could be the first character of
602 * a different smiley, if {@link #isWordBreak} would not already
603 * recognize that this is possible.
604 */
605 private static boolean isSmileyBreak(char c1, char c2) {
606 switch (c1) {
607 /*
608 * These characters can end smileys, but don't normally end words.
609 */
610 case '$': case '&': case '*': case '+': case '-':
611 case '/': case '<': case '=': case '>': case '@':
612 case '[': case '\\': case ']': case '^': case '|':
613 case '}': case '~':
614 switch (c2) {
615 /*
616 * These characters can begin smileys, but don't normally
617 * begin words.
618 */
619 case '#': case '$': case '%': case '*': case '/':
620 case '<': case '=': case '>': case '@': case '[':
621 case '\\': case '^': case '~':
622 return true;
623 }
624 }
625
626 return false;
627 }
628
629 /** Determines whether the given character is punctuation. */
630 private static boolean isPunctuation(char ch) {
631 switch (ch) {
632 case '.': case ',': case '"': case ':': case ';':
633 case '?': case '!': case '(': case ')':
634 return true;
635
636 default:
637 return false;
638 }
639 }
640
641 /**
642 * Determines whether the given character is the beginning or end of a
643 * section with special formatting.
644 */
645 private static boolean isFormatChar(char ch) {
646 switch (ch) {
647 case '*': case '_': case '^':
648 return true;
649
650 default:
651 return false;
652 }
653 }
654
655 /** Represents a unit of parsed output. */
656 public static abstract class Token {
Artur Satayevfc46be72019-11-04 17:50:59 +0000657 @UnsupportedAppUsage(implicitMember =
658 "values()[Lcom/google/android/util/AbstractMessageParser$Token$Type;")
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800659 public enum Type {
Andrei Oneaeecddd52019-03-27 10:32:55 +0000660 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800661 HTML ("html"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000662 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800663 FORMAT ("format"), // subtype of HTML
Andrei Oneaeecddd52019-03-27 10:32:55 +0000664 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800665 LINK ("l"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000666 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800667 SMILEY ("e"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000668 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800669 ACRONYM ("a"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000670 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800671 MUSIC ("m"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000672 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800673 GOOGLE_VIDEO ("v"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000674 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800675 YOUTUBE_VIDEO ("yt"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000676 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800677 PHOTO ("p"),
Andrei Oneaeecddd52019-03-27 10:32:55 +0000678 @UnsupportedAppUsage
The Android Open Source Project9066cfe2009-03-03 19:31:44 -0800679 FLICKR ("f");
680
681 //stringreps for HTML and FORMAT don't really matter
682 //because they don't define getInfo(), which is where it is used
683 //For the other types, code depends on their stringreps
684 private String stringRep;
685
686 Type(String stringRep) {
687 this.stringRep = stringRep;
688 }
689
690 /** {@inheritDoc} */
691 public String toString() {
692 return this.stringRep;
693 }
694 }
695
696 protected Type type;
697 protected String text;
698
699 protected Token(Type type, String text) {
700 this.type = type;
701 this.text = text;
702 }
703
704 /** Returns the type of the token. */
705 public Type getType() { return type; }
706
707 /**
708 * Get the relevant information about a token
709 *
710 * @return a list of strings representing the token, not null
711 * The first item is always a string representation of the type
712 */
713 public List<String> getInfo() {
714 List<String> info = new ArrayList<String>();
715 info.add(getType().toString());
716 return info;
717 }
718
719 /** Returns the raw text of the token. */
720 public String getRawText() { return text; }
721
722 public boolean isMedia() { return false; }
723 public abstract boolean isHtml();
724 public boolean isArray() { return !isHtml(); }
725
726 public String toHtml(boolean caps) { throw new AssertionError("not html"); }
727
728 // The token can change the caps of the text after that point.
729 public boolean controlCaps() { return false; }
730 public boolean setCaps() { return false; }
731 }
732
733 /** Represents a simple string of html text. */
734 public static class Html extends Token {
735 private String html;
736
737 public Html(String text, String html) {
738 super(Type.HTML, text);
739 this.html = html;
740 }
741
742 public boolean isHtml() { return true; }
743 public String toHtml(boolean caps) {
744 return caps ? html.toUpperCase() : html;
745 }
746 /**
747 * Not supported. Info should not be needed for this type
748 */
749 public List<String> getInfo() {
750 throw new UnsupportedOperationException();
751 }
752
753 public void trimLeadingWhitespace() {
754 text = trimLeadingWhitespace(text);
755 html = trimLeadingWhitespace(html);
756 }
757
758 public void trimTrailingWhitespace() {
759 text = trimTrailingWhitespace(text);
760 html = trimTrailingWhitespace(html);
761 }
762
763 private static String trimLeadingWhitespace(String text) {
764 int index = 0;
765 while ((index < text.length()) &&
766 Character.isWhitespace(text.charAt(index))) {
767 ++index;
768 }
769 return text.substring(index);
770 }
771
772 public static String trimTrailingWhitespace(String text) {
773 int index = text.length();
774 while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) {
775 --index;
776 }
777 return text.substring(0, index);
778 }
779 }
780
781 /** Represents a music track token at the beginning. */
782 public static class MusicTrack extends Token {
783 private String track;
784
785 public MusicTrack(String track) {
786 super(Type.MUSIC, track);
787 this.track = track;
788 }
789
790 public String getTrack() { return track; }
791
792 public boolean isHtml() { return false; }
793
794 public List<String> getInfo() {
795 List<String> info = super.getInfo();
796 info.add(getTrack());
797 return info;
798 }
799 }
800
801 /** Represents a link that was found in the input. */
802 public static class Link extends Token {
803 private String url;
804
805 public Link(String url, String text) {
806 super(Type.LINK, text);
807 this.url = url;
808 }
809
810 public String getURL() { return url; }
811
812 public boolean isHtml() { return false; }
813
814 public List<String> getInfo() {
815 List<String> info = super.getInfo();
816 info.add(getURL());
817 info.add(getRawText());
818 return info;
819 }
820 }
821
822 /** Represents a link to a Google Video. */
823 public static class Video extends Token {
824 /** Pattern for a video URL. */
825 private static final Pattern URL_PATTERN = Pattern.compile(
826 "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?"
827 + ".*?\\bdocid=(-?\\d+).*");
828
829 private String docid;
830
831 public Video(String docid, String text) {
832 super(Type.GOOGLE_VIDEO, text);
833 this.docid = docid;
834 }
835
836 public String getDocID() { return docid; }
837
838 public boolean isHtml() { return false; }
839 public boolean isMedia() { return true; }
840
841 /** Returns a Video object if the given url is to a video. */
842 public static Video matchURL(String url, String text) {
843 Matcher m = URL_PATTERN.matcher(url);
844 if (m.matches()) {
845 return new Video(m.group(1), text);
846 } else {
847 return null;
848 }
849 }
850
851 public List<String> getInfo() {
852 List<String> info = super.getInfo();
853 info.add(getRssUrl(docid));
854 info.add(getURL(docid));
855 return info;
856 }
857
858 /** Returns the URL for the RSS description of the given video. */
859 public static String getRssUrl(String docid) {
860 return "http://video.google.com/videofeed"
861 + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid;
862 }
863
864 /** (For testing purposes:) Returns a video URL with the given parts. */
865 public static String getURL(String docid) {
866 return getURL(docid, null);
867 }
868
869 /** (For testing purposes:) Returns a video URL with the given parts. */
870 public static String getURL(String docid, String extraParams) {
871 if (extraParams == null) {
872 extraParams = "";
873 } else if (extraParams.length() > 0) {
874 extraParams += "&";
875 }
876 return "http://video.google.com/videoplay?" + extraParams
877 + "docid=" + docid;
878 }
879 }
880
881 /** Represents a link to a YouTube video. */
882 public static class YouTubeVideo extends Token {
883 /** Pattern for a video URL. */
884 private static final Pattern URL_PATTERN = Pattern.compile(
885 "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?"
886 + ".*\\bv=([-_a-zA-Z0-9=]+).*");
887
888 private String docid;
889
890 public YouTubeVideo(String docid, String text) {
891 super(Type.YOUTUBE_VIDEO, text);
892 this.docid = docid;
893 }
894
895 public String getDocID() { return docid; }
896
897 public boolean isHtml() { return false; }
898 public boolean isMedia() { return true; }
899
900 /** Returns a Video object if the given url is to a video. */
901 public static YouTubeVideo matchURL(String url, String text) {
902 Matcher m = URL_PATTERN.matcher(url);
903 if (m.matches()) {
904 return new YouTubeVideo(m.group(1), text);
905 } else {
906 return null;
907 }
908 }
909
910 public List<String> getInfo() {
911 List<String> info = super.getInfo();
912 info.add(getRssUrl(docid));
913 info.add(getURL(docid));
914 return info;
915 }
916
917 /** Returns the URL for the RSS description of the given video. */
918 public static String getRssUrl(String docid) {
919 return "http://youtube.com/watch?v=" + docid;
920 }
921
922 /** (For testing purposes:) Returns a video URL with the given parts. */
923 public static String getURL(String docid) {
924 return getURL(docid, null);
925 }
926
927 /** (For testing purposes:) Returns a video URL with the given parts. */
928 public static String getURL(String docid, String extraParams) {
929 if (extraParams == null) {
930 extraParams = "";
931 } else if (extraParams.length() > 0) {
932 extraParams += "&";
933 }
934 return "http://youtube.com/watch?" + extraParams + "v=" + docid;
935 }
936
937 /** (For testing purposes:) Returns a video URL with the given parts.
938 * @param http If true, includes http://
939 * @param prefix If non-null/non-blank, adds to URL before youtube.com.
940 * (e.g., prefix="br." --> "br.youtube.com")
941 */
942 public static String getPrefixedURL(boolean http, String prefix,
943 String docid, String extraParams) {
944 String protocol = "";
945
946 if (http) {
947 protocol = "http://";
948 }
949
950 if (prefix == null) {
951 prefix = "";
952 }
953
954 if (extraParams == null) {
955 extraParams = "";
956 } else if (extraParams.length() > 0) {
957 extraParams += "&";
958 }
959
960 return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" +
961 docid;
962 }
963 }
964
965 /** Represents a link to a Picasa photo or album. */
966 public static class Photo extends Token {
967 /** Pattern for an album or photo URL. */
968 // TODO (katyarogers) searchbrowse includes search lists and tags,
969 // it follows a different pattern than albums - would be nice to add later
970 private static final Pattern URL_PATTERN = Pattern.compile(
971 "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?");
972
973 private String user;
974 private String album;
975 private String photo; // null for albums
976
977 public Photo(String user, String album, String photo, String text) {
978 super(Type.PHOTO, text);
979 this.user = user;
980 this.album = album;
981 this.photo = photo;
982 }
983
984 public String getUser() { return user; }
985 public String getAlbum() { return album; }
986 public String getPhoto() { return photo; }
987
988 public boolean isHtml() { return false; }
989 public boolean isMedia() { return true; }
990
991 /** Returns a Photo object if the given url is to a photo or album. */
992 public static Photo matchURL(String url, String text) {
993 Matcher m = URL_PATTERN.matcher(url);
994 if (m.matches()) {
995 return new Photo(m.group(1), m.group(2), m.group(3), text);
996 } else {
997 return null;
998 }
999 }
1000
1001 public List<String> getInfo() {
1002 List<String> info = super.getInfo();
1003 info.add(getRssUrl(getUser()));
1004 info.add(getAlbumURL(getUser(), getAlbum()));
1005 if (getPhoto() != null) {
1006 info.add(getPhotoURL(getUser(), getAlbum(), getPhoto()));
1007 } else {
1008 info.add((String)null);
1009 }
1010 return info;
1011 }
1012
1013 /** Returns the URL for the RSS description of the user's albums. */
1014 public static String getRssUrl(String user) {
1015 return "http://picasaweb.google.com/data/feed/api/user/" + user +
1016 "?category=album&alt=rss";
1017 }
1018
1019 /** Returns the URL for an album. */
1020 public static String getAlbumURL(String user, String album) {
1021 return "http://picasaweb.google.com/" + user + "/" + album;
1022 }
1023
1024 /** Returns the URL for a particular photo. */
1025 public static String getPhotoURL(String user, String album, String photo) {
1026 return "http://picasaweb.google.com/" + user + "/" + album + "/photo#"
1027 + photo;
1028 }
1029 }
1030
1031 /** Represents a link to a Flickr photo or album. */
1032 public static class FlickrPhoto extends Token {
1033 /** Pattern for a user album or photo URL. */
1034 private static final Pattern URL_PATTERN = Pattern.compile(
1035 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*");
1036 private static final Pattern GROUPING_PATTERN = Pattern.compile(
1037 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" +
1038 "([^/?#&]+)/?");
1039
1040 private static final String SETS = "sets";
1041 private static final String TAGS = "tags";
1042
1043 private String user;
1044 private String photo; // null for user album
1045 private String grouping; // either "tags" or "sets"
1046 private String groupingId; // sets or tags identifier
1047
1048 public FlickrPhoto(String user, String photo, String grouping,
1049 String groupingId, String text) {
1050 super(Type.FLICKR, text);
1051
1052 /* System wide tags look like the URL to a Flickr user. */
1053 if (!TAGS.equals(user)) {
1054 this.user = user;
1055 // Don't consider slide show URL a photo
1056 this.photo = (!"show".equals(photo) ? photo : null);
1057 this.grouping = grouping;
1058 this.groupingId = groupingId;
1059 } else {
1060 this.user = null;
1061 this.photo = null;
1062 this.grouping = TAGS;
1063 this.groupingId = photo;
1064 }
1065 }
1066
1067 public String getUser() { return user; }
1068 public String getPhoto() { return photo; }
1069 public String getGrouping() { return grouping; }
1070 public String getGroupingId() { return groupingId; }
1071
1072 public boolean isHtml() { return false; }
1073 public boolean isMedia() { return true; }
1074
1075 /**
1076 * Returns a FlickrPhoto object if the given url is to a photo or Flickr
1077 * user.
1078 */
1079 public static FlickrPhoto matchURL(String url, String text) {
1080 Matcher m = GROUPING_PATTERN.matcher(url);
1081 if (m.matches()) {
1082 return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text);
1083 }
1084
1085 m = URL_PATTERN.matcher(url);
1086 if (m.matches()) {
1087 return new FlickrPhoto(m.group(1), m.group(2), null, null, text);
1088 } else {
1089 return null;
1090 }
1091 }
1092
1093 public List<String> getInfo() {
1094 List<String> info = super.getInfo();
1095 info.add(getUrl());
1096 info.add(getUser() != null ? getUser() : "");
1097 info.add(getPhoto() != null ? getPhoto() : "");
1098 info.add(getGrouping() != null ? getGrouping() : "");
1099 info.add(getGroupingId() != null ? getGroupingId() : "");
1100 return info;
1101 }
1102
1103 public String getUrl() {
1104 if (SETS.equals(grouping)) {
1105 return getUserSetsURL(user, groupingId);
1106 } else if (TAGS.equals(grouping)) {
1107 if (user != null) {
1108 return getUserTagsURL(user, groupingId);
1109 } else {
1110 return getTagsURL(groupingId);
1111 }
1112 } else if (photo != null) {
1113 return getPhotoURL(user, photo);
1114 } else {
1115 return getUserURL(user);
1116 }
1117 }
1118
1119 /** Returns the URL for the RSS description. */
1120 public static String getRssUrl(String user) {
1121 return null;
1122 }
1123
1124 /** Returns the URL for a particular tag. */
1125 public static String getTagsURL(String tag) {
1126 return "http://flickr.com/photos/tags/" + tag;
1127 }
1128
1129 /** Returns the URL to the user's Flickr homepage. */
1130 public static String getUserURL(String user) {
1131 return "http://flickr.com/photos/" + user;
1132 }
1133
1134 /** Returns the URL for a particular photo. */
1135 public static String getPhotoURL(String user, String photo) {
1136 return "http://flickr.com/photos/" + user + "/" + photo;
1137 }
1138
1139 /** Returns the URL for a user tag photo set. */
1140 public static String getUserTagsURL(String user, String tagId) {
1141 return "http://flickr.com/photos/" + user + "/tags/" + tagId;
1142 }
1143
1144 /** Returns the URL for user set. */
1145 public static String getUserSetsURL(String user, String setId) {
1146 return "http://flickr.com/photos/" + user + "/sets/" + setId;
1147 }
1148 }
1149
1150 /** Represents a smiley that was found in the input. */
1151 public static class Smiley extends Token {
1152 // TODO: Pass the SWF URL down to the client.
1153
1154 public Smiley(String text) {
1155 super(Type.SMILEY, text);
1156 }
1157
1158 public boolean isHtml() { return false; }
1159
1160 public List<String> getInfo() {
1161 List<String> info = super.getInfo();
1162 info.add(getRawText());
1163 return info;
1164 }
1165 }
1166
1167 /** Represents an acronym that was found in the input. */
1168 public static class Acronym extends Token {
1169 private String value;
1170 // TODO: SWF
1171
1172 public Acronym(String text, String value) {
1173 super(Type.ACRONYM, text);
1174 this.value = value;
1175 }
1176
1177 public String getValue() { return value; }
1178
1179 public boolean isHtml() { return false; }
1180
1181 public List<String> getInfo() {
1182 List<String> info = super.getInfo();
1183 info.add(getRawText());
1184 info.add(getValue());
1185 return info;
1186 }
1187 }
1188
1189 /** Represents a character that changes formatting. */
1190 public static class Format extends Token {
1191 private char ch;
1192 private boolean start;
1193 private boolean matched;
1194
1195 public Format(char ch, boolean start) {
1196 super(Type.FORMAT, String.valueOf(ch));
1197 this.ch = ch;
1198 this.start = start;
1199 }
1200
1201 public void setMatched(boolean matched) { this.matched = matched; }
1202
1203 public boolean isHtml() { return true; }
1204
1205 public String toHtml(boolean caps) {
1206 // This character only implies special formatting if it was matched.
1207 // Otherwise, it was just a plain old character.
1208 if (matched) {
1209 return start ? getFormatStart(ch) : getFormatEnd(ch);
1210 } else {
1211 // We have to make sure we escape HTML characters as usual.
1212 return (ch == '"') ? "&quot;" : String.valueOf(ch);
1213 }
1214 }
1215
1216 /**
1217 * Not supported. Info should not be needed for this type
1218 */
1219 public List<String> getInfo() {
1220 throw new UnsupportedOperationException();
1221 }
1222
1223 public boolean controlCaps() { return (ch == '^'); }
1224 public boolean setCaps() { return start; }
1225
1226 private String getFormatStart(char ch) {
1227 switch (ch) {
1228 case '*': return "<b>";
1229 case '_': return "<i>";
1230 case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps
1231 case '"': return "<font color=\"#999999\">\u201c";
1232 default: throw new AssertionError("unknown format '" + ch + "'");
1233 }
1234 }
1235
1236 private String getFormatEnd(char ch) {
1237 switch (ch) {
1238 case '*': return "</b>";
1239 case '_': return "</i>";
1240 case '^': return "</font></b>"; // TODO: all caps
1241 case '"': return "\u201d</font>";
1242 default: throw new AssertionError("unknown format '" + ch + "'");
1243 }
1244 }
1245 }
1246
1247 /** Adds the given token to the parsed output. */
1248 private void addToken(Token token) {
1249 tokens.add(token);
1250 }
1251
1252 /** Converts the entire message into a single HTML display string. */
1253 public String toHtml() {
1254 StringBuilder html = new StringBuilder();
1255
1256 for (Part part : parts) {
1257 boolean caps = false;
1258
1259 html.append("<p>");
1260 for (Token token : part.getTokens()) {
1261 if (token.isHtml()) {
1262 html.append(token.toHtml(caps));
1263 } else {
1264 switch (token.getType()) {
1265 case LINK:
1266 html.append("<a href=\"");
1267 html.append(((Link)token).getURL());
1268 html.append("\">");
1269 html.append(token.getRawText());
1270 html.append("</a>");
1271 break;
1272
1273 case SMILEY:
1274 // TODO: link to an appropriate image
1275 html.append(token.getRawText());
1276 break;
1277
1278 case ACRONYM:
1279 html.append(token.getRawText());
1280 break;
1281
1282 case MUSIC:
1283 // TODO: include a music glyph
1284 html.append(((MusicTrack)token).getTrack());
1285 break;
1286
1287 case GOOGLE_VIDEO:
1288 // TODO: include a Google Video icon
1289 html.append("<a href=\"");
1290 html.append(((Video)token).getURL(((Video)token).getDocID()));
1291 html.append("\">");
1292 html.append(token.getRawText());
1293 html.append("</a>");
1294 break;
1295
1296 case YOUTUBE_VIDEO:
1297 // TODO: include a YouTube icon
1298 html.append("<a href=\"");
1299 html.append(((YouTubeVideo)token).getURL(
1300 ((YouTubeVideo)token).getDocID()));
1301 html.append("\">");
1302 html.append(token.getRawText());
1303 html.append("</a>");
1304 break;
1305
1306 case PHOTO: {
1307 // TODO: include a Picasa Web icon
1308 html.append("<a href=\"");
1309 html.append(Photo.getAlbumURL(
1310 ((Photo)token).getUser(), ((Photo)token).getAlbum()));
1311 html.append("\">");
1312 html.append(token.getRawText());
1313 html.append("</a>");
1314 break;
1315 }
1316
1317 case FLICKR:
1318 // TODO: include a Flickr icon
1319 Photo p = (Photo) token;
1320 html.append("<a href=\"");
1321 html.append(((FlickrPhoto)token).getUrl());
1322 html.append("\">");
1323 html.append(token.getRawText());
1324 html.append("</a>");
1325 break;
1326
1327 default:
1328 throw new AssertionError("unknown token type: " + token.getType());
1329 }
1330 }
1331
1332 if (token.controlCaps()) {
1333 caps = token.setCaps();
1334 }
1335 }
1336 html.append("</p>\n");
1337 }
1338
1339 return html.toString();
1340 }
1341
1342 /** Returns the reverse of the given string. */
1343 protected static String reverse(String str) {
1344 StringBuilder buf = new StringBuilder();
1345 for (int i = str.length() - 1; i >= 0; --i) {
1346 buf.append(str.charAt(i));
1347 }
1348 return buf.toString();
1349 }
1350
1351 public static class TrieNode {
1352 private final HashMap<Character,TrieNode> children =
1353 new HashMap<Character,TrieNode>();
1354 private String text;
1355 private String value;
1356
1357 public TrieNode() { this(""); }
1358 public TrieNode(String text) {
1359 this.text = text;
1360 }
1361
1362 public final boolean exists() { return value != null; }
1363 public final String getText() { return text; }
1364 public final String getValue() { return value; }
1365 public void setValue(String value) { this.value = value; }
1366
1367 public TrieNode getChild(char ch) {
1368 return children.get(Character.valueOf(ch));
1369 }
1370
1371 public TrieNode getOrCreateChild(char ch) {
1372 Character key = Character.valueOf(ch);
1373 TrieNode node = children.get(key);
1374 if (node == null) {
1375 node = new TrieNode(text + String.valueOf(ch));
1376 children.put(key, node);
1377 }
1378 return node;
1379 }
1380
1381 /** Adds the given string into the trie. */
1382 public static void addToTrie(TrieNode root, String str, String value) {
1383 int index = 0;
1384 while (index < str.length()) {
1385 root = root.getOrCreateChild(str.charAt(index++));
1386 }
1387 root.setValue(value);
1388 }
1389 }
1390
1391
1392
1393 /** Determines whether the given string is in the given trie. */
1394 private static boolean matches(TrieNode root, String str) {
1395 int index = 0;
1396 while (index < str.length()) {
1397 root = root.getChild(str.charAt(index++));
1398 if (root == null) {
1399 break;
1400 } else if (root.exists()) {
1401 return true;
1402 }
1403 }
1404 return false;
1405 }
1406
1407 /**
1408 * Returns the longest substring of the given string, starting at the given
1409 * index, that exists in the trie.
1410 */
1411 private static TrieNode longestMatch(
1412 TrieNode root, AbstractMessageParser p, int start) {
1413 return longestMatch(root, p, start, false);
1414 }
1415
1416 /**
1417 * Returns the longest substring of the given string, starting at the given
1418 * index, that exists in the trie, with a special tokenizing case for
1419 * smileys if specified.
1420 */
1421 private static TrieNode longestMatch(
1422 TrieNode root, AbstractMessageParser p, int start, boolean smiley) {
1423 int index = start;
1424 TrieNode bestMatch = null;
1425 while (index < p.getRawText().length()) {
1426 root = root.getChild(p.getRawText().charAt(index++));
1427 if (root == null) {
1428 break;
1429 } else if (root.exists()) {
1430 if (p.isWordBreak(index)) {
1431 bestMatch = root;
1432 } else if (smiley && p.isSmileyBreak(index)) {
1433 bestMatch = root;
1434 }
1435 }
1436 }
1437 return bestMatch;
1438 }
1439
1440
1441 /** Represents set of tokens that are delivered as a single message. */
1442 public static class Part {
1443 private String meText;
1444 private ArrayList<Token> tokens;
1445
1446 public Part() {
1447 this.tokens = new ArrayList<Token>();
1448 }
1449
1450 public String getType(boolean isSend) {
1451 return (isSend ? "s" : "r") + getPartType();
1452 }
1453
1454 private String getPartType() {
1455 if (isMedia()) {
1456 return "d";
1457 } else if (meText != null) {
1458 return "m";
1459 } else {
1460 return "";
1461 }
1462 }
1463
1464 public boolean isMedia() {
1465 return (tokens.size() == 1) && tokens.get(0).isMedia();
1466 }
1467 /**
1468 * Convenience method for getting the Token of a Part that represents
1469 * a media Token. Parts of this kind will always only have a single Token
1470 *
1471 * @return if this.isMedia(),
1472 * returns the Token representing the media contained in this Part,
1473 * otherwise returns null;
1474 */
1475 public Token getMediaToken() {
1476 if(isMedia()) {
1477 return tokens.get(0);
1478 }
1479 return null;
1480 }
1481
1482 /** Adds the given token to this part. */
1483 public void add(Token token) {
1484 if (isMedia()) {
1485 throw new AssertionError("media ");
1486 }
1487 tokens.add(token);
1488 }
1489
1490 public void setMeText(String meText) {
1491 this.meText = meText;
1492 }
1493
1494 /** Returns the original text of this part. */
1495 public String getRawText() {
1496 StringBuilder buf = new StringBuilder();
1497 if (meText != null) {
1498 buf.append(meText);
1499 }
1500 for (int i = 0; i < tokens.size(); ++i) {
1501 buf.append(tokens.get(i).getRawText());
1502 }
1503 return buf.toString();
1504 }
1505
1506 /** Returns the tokens in this part. */
1507 public ArrayList<Token> getTokens() { return tokens; }
1508
1509 /** Adds the tokens into the given builder as an array. */
1510// public void toArray(JSArrayBuilder array) {
1511// if (isMedia()) {
1512// // For media, we send its array (i.e., we don't wrap this in another
1513// // array as we do for non-media parts).
1514// tokens.get(0).toArray(array);
1515// } else {
1516// array.beginArray();
1517// addToArray(array);
1518// array.endArray();
1519// }
1520// }
1521 }
1522}