Add back the lost Python script.
The script generates the regular expressions that match top-level domains.
This version is enhanced and is used to regenerate the patterns for the new top-level domains.
new file: common/tools/make-iana-tld-pattern.py
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
new file mode 100755
index 0000000..ece4dcf
--- /dev/null
+++ b/common/tools/make-iana-tld-pattern.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+from urllib2 import urlopen
+
+TLD_PREFIX = r"""
+ /**
+ * Regular expression pattern to match all IANA top-level domains.
+ * List accurate as of 2010/02/05. List taken from:
+ * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+ */
+ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+"""
+TLD_SUFFIX = '");'
+# TLD_* wrap the bare TLD alternation; URL_* embed it in the WEB_URL regex.
+URL_PREFIX = r"""
+ /**
+ * Regular expression pattern to match RFC 1738 URLs
+ * List accurate as of 2010/02/05. List taken from:
+ * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+ */
+ public static final Pattern WEB_URL = Pattern.compile(
+ "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
+ + "(?:" // plus top level domain
+"""
+# All templates are raw strings, so every doubled backslash is printed verbatim.
+URL_SUFFIX = r"""
+ + "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ + "|[1-9][0-9]|[0-9])))"
+ + "(?:\\:\\d{1,5})?)" // plus option port number
+ + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
+ + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ + "(?:\\b|$)"); // and finally, a word boundary or end of
+ // input. This is to stop foo.sure from
+ // matching as foo.su
+"""
+
+class Bucket:
+    """Holds the entries filed under one base letter.
+
+    add() sorts each entry into `words` or `letters`; dump() renders both
+    lists as a single regex alternation inside a Java string literal.
+    """
+
+    def __init__(self, baseLetter):
+        self.base = baseLetter  # prefix character re-attached to `letters` by dump()
+        self.words = []  # entries that are not two characters long, kept whole
+        self.letters = []  # second character of each two-character entry
+
+    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
+        """Return this bucket as one line of Java string-literal source.
+
+        Returns '' when the bucket holds nothing.  Sorts `words` and `letters`
+        in place.  `isFirst` selects the opening fragment, `isWebUrl` makes the
+        word group non-capturing, and `isLast` leaves the string literal
+        unterminated so the caller can append the closing text.
+        """
+        if not self.words and not self.letters:
+            return ''
+
+        self.words.sort()
+        self.letters.sort()
+
+        # Opening fragment: start a new literal, or continue the alternation.
+        if not isFirst:
+            opening = '+ "|'
+        elif isWebUrl:
+            opening = '+ "'
+        else:
+            opening = '"('
+        pieces = [' ', opening]
+
+        if self.words:
+            pieces.append('(?:' if isWebUrl else '(')
+            # Each '-' is emitted with two backslash characters in front of it.
+            pieces.append('|'.join(w.replace('-', '\\\\-') for w in self.words))
+            if self.letters:
+                pieces.append('|')
+
+        # Two-character entries collapse to base + letter or base + [letters].
+        if len(self.letters) == 1:
+            pieces.append('%c%c' % (self.base, self.letters[0]))
+        elif self.letters:
+            pieces.append('%c[%s]' % (self.base, ''.join(self.letters)))
+
+        # Close the word group; the literal stays open on the last bucket.
+        if self.words:
+            pieces.append(')')
+        if not isLast:
+            pieces.append('"')
+        pieces.append('\n')
+
+        return ''.join(pieces)
+
+    def add(self, line):
+        """File `line` in this bucket; '#' comment lines and '' are ignored.
+
+        Two-character lines contribute only their second character to
+        `letters`; every other line is appended to `words` unchanged.
+        """
+        size = len(line)
+        if line.startswith('#') or not size:
+            return
+
+        if size == 2:
+            self.letters.append(line[1:2])
+        else:
+            self.words.append(line)
+
+def getBucket(buckets, line):
+    """Return the bucket keyed by the first character of `line`.
+
+    A new Bucket is created and stored in `buckets` when none exists yet.
+    """
+    key = line[0]
+    if buckets.get(key) is None:
+        buckets[key] = Bucket(key)
+    return buckets[key]
+
+def makePattern(prefix, suffix, buckets, isWebUrl=False):
+    """Print prefix + one Java string-literal line per non-empty bucket + suffix.
+
+    Buckets are emitted in a-z order.  The first and last *non-empty* buckets
+    open and close the alternation, so a missing 'a' or 'z' bucket no longer
+    yields an unbalanced pattern.  The `buckets` dict itself is not modified.
+    """
+    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]
+    used = [b for b in map(buckets.get, letters) if b and (b.words or b.letters)]
+    output = prefix
+    for index, bucket in enumerate(used):
+        output += bucket.dump(isWebUrl=isWebUrl, isFirst=(index == 0),
+                              isLast=(index == len(used) - 1))
+    # Close the groups left open by the last dump() call.
+    output += '))"' if isWebUrl else ')'
+    output += suffix
+    # print(...) with a single argument behaves the same on Python 2 and 3.
+    print(output)
+
+if __name__ == "__main__":
+    # Download the IANA list, bucket each TLD by first letter, print both patterns.
+    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
+    try:
+        domains = f.readlines()
+    finally:
+        f.close()  # release the connection even if the read fails
+    buckets = {}
+    for domain in domains:
+        # Strip first so blank and '#' comment lines never create a bucket.
+        domain = domain.strip().lower()
+        if domain and not domain.startswith('#'):
+            getBucket(buckets, domain[0]).add(domain)
+    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
+    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)