Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py

commit: b4489000d1355d10426bb4ad4eedb5c2fb80e886 [log] [tgz]
author: Shimeng (Simon) Wang <swang@google.com> Wed Feb 10 11:22:01 2010 -0800
committer: Shimeng (Simon) Wang <swang@google.com> Wed Feb 10 11:22:01 2010 -0800
tree: 4f32cbbd94751e9c63755cff815b3f5f69fd349f
parent: 3ed6fbd9e141f20ca382306aa6a355cd544158d1 [diff]
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
new file mode 100755
index 0000000..ece4dcf
--- /dev/null
+++ b/common/tools/make-iana-tld-pattern.py

@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+from urllib2 import urlopen
+
+TLD_PREFIX = r"""
+    /**
+     *  Regular expression pattern to match all IANA top-level domains.
+     *  List accurate as of 2010/02/05.  List taken from:
+     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+     */
+    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+"""
+TLD_SUFFIX = '");'
+
+URL_PREFIX = r"""
+    /**
+     *  Regular expression pattern to match RFC 1738 URLs
+     *  List accurate as of 2010/02/05.  List taken from:
+     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+     *  This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
+     */
+    public static final Pattern WEB_URL = Pattern.compile(
+        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
+        + "(?:"   // plus top level domain
+"""
+
+URL_SUFFIX = r"""
+        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
+        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+        + "|[1-9][0-9]|[0-9])))"
+        + "(?:\\:\\d{1,5})?)" // plus option port number
+        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
+        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+        + "(?:\\b|$)"); // and finally, a word boundary or end of
+                        // input.  This is to stop foo.sure from
+                        // matching as foo.su
+"""
+
+class Bucket:
+    def __init__(self, baseLetter):
+        self.base=baseLetter
+        self.words=[]
+        self.letters=[]
+
+    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
+        if (len(self.words) == 0) and (len(self.letters) == 0):
+            return ''
+
+        self.words.sort()
+        self.letters.sort()
+
+        output = '        ';
+
+        if isFirst:
+            if isWebUrl:
+                output += '+ "'
+            else:
+                output += '"('
+        else:
+            output += '+ "|'
+
+        if len(self.words) != 0:
+            output += '('
+
+            if isWebUrl:
+                output += '?:'
+
+        firstWord = 1
+        for word in self.words:
+            if firstWord == 0:
+                output += '|'
+            firstWord = 0
+            for letter in word:
+                if letter == '-':
+                    output += '\\\\'  # escape the '-' character.
+                output += letter
+
+        if len(self.words) > 0 and len(self.letters) > 0:
+            output += '|'
+
+        if len(self.letters) == 1:
+            output += '%c%c' % (self.base, self.letters[0])
+        elif len(self.letters) > 0:
+            output += '%c[' % self.base
+
+            for letter in self.letters:
+                output += letter
+
+            output += ']'
+
+        if len(self.words) != 0:
+            output += ')'
+
+        if not isLast:
+            output += '"'
+            output += '\n'
+
+        return output;
+
+    def add(self, line):
+        length = len(line)
+
+        if line.startswith('#') or (length == 0):
+            return;
+
+        if length == 2:
+            self.letters.append(line[1:2])
+        else:
+            self.words.append(line)
+
+def getBucket(buckets, line):
+    letter = line[0]
+    bucket = buckets.get(letter)
+
+    if bucket is None:
+        bucket = Bucket(letter)
+        buckets[letter] = bucket
+
+    return bucket
+
+def makePattern(prefix, suffix, buckets, isWebUrl=False):
+    output = prefix
+
+    output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)
+
+    for letter in range(ord('b'), ord('z')):
+        output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)
+
+    output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)
+
+    if isWebUrl:
+        output += '))"'
+    else:
+        output += ')'
+
+    output += suffix
+
+    print output
+
+if __name__ == "__main__":
+    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
+    domains = f.readlines()
+    f.close()
+
+    buckets = {}
+
+    for domain in domains:
+        domain = domain.lower()
+
+        if len(domain) > 0:
+            getBucket(buckets, domain[0]).add(domain.strip())
+
+    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
+    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
commit	b4489000d1355d10426bb4ad4eedb5c2fb80e886	[log] [tgz]
author	Shimeng (Simon) Wang <swang@google.com>	Wed Feb 10 11:22:01 2010 -0800
committer	Shimeng (Simon) Wang <swang@google.com>	Wed Feb 10 11:22:01 2010 -0800
tree	4f32cbbd94751e9c63755cff815b3f5f69fd349f
parent	3ed6fbd9e141f20ca382306aa6a355cd544158d1 [diff]