#!/usr/bin/env python
| 2 | |
import sys
from urllib2 import urlopen
| 4 | |
# Java source templates wrapped around the generated regular expressions.
#
# NOTE(review): this text was recovered from a listing that collapses runs of
# whitespace, so the leading indentation of the Java lines inside the templates
# could not be recovered -- confirm against the canonical copy of this script
# before regenerating Patterns-style sources.

# Emitted ahead of the plain top-level-domain pattern.
TLD_PREFIX = r"""
/**
* Regular expression to match all IANA top-level domains.
* List accurate as of 2011/07/18. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
*/
public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the last Java string literal of the plain pattern and ends the statement.
TLD_SUFFIX = '";'

# Emitted ahead of the WEB_URL variant; note that it already opens a
# non-capturing group as the first Java string literal.
URL_PREFIX = r"""
/**
* Regular expression to match all IANA top-level domains for WEB_URL.
* List accurate as of 2011/07/18. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
*/
public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
"(?:"
"""

# Only terminates the Java statement; the closing quote of the WEB_URL variant
# is produced by makePattern itself.
URL_SUFFIX = ';'
| 28 | |
class Bucket(object):
    """Collects the domain names filed under one letter and renders them.

    Names are split into two groups: two-character names, of which only the
    second character is stored in ``letters``, and every other name, stored
    whole in ``words``. ``dump`` turns both groups into one line of Java
    source holding a regular-expression alternation.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket.

        baseLetter -- the character that is put in front of the stored
                      second letters when the bucket is rendered.
        """
        self.base = baseLetter
        self.words = []    # names that are not exactly two characters long
        self.letters = []  # second character of each two-character name

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render the bucket as one line of a Java string concatenation.

        isWebUrl -- use a non-capturing group ``(?:`` instead of ``(`` around
                    the alternatives, and start the first line with ``+ "``
                    instead of ``"(``.
        isFirst  -- this is the first rendered line; otherwise the line starts
                    with ``+ "|`` so it continues the previous alternation.
        isLast   -- leave the string literal open (no closing quote and no
                    newline) so the caller can append to it.

        Returns the rendered line, or '' when the bucket holds nothing.
        Sorts ``words`` and ``letters`` in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Each '-' is preceded by two backslash characters in the emitted
        # text (escape the '-' character).
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            # Several two-character names collapse into base + character class.
            alternatives.append('%s[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)

        # Grouping parentheses are only emitted when whole words are present;
        # a bucket made solely of two-character names is emitted bare.
        if self.words:
            if isWebUrl:
                body = '(?:' + body + ')'
            else:
                body = '(' + body + ')'

        # NOTE(review): the leading pad is a single space in the listing this
        # was recovered from, which collapses whitespace runs -- the intended
        # Java indentation may be wider; confirm before regenerating.
        line = ' ' + opener + body

        if not isLast:
            line += '"\n'

        return line

    def add(self, line):
        """File one name into the bucket.

        Empty strings and strings starting with '#' are ignored. A name of
        exactly two characters contributes its second character to
        ``letters``; any other name is appended to ``words`` unchanged.
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)
| 100 | |
def getBucket(buckets, line):
    """Return the bucket for ``line``, creating and registering it if needed.

    buckets -- dict mapping a single character to its Bucket; a newly created
               bucket is stored in it.
    line    -- non-empty string; only its first character is used as the key,
               so passing a whole name or just its first letter is equivalent.

    Raises IndexError when ``line`` is empty.
    """
    key = line[0]
    found = buckets.get(key)

    if found is not None:
        return found

    created = Bucket(key)
    buckets[key] = created
    return created
| 110 | |
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Write one complete Java pattern declaration to standard output.

    prefix   -- text emitted before the generated lines.
    suffix   -- text emitted after the closing parenthesis/parentheses.
    buckets  -- dict mapping a character to a bucket object exposing
                ``words``, ``letters`` and ``dump(isWebUrl, isFirst, isLast)``.
                Only the keys 'a' through 'z' are consulted; missing keys are
                treated as empty and are not created.
    isWebUrl -- forwarded to every ``dump`` call; also selects the closing
                text: ``))"`` when true, ``)`` otherwise.

    The first and last markers are given to the first and last buckets that
    actually hold names, rather than to 'a' and 'z' unconditionally, so an
    empty 'a' or 'z' bucket no longer yields a dangling ``+ "|`` opener or a
    missing final fragment. When no bucket holds any name, only the prefix,
    the closing text and the suffix are written.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    pieces = [prefix]
    final = len(populated) - 1
    for position, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(position == 0),
                                  isLast=(position == final)))

    # The last dumped line is left without its closing quote, so the closing
    # text is appended directly to it.
    if isWebUrl:
        pieces.append('))"')
    else:
        pieces.append(')')

    pieces.append(suffix)

    # Same bytes as the former print statement (text plus a newline), but
    # valid syntax on both Python 2 and Python 3.
    sys.stdout.write(''.join(pieces) + '\n')
| 129 | |
if __name__ == "__main__":
    # Download the IANA list; the try/finally closes the handle even when
    # reading fails part-way.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        f.close()

    buckets = {}

    for domain in domains:
        # Normalise once, before choosing a bucket, so surrounding whitespace
        # can never become a bucket key.
        domain = domain.strip().lower()

        # Blank lines and '#' comment lines carry no domain name; skipping
        # them here avoids creating throw-away buckets for them.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

        if domain.startswith('xn--'):
            # Also register the decoded form of a punycode name. getBucket
            # only looks at the first character, so this lands in the 'x'
            # bucket next to the encoded name.
            puny = domain[4:]
            result = repr(puny.decode('punycode'))
            # Drops the first two and the last character of the repr --
            # presumably the u' prefix and closing quote of a Python 2
            # unicode repr, leaving only the escaped text; confirm if this
            # script is ever ported to Python 3.
            getBucket(buckets, 'xn--').add(result[2:-1])

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)