blob: d7cca9337e7eb065b237b5fd1a2f459e6cffcb2b [file] [log] [blame]
Ying Wang823b6f32010-06-15 13:54:16 -07001#!/usr/bin/env python
2
3from urllib2 import urlopen
4
# Java source text printed ahead of the generated TOP_LEVEL_DOMAIN_STR pattern:
# a Javadoc comment followed by the start of the constant declaration.
# This is a raw string that is emitted verbatim, so its whitespace is significant.
# NOTE(review): the "List accurate as of" date is a hard-coded literal; it is
# not refreshed when the script is re-run and must be edited by hand.
TLD_PREFIX = r"""
 /**
 * Regular expression to match all IANA top-level domains.
 * List accurate as of 2011/07/18. List taken from:
 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
 */
 public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Appended after the pattern: closes the still-open Java string literal of the
# last emitted line and terminates the declaration.
TLD_SUFFIX = '";'
15
# Java source text printed ahead of the generated
# TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL pattern. Unlike TLD_PREFIX it already
# contains the first string literal, "(?:", which opens a non-capturing group.
# This is a raw string that is emitted verbatim, so its whitespace is significant.
# NOTE(review): the "List accurate as of" date is a hard-coded literal; it is
# not refreshed when the script is re-run and must be edited by hand.
URL_PREFIX = r"""
 /**
 * Regular expression to match all IANA top-level domains for WEB_URL.
 * List accurate as of 2011/07/18. List taken from:
 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
 */
 public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
 "(?:"
"""

# Appended after the pattern; makePattern has already closed the final string
# literal for the web-URL variant, so only the statement terminator remains.
URL_SUFFIX = ';'
28
class Bucket(object):
    """Group of TLDs sharing one leading character.

    Two-character TLDs are stored compactly as just their second character
    (in ``letters``); every other entry is stored whole (in ``words``).
    ``dump`` renders the group as one line of Java source containing a
    string literal with a regex alternation for the group.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter  # leading character shared by all entries
        self.words = []         # entries whose length is not 2, stored whole
        self.letters = []       # second characters of the 2-character entries

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one line of Java string-literal source.

        Sorts ``words`` and ``letters`` in place as a side effect.

        Args:
            isWebUrl: use a non-capturing ``(?:`` group around the words and
                start the first line with ``+ "`` (the web-URL prefix already
                holds the opening literal).
            isFirst: this is the first emitted bucket, so no leading ``|``
                alternation separator is written.
            isLast: this is the last emitted bucket; the string literal is
                left open and no newline is added so the caller can append
                the closing text.

        Returns:
            The source line, or ``''`` when the bucket holds nothing.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        # Leading whitespace of the emitted Java line.
        pieces = [' ']

        if not isFirst:
            pieces.append('+ "|')
        elif isWebUrl:
            pieces.append('+ "')
        else:
            pieces.append('"(')

        if self.words:
            pieces.append('(?:' if isWebUrl else '(')
            # Each '-' is written as two backslashes plus '-' so that the Java
            # string literal yields the regex escape '\-'.
            pieces.append(
                '|'.join(word.replace('-', '\\\\-') for word in self.words))
            if self.letters:
                pieces.append('|')

        if len(self.letters) == 1:
            # A single two-character TLD is spelled out directly.
            pieces.append(self.base + self.letters[0])
        elif self.letters:
            # Several two-character TLDs collapse into base + character class.
            pieces.append('%s[%s]' % (self.base, ''.join(self.letters)))

        if self.words:
            pieces.append(')')

        if not isLast:
            pieces.append('"\n')

        return ''.join(pieces)

    def add(self, line):
        """Record one entry; empty strings and '#' comment lines are ignored.

        Args:
            line: the entry text (already stripped and lower-cased by the
                caller). A 2-character entry contributes only its second
                character to ``letters``; anything else goes to ``words``.
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1])
        else:
            self.words.append(line)
100
def getBucket(buckets, line):
    """Return the bucket keyed by the first character of *line*.

    When *buckets* has no (non-None) entry for that character, a new Bucket
    is created, stored in *buckets* under that character, and returned.
    Only ``line[0]`` is used, so *line* must be non-empty.
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
110
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print the Java declaration for one TLD pattern to stdout.

    Only the buckets keyed 'a' through 'z' are rendered, in alphabetical
    order; buckets under any other key are ignored. The first and last
    flags are assigned among the buckets that actually hold entries, so the
    opening and closing text stays well-formed even when the 'a' or 'z'
    bucket is empty or missing. *buckets* is not modified.

    Args:
        prefix: text printed before the pattern lines.
        suffix: text printed after the closing parentheses.
        buckets: dict mapping a leading character to a bucket object that
            exposes ``words``, ``letters`` and ``dump(...)``.
        isWebUrl: emit the web-URL variant (non-capturing groups and
            ``))"`` as closing text instead of ``)``).
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    parts = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        parts.append(bucket.dump(isWebUrl=isWebUrl,
                                 isFirst=(index == 0),
                                 isLast=(index == lastIndex)))

    # The last bucket line is left unterminated by dump(); close it here.
    parts.append('))"' if isWebUrl else ')')
    parts.append(suffix)

    # Parenthesised single-argument form behaves the same on Python 2 and 3.
    print(''.join(parts))
129
if __name__ == "__main__":
    # Script entry point: download the IANA TLD list, bucket every entry by
    # its first character, then print both Java pattern declarations.
    # NOTE: Python 2 only as written (urllib2, str.decode('punycode'), and the
    # u'...' shape of repr() on unicode objects).
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even if reading the response fails.
        f.close()

    buckets = {}

    for domain in domains:
        # Normalise once so bucketing, the prefix test and the stored value
        # all see the same text; this also skips blank lines outright.
        domain = domain.lower().strip()
        if not domain:
            continue

        # '#' comment lines are passed through; Bucket.add() discards them.
        getBucket(buckets, domain[0]).add(domain)

        if domain.startswith('xn--'):
            # Internationalised TLD: additionally register the decoded form.
            puny = domain[4:]
            result = puny.decode('punycode')
            # repr() of the unicode object looks like u'\uXXXX...'; dropping
            # the leading "u'" and the trailing "'" leaves the escaped text.
            result = repr(result)
            # getBucket keys on the first character only, so this lands in
            # the 'x' bucket alongside the xn-- form itself.
            getBucket(buckets, 'xn--').add(result[2:-1])

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)