# common/tools/make-iana-tld-pattern.py - platform/frameworks/base - Gitiles

 #!/usr/bin/env python

 from urllib2 import urlopen

 # Java source printed ahead of the generated alternation for the
 # TOP_LEVEL_DOMAIN pattern.  It stops right after "Pattern.compile(" -- the
 # opening quote of the regex literal is produced by the first bucket dump.
 TLD_PREFIX = r"""
     /**
      *  Regular expression pattern to match all IANA top-level domains.
      *  List accurate as of 2010/02/05.  List taken from:
      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
      *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
      */
     public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
 """
 # Closes the Java string literal and the Pattern.compile( call.
 TLD_SUFFIX = '");'

 # Java source printed ahead of the generated alternation for the WEB_URL
 # pattern: scheme, optional user-info and the named-host part, ending with the
 # opening of the non-capturing group that receives the TLD alternation.
 # Raw string: every backslash below is emitted verbatim into the Java source.
 URL_PREFIX = r"""
     /**
      *  Regular expression pattern to match RFC 1738 URLs
      *  List accurate as of 2010/02/05.  List taken from:
      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
      *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
      */
     public static final Pattern WEB_URL = Pattern.compile(
         "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
         + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
         + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
         + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
         + "(?:"   // plus top level domain
 """

 # Java source printed after the generated alternation for WEB_URL: the
 # IP-address alternative, optional port, optional path/query and the trailing
 # word-boundary check, followed by the end of the Pattern.compile( call.
 URL_SUFFIX = r"""
         + "|(?:(?:25[0-5]|2[0-4]" // or ip address
         + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
         + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
         + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
         + "|[1-9][0-9]|[0-9])))"
         + "(?:\\:\\d{1,5})?)" // plus option port number
         + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
         + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
         + "(?:\\b|$)"); // and finally, a word boundary or end of
                         // input.  This is to stop foo.sure from
                         // matching as foo.su
 """

 class Bucket:
     """Holds the TLD entries that share one initial character.

     Two-character entries are reduced to their second character and kept in
     ``letters``; every other entry is kept whole in ``words``.
     """

     def __init__(self, baseLetter):
         # Initial character shared by every entry of this bucket.
         self.base = baseLetter
         # Entries stored whole (any length other than two).
         self.words = []
         # Second characters of the two-character entries.
         self.letters = []

     def dump(self, isWebUrl=False, isFirst=False, isLast=False):
         """Render the bucket as one line of Java string-concatenation source.

         Sorts ``words`` and ``letters`` in place before rendering.

         isWebUrl -- use a non-capturing ``(?:`` group around the words and a
                     ``+ "`` opener when this is the first bucket.
         isFirst  -- open the literal with ``"(`` (or ``+ "`` for web URLs)
                     instead of the ``+ "|`` continuation.
         isLast   -- leave the string literal open: no closing quote and no
                     newline are appended.

         Returns '' when the bucket holds nothing.  Each '-' inside a word is
         preceded by two backslash characters in the result.
         """
         if not self.words and not self.letters:
             return ''

         self.words.sort()
         self.letters.sort()

         hasWords = len(self.words) != 0
         pieces = ['        ']

         # How the Java literal for this line is opened.
         if not isFirst:
             pieces.append('+ "|')
         elif isWebUrl:
             pieces.append('+ "')
         else:
             pieces.append('"(')

         if hasWords:
             pieces.append('(?:' if isWebUrl else '(')
             pieces.append(
                 '|'.join(word.replace('-', '\\\\-') for word in self.words))
             if len(self.letters) > 0:
                 pieces.append('|')

         # A single two-letter entry is written out; several become a
         # character class after the shared first letter.
         letterCount = len(self.letters)
         if letterCount == 1:
             pieces.append('%c%c' % (self.base, self.letters[0]))
         elif letterCount > 1:
             pieces.append('%c[%s]' % (self.base, ''.join(self.letters)))

         if hasWords:
             pieces.append(')')

         if not isLast:
             pieces.append('"\n')

         return ''.join(pieces)

     def add(self, line):
         """Store one entry; empty strings and '#' comment lines are ignored."""
         size = len(line)

         if line.startswith('#') or size == 0:
             return

         if size != 2:
             self.words.append(line)
         else:
             self.letters.append(line[1:2])

 def getBucket(buckets, line):
     """Return the bucket registered under the first character of ``line``.

     When ``buckets`` has no entry for that character (or the entry is
     ``None``), a fresh ``Bucket`` is created, stored in ``buckets`` and
     returned.
     """
     key = line[0]
     existing = buckets.get(key)
     if existing is not None:
         return existing

     created = Bucket(key)
     buckets[key] = created
     return created

 def makePattern(prefix, suffix, buckets, isWebUrl=False):
     """Print one generated Java ``Pattern`` declaration to stdout.

     prefix   -- Java source emitted before the TLD alternation.
     suffix   -- Java source emitted after the TLD alternation.
     buckets  -- dict mapping an initial letter to its ``Bucket``; letters
                 'a'..'z' that are missing get an empty bucket inserted by
                 ``getBucket``.
     isWebUrl -- render for the WEB_URL pattern (non-capturing groups and a
                 ``))"`` closer) instead of the TOP_LEVEL_DOMAIN pattern.

     Returns None; the assembled text is written with ``print``.
     """
     # Only buckets with content produce output.  The first/last flags are
     # assigned among those, so the opening and closing of the Java literal
     # stay correct even when the 'a' or 'z' bucket happens to be empty.
     populated = []
     for code in range(ord('a'), ord('z') + 1):
         bucket = getBucket(buckets, chr(code))
         if bucket.words or bucket.letters:
             populated.append(bucket)

     pieces = [prefix]
     lastIndex = len(populated) - 1
     for index, bucket in enumerate(populated):
         pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                   isFirst=(index == 0),
                                   isLast=(index == lastIndex)))

     # The last bucket leaves its string literal open; close the regex groups
     # here (and, for WEB_URL, the literal itself) before the suffix.
     if isWebUrl:
         pieces.append('))"')
     else:
         pieces.append(')')

     pieces.append(suffix)

     # Parenthesised single-argument form is valid under Python 2 and 3.
     print(''.join(pieces))

 if __name__ == "__main__":
     # Download the IANA TLD list, one entry per line.
     f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
     try:
         domains = f.readlines()
     finally:
         # Release the connection even if the read fails part-way.
         f.close()

     buckets = {}

     for domain in domains:
         # Normalise before looking at the first character so that blank
         # lines and '#' comment lines never create a bucket of their own.
         domain = domain.strip().lower()

         if not domain or domain.startswith('#'):
             continue

         getBucket(buckets, domain[0]).add(domain)

     # Emit both Java declarations from the same set of buckets.
     makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
     makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
	#!/usr/bin/env python

	from urllib2 import urlopen

	# Java source printed ahead of the generated alternation for the
	# TOP_LEVEL_DOMAIN pattern.  It stops right after "Pattern.compile(" -- the
	# opening quote of the regex literal is produced by the first bucket dump.
	TLD_PREFIX = r"""
	    /**
	     *  Regular expression pattern to match all IANA top-level domains.
	     *  List accurate as of 2010/02/05.  List taken from:
	     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
	     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
	     */
	    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
	"""
	# Closes the Java string literal and the Pattern.compile( call.
	TLD_SUFFIX = '");'

	# Java source printed ahead of the generated alternation for the WEB_URL
	# pattern: scheme, optional user-info and the named-host part, ending with the
	# opening of the non-capturing group that receives the TLD alternation.
	# Raw string: every backslash below is emitted verbatim into the Java source,
	# and '|' must stay unescaped so it remains a regex alternation.
	URL_PREFIX = r"""
	    /**
	     *  Regular expression pattern to match RFC 1738 URLs
	     *  List accurate as of 2010/02/05.  List taken from:
	     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
	     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
	     */
	    public static final Pattern WEB_URL = Pattern.compile(
	        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
	        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
	        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
	        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
	        + "(?:"   // plus top level domain
	"""

	# Java source printed after the generated alternation for WEB_URL: the
	# IP-address alternative, optional port, optional path/query and the trailing
	# word-boundary check, followed by the end of the Pattern.compile( call.
	URL_SUFFIX = r"""
	        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
	        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
	        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
	        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
	        + "|[1-9][0-9]|[0-9])))"
	        + "(?:\\:\\d{1,5})?)" // plus option port number
	        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
	        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
	        + "(?:\\b|$)"); // and finally, a word boundary or end of
	                        // input.  This is to stop foo.sure from
	                        // matching as foo.su
	"""

	class Bucket:
	    """Holds the TLD entries that share one initial character.

	    Two-character entries are reduced to their second character and kept in
	    ``letters``; every other entry is kept whole in ``words``.
	    """

	    def __init__(self, baseLetter):
	        # Initial character shared by every entry of this bucket.
	        self.base = baseLetter
	        # Entries stored whole (any length other than two).
	        self.words = []
	        # Second characters of the two-character entries.
	        self.letters = []

	    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
	        """Render the bucket as one line of Java string-concatenation source.

	        Sorts ``words`` and ``letters`` in place before rendering.

	        isWebUrl -- use a non-capturing ``(?:`` group around the words and a
	                    ``+ "`` opener when this is the first bucket.
	        isFirst  -- open the literal with ``"(`` (or ``+ "`` for web URLs)
	                    instead of the ``+ "|`` continuation.
	        isLast   -- leave the string literal open: no closing quote and no
	                    newline are appended.

	        Returns '' when the bucket holds nothing.  Each '-' inside a word is
	        preceded by two backslash characters in the result.
	        """
	        if not self.words and not self.letters:
	            return ''

	        self.words.sort()
	        self.letters.sort()

	        hasWords = len(self.words) != 0
	        # Eight spaces align the line with the surrounding Java source.
	        pieces = ['        ']

	        # How the Java literal for this line is opened.  The alternation
	        # bar is a bare '|': a backslash in front of it would end up in the
	        # generated regex.
	        if not isFirst:
	            pieces.append('+ "|')
	        elif isWebUrl:
	            pieces.append('+ "')
	        else:
	            pieces.append('"(')

	        if hasWords:
	            pieces.append('(?:' if isWebUrl else '(')
	            pieces.append(
	                '|'.join(word.replace('-', '\\\\-') for word in self.words))
	            if len(self.letters) > 0:
	                pieces.append('|')

	        # A single two-letter entry is written out; several become a
	        # character class after the shared first letter.
	        letterCount = len(self.letters)
	        if letterCount == 1:
	            pieces.append('%c%c' % (self.base, self.letters[0]))
	        elif letterCount > 1:
	            pieces.append('%c[%s]' % (self.base, ''.join(self.letters)))

	        if hasWords:
	            pieces.append(')')

	        if not isLast:
	            pieces.append('"\n')

	        return ''.join(pieces)

	    def add(self, line):
	        """Store one entry; empty strings and '#' comment lines are ignored."""
	        size = len(line)

	        if line.startswith('#') or size == 0:
	            return

	        if size != 2:
	            self.words.append(line)
	        else:
	            self.letters.append(line[1:2])

	def getBucket(buckets, line):
	    """Return the bucket registered under the first character of ``line``.

	    When ``buckets`` has no entry for that character (or the entry is
	    ``None``), a fresh ``Bucket`` is created, stored in ``buckets`` and
	    returned.
	    """
	    key = line[0]
	    existing = buckets.get(key)
	    if existing is not None:
	        return existing

	    created = Bucket(key)
	    buckets[key] = created
	    return created

	def makePattern(prefix, suffix, buckets, isWebUrl=False):
	    """Print one generated Java ``Pattern`` declaration to stdout.

	    prefix   -- Java source emitted before the TLD alternation.
	    suffix   -- Java source emitted after the TLD alternation.
	    buckets  -- dict mapping an initial letter to its ``Bucket``; letters
	                'a'..'z' that are missing get an empty bucket inserted by
	                ``getBucket``.
	    isWebUrl -- render for the WEB_URL pattern (non-capturing groups and a
	                ``))"`` closer) instead of the TOP_LEVEL_DOMAIN pattern.

	    Returns None; the assembled text is written with ``print``.
	    """
	    # Only buckets with content produce output.  The first/last flags are
	    # assigned among those, so the opening and closing of the Java literal
	    # stay correct even when the 'a' or 'z' bucket happens to be empty.
	    populated = []
	    for code in range(ord('a'), ord('z') + 1):
	        bucket = getBucket(buckets, chr(code))
	        if bucket.words or bucket.letters:
	            populated.append(bucket)

	    pieces = [prefix]
	    lastIndex = len(populated) - 1
	    for index, bucket in enumerate(populated):
	        pieces.append(bucket.dump(isWebUrl=isWebUrl,
	                                  isFirst=(index == 0),
	                                  isLast=(index == lastIndex)))

	    # The last bucket leaves its string literal open; close the regex groups
	    # here (and, for WEB_URL, the literal itself) before the suffix.
	    if isWebUrl:
	        pieces.append('))"')
	    else:
	        pieces.append(')')

	    pieces.append(suffix)

	    # Parenthesised single-argument form is valid under Python 2 and 3.
	    print(''.join(pieces))

	if __name__ == "__main__":
	    # Download the IANA TLD list, one entry per line.
	    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
	    try:
	        domains = f.readlines()
	    finally:
	        # Release the connection even if the read fails part-way.
	        f.close()

	    buckets = {}

	    for domain in domains:
	        # Normalise before looking at the first character so that blank
	        # lines and '#' comment lines never create a bucket of their own.
	        domain = domain.strip().lower()

	        if not domain or domain.startswith('#'):
	            continue

	        getBucket(buckets, domain[0]).add(domain)

	    # Emit both Java declarations from the same set of buckets.
	    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
	    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)