#
# Secret Labs' Regular Expression Engine
#
# re-compatible interface for the sre matching engine
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#

r"""Support for regular expressions (RE).

This module provides regular expression matching operations similar to
those found in Perl. It supports both 8-bit and Unicode strings; both
the pattern and the strings being processed can contain null bytes and
characters outside the US ASCII range.

Regular expressions can contain both special and ordinary characters.
Most ordinary characters, like "A", "a", or "0", are the simplest
regular expressions; they simply match themselves. You can
concatenate ordinary characters, so last matches the string 'last'.

The special characters are:
    "."      Matches any character except a newline.
    "^"      Matches the start of the string.
    "$"      Matches the end of the string or just before the newline at
             the end of the string.
    "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
             Greedy means that it will match as many repetitions as possible.
    "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
    "?"      Matches 0 or 1 (greedy) of the preceding RE.
    *?,+?,?? Non-greedy versions of the previous three special characters.
    {m,n}    Matches from m to n repetitions of the preceding RE.
    {m,n}?   Non-greedy version of the above.
    "\\"     Either escapes special characters or signals a special sequence.
    []       Indicates a set of characters.
             A "^" as the first character indicates a complementing set.
    "|"      A|B, creates an RE that will match either A or B.
    (...)    Matches the RE inside the parentheses.
             The contents can be retrieved or matched later in the string.
    (?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below).
    (?:...)  Non-grouping version of regular parentheses.
    (?P<name>...) The substring matched by the group is accessible by name.
    (?P=name)     Matches the text matched earlier by the group named name.
    (?#...)  A comment; ignored.
    (?=...)  Matches if ... matches next, but doesn't consume the string.
    (?!...)  Matches if ... doesn't match next.
    (?<=...) Matches if preceded by ... (must be fixed length).
    (?<!...) Matches if not preceded by ... (must be fixed length).
    (?(id/name)yes|no) Matches yes pattern if the group with id/name matched,
                       the (optional) no pattern otherwise.

The special sequences consist of "\\" and a character from the list
below. If the ordinary character is not on the list, then the
resulting RE will match the second character.
    \number  Matches the contents of the group of the same number.
    \A       Matches only at the start of the string.
    \Z       Matches only at the end of the string.
    \b       Matches the empty string, but only at the start or end of a word.
    \B       Matches the empty string, but not at the start or end of a word.
    \d       Matches any decimal digit; equivalent to the set [0-9] in
             bytes patterns or string patterns with the ASCII flag.
             In string patterns without the ASCII flag, it will match the
             whole range of Unicode digits.
    \D       Matches any non-digit character; equivalent to [^\d].
    \s       Matches any whitespace character; equivalent to [ \t\n\r\f\v] in
             bytes patterns or string patterns with the ASCII flag.
             In string patterns without the ASCII flag, it will match the
             whole range of Unicode whitespace characters.
    \S       Matches any non-whitespace character; equivalent to [^\s].
    \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
             in bytes patterns or string patterns with the ASCII flag.
             In string patterns without the ASCII flag, it will match the
             range of Unicode alphanumeric characters (letters plus digits
             plus underscore).
             With LOCALE, it will match the set [0-9_] plus characters defined
             as letters for the current locale.
    \W       Matches the complement of \w.
    \\       Matches a literal backslash.
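
For example, this illustrative pattern combines a word boundary, a named
group, and a backreference to match a doubled word such as "the the":

    r"\b(?P<word>\w+)\s+(?P=word)\b"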

This module exports the following functions:
    match    Match a regular expression pattern to the beginning of a string.
    search   Search a string for the presence of a pattern.
    sub      Substitute occurrences of a pattern found in a string.
    subn     Same as sub, but also return the number of substitutions made.
    split    Split a string by the occurrences of a pattern.
    findall  Find all occurrences of a pattern in a string.
    finditer Return an iterator yielding a match object for each match.
    compile  Compile a pattern into a pattern object.
    purge    Clear the regular expression cache.
    escape   Backslash all non-alphanumerics in a string.
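
A small illustrative session (the pattern and strings are arbitrary
examples, not part of the API):

    >>> import re
    >>> re.search(r"(\w+)@(\w+)\.org", "mail info@pythonware.org").groups()
    ('info', 'pythonware')
    >>> re.sub(r"\s+", "-", "hello   world")
    'hello-world'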

Some of the functions in this module take flags as optional parameters:
    A  ASCII       For string patterns, make \w, \W, \b, \B, \d, \D
                   match the corresponding ASCII character categories
                   (rather than the whole Unicode categories, which is the
                   default).
                   For bytes patterns, this flag is the only available
                   behaviour and needn't be specified.
    I  IGNORECASE  Perform case-insensitive matching.
    L  LOCALE      Make \w, \W, \b, \B dependent on the current locale.
    M  MULTILINE   "^" matches the beginning of lines (after a newline)
                   as well as the string.
                   "$" matches the end of lines (before a newline) as well
                   as the end of the string.
    S  DOTALL      "." matches any character at all, including the newline.
    X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
    U  UNICODE     For compatibility only. Ignored for string patterns (it
                   is the default), and forbidden for bytes patterns.
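
For instance (an illustrative combination), flags can be OR'ed together:

    >>> re.search(r"^b", "a\nB", re.MULTILINE | re.IGNORECASE).group()
    'B'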

This module also defines an exception 'error'.

"""

import sys
import sre_compile
import sre_parse

# public symbols
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
    "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
    "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
    "UNICODE", "error" ]

__version__ = "2.2.1"

# flags
A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments

# sre extensions (experimental, don't rely on these)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation

# sre exception
error = sre_compile.error

# --------------------------------------------------------------------
# public interface

def match(pattern, string, flags=0):
    """Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found."""
    return _compile(pattern, flags).match(string)

def search(pattern, string, flags=0):
    """Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found."""
    return _compile(pattern, flags).search(string)

def sub(pattern, repl, string, count=0, flags=0):
    """Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl. repl can be either a string or a callable;
    if a string, backslash escapes in it are processed. If it is
    a callable, it's passed the match object and must return
    a replacement string to be used."""
    return _compile(pattern, flags).sub(repl, string, count)
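
# Illustrative note (not executed): a callable replacement receives the
# match object; for example,
#     sub(r"\d+", lambda m: str(int(m.group()) * 2), "3 and 5")
# would return '6 and 10'.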

def subn(pattern, repl, string, count=0, flags=0):
    """Return a 2-tuple containing (new_string, number).
    new_string is the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in the source
    string by the replacement repl. number is the number of
    substitutions that were made. repl can be either a string or a
    callable; if a string, backslash escapes in it are processed.
    If it is a callable, it's passed the match object and must
    return a replacement string to be used."""
    return _compile(pattern, flags).subn(repl, string, count)

def split(pattern, string, maxsplit=0, flags=0):
    """Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings."""
    return _compile(pattern, flags).split(string, maxsplit)

def findall(pattern, string, flags=0):
    """Return a list of all non-overlapping matches in the string.

    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.

    Empty matches are included in the result."""
    return _compile(pattern, flags).findall(string)
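
# Illustrative note (not executed): with two groups in the pattern,
#     findall(r"(\w+)=(\d+)", "a=1, b=2")
# returns a list of tuples, [('a', '1'), ('b', '2')].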

if sys.hexversion >= 0x02020000:
    __all__.append("finditer")
    def finditer(pattern, string, flags=0):
        """Return an iterator over all non-overlapping matches in the
        string. For each match, the iterator returns a match object.

        Empty matches are included in the result."""
        return _compile(pattern, flags).finditer(string)

def compile(pattern, flags=0):
    "Compile a regular expression pattern, returning a pattern object."
    return _compile(pattern, flags)

def purge():
    "Clear the regular expression cache"
    _cache.clear()
    _cache_repl.clear()

def template(pattern, flags=0):
    "Compile a template pattern, returning a pattern object"
    return _compile(pattern, flags|T)

_alphanum_str = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
_alphanum_bytes = frozenset(
    b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")

def escape(pattern):
    "Escape all non-alphanumeric characters in pattern."
    if isinstance(pattern, str):
        alphanum = _alphanum_str
        s = list(pattern)
        for i in range(len(pattern)):
            c = pattern[i]
            if c not in alphanum:
                if c == "\000":
                    s[i] = "\\000"
                else:
                    s[i] = "\\" + c
        return "".join(s)
    else:
        alphanum = _alphanum_bytes
        s = []
        esc = ord(b"\\")
        for c in pattern:
            if c in alphanum:
                s.append(c)
            else:
                if c == 0:
                    s.extend(b"\\000")
                else:
                    s.append(esc)
                    s.append(c)
        return bytes(s)
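
# Illustrative note (not executed): escape("www.python.org") returns
# 'www\\.python\\.org'; every character not in [a-zA-Z0-9] is prefixed
# with a backslash.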

# --------------------------------------------------------------------
# internals

_cache = {}
_cache_repl = {}

_pattern_type = type(sre_compile.compile("", 0))

_MAXCACHE = 500

def _shrink_cache(cache_dict, max_length, divisor=5):
    """Make room in the given cache.

    Args:
      cache_dict: The cache dictionary to modify.
      max_length: Maximum # of entries in cache_dict before it is shrunk.
      divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value. Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits. Normally this should not happen
        # but it could due to multithreading.
        return
    number_to_toss = max_length // divisor + overage
    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        # re->random->warnings->tokenize->string->re
        return
    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

def _compile(*key):
    # internal: compile pattern
    cachekey = (type(key[0]),) + key
    p = _cache.get(cachekey)
    if p is not None:
        return p
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                "Cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    p = sre_compile.compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _shrink_cache(_cache, _MAXCACHE)
    _cache[cachekey] = p
    return p

def _compile_repl(*key):
    # internal: compile replacement pattern
    p = _cache_repl.get(key)
    if p is not None:
        return p
    repl, pattern = key
    p = sre_parse.parse_template(repl, pattern)
    if len(_cache_repl) >= _MAXCACHE:
        _shrink_cache(_cache_repl, _MAXCACHE)
    _cache_repl[key] = p
    return p

def _expand(pattern, match, template):
    # internal: match.expand implementation hook
    template = sre_parse.parse_template(template, pattern)
    return sre_parse.expand_template(template, match)

def _subx(pattern, template):
    # internal: pattern.sub/subn implementation helper
    template = _compile_repl(template, pattern)
    if not template[0] and len(template[1]) == 1:
        # literal replacement
        return template[1][0]
    def filter(match, template=template):
        return sre_parse.expand_template(template, match)
    return filter

# register myself for pickling

import copyreg

def _pickle(p):
    return _compile, (p.pattern, p.flags)

copyreg.pickle(_pattern_type, _pickle, _compile)

# --------------------------------------------------------------------
# experimental stuff (see python-dev discussions for details)

class Scanner:
    def __init__(self, lexicon, flags=0):
        from sre_constants import BRANCH, SUBPATTERN
        self.lexicon = lexicon
        # combine phrases into a compound pattern
        p = []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
                ]))
        s.groups = len(p)+1
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
    def scan(self, string):
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while 1:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                break
            action = self.lexicon[m.lastindex-1][1]
            if hasattr(action, "__call__"):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j
        return result, string[i:]
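
# Illustrative sketch (not executed): a tiny lexer built on Scanner. The
# token names below are arbitrary examples; actions take (scanner, token)
# and return the value to collect, or None to skip the token.
#
#   scanner = Scanner([
#       (r"\d+",     lambda s, tok: ("NUMBER", tok)),
#       (r"[a-z]+",  lambda s, tok: ("NAME", tok)),
#       (r"[+*/=-]", lambda s, tok: ("OP", tok)),
#       (r"\s+",     None),              # skip whitespace
#   ])
#   tokens, remainder = scanner.scan("sum = 3 + 42")
#   # tokens == [('NAME', 'sum'), ('OP', '='), ('NUMBER', '3'),
#   #            ('OP', '+'), ('NUMBER', '42')] and remainder == ''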