Rewritten using the tokenize module, which gives us a real tokenizer
rather than a number of approximating regular expressions.
Alas, it is 3-4 times slower. Let that be a challenge for the
tokenize module.
diff --git a/Lib/pyclbr.py b/Lib/pyclbr.py
index 9bc68c5..1901a82 100644
--- a/Lib/pyclbr.py
+++ b/Lib/pyclbr.py
@@ -4,10 +4,11 @@
definitions and to find out the superclasses of a class.
The interface consists of a single function:
- readmodule(module, path)
+ readmodule_ex(module [, path[, inpackage]])
module is the name of a Python module, path is an optional list of
directories where the module is to be searched. If present, path is
-prepended to the system search path sys.path.
+prepended to the system search path sys.path. (inpackage is used
+internally to search for a submodule of a package.)
The return value is a dictionary. The keys of the dictionary are
the names of the classes defined in the module (including classes
that are defined via the from XXX import YYY construct). The values
@@ -28,12 +29,10 @@
are recognized and imported modules are scanned as well, this
shouldn't happen often.
+XXX describe the Function class.
+
BUGS
-- Continuation lines are not dealt with at all, except inside strings.
- Nested classes and functions can confuse it.
-- Code that doesn't pass tabnanny or python -t will confuse it, unless
- you set the module TABWIDTH vrbl (default 8) to the correct tab width
- for the file.
PACKAGE RELATED BUGS
- If you have a package and a module inside that or another package
@@ -52,69 +51,11 @@
import sys
import imp
-import re
-import string
+import tokenize # Python tokenizer
+from token import NAME
__all__ = ["readmodule"]
-TABWIDTH = 8
-
-_getnext = re.compile(r"""
- (?P<String>
- \""" [^"\\]* (?:
- (?: \\. | "(?!"") )
- [^"\\]*
- )*
- \"""
-
- | ''' [^'\\]* (?:
- (?: \\. | '(?!'') )
- [^'\\]*
- )*
- '''
-
- | " [^"\\\n]* (?: \\. [^"\\\n]*)* "
-
- | ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
- )
-
-| (?P<Method>
- ^
- (?P<MethodIndent> [ \t]* )
- def [ \t]+
- (?P<MethodName> [a-zA-Z_] \w* )
- [ \t]* \(
- )
-
-| (?P<Class>
- ^
- (?P<ClassIndent> [ \t]* )
- class [ \t]+
- (?P<ClassName> [a-zA-Z_] \w* )
- [ \t]*
- (?P<ClassSupers> \( [^)\n]* \) )?
- [ \t]* :
- )
-
-| (?P<Import>
- ^ import [ \t]+
- (?P<ImportList> [^#;\n]+ )
- )
-
-| (?P<ImportFrom>
- ^ from [ \t]+
- (?P<ImportFromPath>
- [a-zA-Z_] \w*
- (?:
- [ \t]* \. [ \t]* [a-zA-Z_] \w*
- )*
- )
- [ \t]+
- import [ \t]+
- (?P<ImportFromList> [^#;\n]+ )
- )
-""", re.VERBOSE | re.DOTALL | re.MULTILINE).search
-
_modules = {} # cache of modules we've seen
# each Python class is represented by an instance of this class
@@ -140,7 +81,7 @@
def _addmethod(self, name, lineno):
assert 0, "Function._addmethod() shouldn't be called"
-def readmodule(module, path=[], inpackage=0):
+def readmodule(module, path=[], inpackage=False):
'''Backwards compatible interface.
Like readmodule_ex() but strips Function objects from the
@@ -153,7 +94,7 @@
res[key] = value
return res
-def readmodule_ex(module, path=[], inpackage=0):
+def readmodule_ex(module, path=[], inpackage=False):
'''Read a module file and return a dictionary of classes.
Search for MODULE in PATH and sys.path, read and parse the
@@ -168,7 +109,7 @@
package = module[:i].strip()
submodule = module[i+1:].strip()
parent = readmodule_ex(package, path, inpackage)
- child = readmodule_ex(submodule, parent['__path__'], 1)
+ child = readmodule_ex(submodule, parent['__path__'], True)
return child
if module in _modules:
@@ -204,129 +145,144 @@
_modules[module] = dict
classstack = [] # stack of (class, indent) pairs
- src = f.read()
- f.close()
- # To avoid having to stop the regexp at each newline, instead
- # when we need a line number we simply count the number of
- # newlines in the string since the last time we did this; i.e.,
- # lineno += src.count('\n', last_lineno_pos, here)
- # last_lineno_pos = here
- lineno, last_lineno_pos = 1, 0
- i = 0
- while 1:
- m = _getnext(src, i)
- if not m:
- break
- start, i = m.span()
-
- if m.start("Method") >= 0:
- # found a method definition or function
- thisindent = _indent(m.group("MethodIndent"))
- meth_name = m.group("MethodName")
- lineno += src.count('\n', last_lineno_pos, start)
- last_lineno_pos = start
- # close all classes indented at least as much
- while classstack and \
- classstack[-1][1] >= thisindent:
- del classstack[-1]
- if classstack:
- # it's a class method
- cur_class = classstack[-1][0]
- cur_class._addmethod(meth_name, lineno)
- else:
- # it's a function
- f = Function(module, meth_name,
- file, lineno)
- dict[meth_name] = f
-
- elif m.start("String") >= 0:
- pass
-
- elif m.start("Class") >= 0:
- # we found a class definition
- thisindent = _indent(m.group("ClassIndent"))
- # close all classes indented at least as much
- while classstack and \
- classstack[-1][1] >= thisindent:
- del classstack[-1]
- lineno += src.count('\n', last_lineno_pos, start)
- last_lineno_pos = start
- class_name = m.group("ClassName")
- inherit = m.group("ClassSupers")
- if inherit:
- # the class inherits from other classes
- inherit = inherit[1:-1].strip()
- names = []
- for n in inherit.split(','):
- n = n.strip()
- if n in dict:
- # we know this super class
- n = dict[n]
- else:
- c = n.split('.')
- if len(c) > 1:
- # super class
- # is of the
- # form module.class:
- # look in
- # module for class
- m = c[-2]
- c = c[-1]
- if m in _modules:
- d = _modules[m]
- if c in d:
- n = d[c]
- names.append(n)
- inherit = names
- # remember this class
- cur_class = Class(module, class_name, inherit,
- file, lineno)
- dict[class_name] = cur_class
- classstack.append((cur_class, thisindent))
-
- elif m.start("Import") >= 0:
- # import module
- for n in m.group("ImportList").split(','):
- n = n.strip()
+ g = tokenize.generate_tokens(f.readline)
+ try:
+ for tokentype, token, start, end, line in g:
+ if token == 'def':
+ lineno, thisindent = start
+ tokentype, meth_name, start, end, line = g.next()
+ if tokentype != NAME:
+ continue # Syntax error
+ # close all classes indented at least as much
+ while classstack and \
+ classstack[-1][1] >= thisindent:
+ del classstack[-1]
+ if classstack:
+ # it's a class method
+ cur_class = classstack[-1][0]
+ cur_class._addmethod(meth_name, lineno)
+ else:
+ # it's a function
+ dict[meth_name] = Function(module, meth_name, file, lineno)
+ elif token == 'class':
+ lineno, thisindent = start
+ tokentype, class_name, start, end, line = g.next()
+ if tokentype != NAME:
+ continue # Syntax error
+ # close all classes indented at least as much
+ while classstack and \
+ classstack[-1][1] >= thisindent:
+ del classstack[-1]
+ # parse what follows the class name
+ tokentype, token, start, end, line = g.next()
+ inherit = None
+ if token == '(':
+ names = [] # List of superclasses
+ # there's a list of superclasses
+ level = 1
+ super = [] # Tokens making up current superclass
+ while True:
+ tokentype, token, start, end, line = g.next()
+ if token in (')', ',') and level == 1:
+ n = "".join(super)
+ if n in dict:
+ # we know this super class
+ n = dict[n]
+ else:
+ c = n.split('.')
+ if len(c) > 1:
+ # super class is of the form
+ # module.class: look in module for
+ # class
+ m = c[-2]
+ c = c[-1]
+ if m in _modules:
+ d = _modules[m]
+ if c in d:
+ n = d[c]
+ names.append(n)
+ if token == '(':
+ level += 1
+ elif token == ')':
+ level -= 1
+ if level == 0:
+ break
+ elif token == ',' and level == 1:
+ pass
+ else:
+ super.append(token)
+ inherit = names
+ cur_class = Class(module, class_name, inherit, file, lineno)
+ dict[class_name] = cur_class
+ classstack.append((cur_class, thisindent))
+ elif token == 'import' and start[1] == 0:
+ modules = _getnamelist(g)
+ for mod, mod2 in modules:
+ readmodule_ex(mod, path, inpackage)
+ elif token == 'from' and start[1] == 0:
+ mod, token = _getname(g)
+ if not mod or token != "import":
+ continue
+ names = _getnamelist(g)
try:
# recursively read the imported module
- d = readmodule_ex(n, path, inpackage)
+ d = readmodule_ex(mod, path, inpackage)
except:
- ##print 'module', n, 'not found'
- pass
+ continue
+ # add any classes that were defined in the imported module
+ # to our name space if they were mentioned in the list
+ for n, n2 in names:
+ if n in d:
+ dict[n2 or n] = d[n]
+ elif n == '*':
+ # only add a name if not already there (to mimic
+ # what Python does internally) also don't add
+ # names that start with _
+ for n in d:
+ if n[0] != '_' and not n in dict:
+ dict[n] = d[n]
+ except StopIteration:
+ pass
- elif m.start("ImportFrom") >= 0:
- # from module import stuff
- mod = m.group("ImportFromPath")
- names = m.group("ImportFromList").split(',')
- try:
- # recursively read the imported module
- d = readmodule_ex(mod, path, inpackage)
- except:
- ##print 'module', mod, 'not found'
- continue
- # add any classes that were defined in the
- # imported module to our name space if they
- # were mentioned in the list
- for n in names:
- n = n.strip()
- if n in d:
- dict[n] = d[n]
- elif n == '*':
- # only add a name if not
- # already there (to mimic what
- # Python does internally)
- # also don't add names that
- # start with _
- for n in d:
- if n[0] != '_' and \
- not n in dict:
- dict[n] = d[n]
- else:
- assert 0, "regexp _getnext found something unexpected"
-
+ f.close()
return dict
-def _indent(ws, _expandtabs=string.expandtabs):
- return len(_expandtabs(ws, TABWIDTH))
+def _getnamelist(g):
+ # Helper to get a comma-separated list of dotted names plus 'as'
+ # clauses. Return a list of pairs (name, name2) where name2 is
+ # the 'as' name, or None if there is no 'as' clause.
+ names = []
+ while True:
+ name, token = _getname(g)
+ if not name:
+ break
+ if token == 'as':
+ name2, token = _getname(g)
+ else:
+ name2 = None
+ names.append((name, name2))
+ while token != "," and "\n" not in token:
+ tokentype, token, start, end, line = g.next()
+ if token != ",":
+ break
+ return names
+
+def _getname(g):
+ # Helper to get a dotted name, return a pair (name, token) where
+ # name is the dotted name, or None if there was no dotted name,
+ # and token is the next input token.
+ parts = []
+ tokentype, token, start, end, line = g.next()
+ if tokentype != NAME and token != '*':
+ return (None, token)
+ parts.append(token)
+ while True:
+ tokentype, token, start, end, line = g.next()
+ if token != '.':
+ break
+ tokentype, token, start, end, line = g.next()
+ if tokentype != NAME:
+ break
+ parts.append(token)
+ return (".".join(parts), token)