- fixed split
(test_sre still complains about split, but that's caused by
the group reset bug, not split itself)
- added more mark slots
(should be dynamically allocated, but 100 is better than 32.
and checking for the upper limit is better than overwriting
the memory ;-)
- internal: renamed the cursor helper class
- internal: removed some bloat from sre_compile
diff --git a/Lib/sre.py b/Lib/sre.py
index 32b3e8f..e0a51e3 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -26,7 +26,7 @@
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
# sre exception
-error = sre_parse.error
+error = sre_compile.error
# --------------------------------------------------------------------
# public interface
@@ -105,7 +105,7 @@
n = i = 0
s = []
append = s.append
- c = pattern.cursor(string)
+ c = pattern.scanner(string)
while not count or n < count:
m = c.search()
if not m:
@@ -127,16 +127,20 @@
n = i = 0
s = []
append = s.append
- c = pattern.cursor(string)
+ extend = s.extend
+ c = pattern.scanner(string)
+ g = c.groups
while not maxsplit or n < maxsplit:
m = c.search()
if not m:
break
- j = m.start()
- append(string[i:j])
- i = m.end()
- if i <= j:
- break
+ b, e = m.span()
+ if e == i:
+ continue
+ append(string[i:b])
+ if g and b != e:
+ extend(m.groups())
+ i = e
n = n + 1
if i < len(string):
append(string[i:])
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2d7c021..a51531b 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -11,8 +11,7 @@
# other compatibility work.
#
-import array, string, sys
-
+import array
import _sre
from sre_constants import *
@@ -24,123 +23,101 @@
else:
raise RuntimeError, "cannot find a useable array type"
-# FIXME: <fl> should move some optimizations from the parser to here!
-
-class Code:
- def __init__(self):
- self.data = []
- def __len__(self):
- return len(self.data)
- def __getitem__(self, index):
- return self.data[index]
- def __setitem__(self, index, code):
- self.data[index] = code
- def append(self, code):
- self.data.append(code)
- def todata(self):
- # print self.data
- try:
- return array.array(WORDSIZE, self.data).tostring()
- except OverflowError:
- print self.data
- raise
-
def _compile(code, pattern, flags):
- append = code.append
+ emit = code.append
for op, av in pattern:
if op is ANY:
if flags & SRE_FLAG_DOTALL:
- append(OPCODES[op]) # any character at all!
+ emit(OPCODES[op])
else:
- append(OPCODES[CATEGORY])
- append(CHCODES[CATEGORY_NOT_LINEBREAK])
+ emit(OPCODES[CATEGORY])
+ emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE):
- append(OPCODES[op])
+ emit(OPCODES[op])
elif op is AT:
- append(OPCODES[op])
+ emit(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
- append(ATCODES[AT_MULTILINE[av]])
+ emit(ATCODES[AT_MULTILINE[av]])
else:
- append(ATCODES[av])
+ emit(ATCODES[av])
elif op is BRANCH:
- append(OPCODES[op])
+ emit(OPCODES[op])
tail = []
for av in av[1]:
- skip = len(code); append(0)
+ skip = len(code); emit(0)
_compile(code, av, flags)
-## append(OPCODES[SUCCESS])
- append(OPCODES[JUMP])
- tail.append(len(code)); append(0)
+ emit(OPCODES[JUMP])
+ tail.append(len(code)); emit(0)
code[skip] = len(code) - skip
- append(0) # end of branch
+ emit(0) # end of branch
for tail in tail:
code[tail] = len(code) - tail
elif op is CALL:
- append(OPCODES[op])
- skip = len(code); append(0)
+ emit(OPCODES[op])
+ skip = len(code); emit(0)
_compile(code, av, flags)
- append(OPCODES[SUCCESS])
+ emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CATEGORY:
- append(OPCODES[op])
+ emit(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
- append(CH_LOCALE[CHCODES[av]])
+ emit(CH_LOCALE[CHCODES[av]])
elif flags & SRE_FLAG_UNICODE:
- append(CH_UNICODE[CHCODES[av]])
+ emit(CH_UNICODE[CHCODES[av]])
else:
- append(CHCODES[av])
+ emit(CHCODES[av])
elif op is GROUP:
if flags & SRE_FLAG_IGNORECASE:
- append(OPCODES[OP_IGNORE[op]])
+ emit(OPCODES[OP_IGNORE[op]])
else:
- append(OPCODES[op])
- append(av-1)
+ emit(OPCODES[op])
+ emit(av-1)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
- append(OPCODES[OP_IGNORE[op]])
+ emit(OPCODES[OP_IGNORE[op]])
def fixup(literal, flags=flags):
return _sre.getlower(ord(literal), flags)
else:
- append(OPCODES[op])
+ emit(OPCODES[op])
fixup = ord
- skip = len(code); append(0)
+ skip = len(code); emit(0)
for op, av in av:
- append(OPCODES[op])
+ emit(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
- append(fixup(av))
+ emit(fixup(av))
elif op is RANGE:
- append(fixup(av[0]))
- append(fixup(av[1]))
+ emit(fixup(av[0]))
+ emit(fixup(av[1]))
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
- append(CH_LOCALE[CHCODES[av]])
+ emit(CH_LOCALE[CHCODES[av]])
elif flags & SRE_FLAG_UNICODE:
- append(CH_UNICODE[CHCODES[av]])
+ emit(CH_UNICODE[CHCODES[av]])
else:
- append(CHCODES[av])
+ emit(CHCODES[av])
else:
- raise ValueError, "unsupported set operator"
- append(OPCODES[FAILURE])
+ raise error, "internal: unsupported set operator"
+ emit(OPCODES[FAILURE])
code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL):
if flags & SRE_FLAG_IGNORECASE:
- append(OPCODES[OP_IGNORE[op]])
+ emit(OPCODES[OP_IGNORE[op]])
else:
- append(OPCODES[op])
- append(ord(av))
+ emit(OPCODES[op])
+ emit(ord(av))
elif op is MARK:
- append(OPCODES[op])
- append(av)
+ emit(OPCODES[op])
+ emit(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE:
- append(OPCODES[REPEAT])
- skip = len(code); append(0)
- append(av[0])
- append(av[1])
+ emit(OPCODES[REPEAT])
+ skip = len(code); emit(0)
+ emit(av[0])
+ emit(av[1])
_compile(code, av[2], flags)
- append(OPCODES[SUCCESS])
+ emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
lo, hi = av[2].getwidth()
@@ -149,35 +126,35 @@
if 0 and lo == hi == 1 and op is MAX_REPEAT:
# FIXME: <fl> need a better way to figure out when
# it's safe to use this one (in the parser, probably)
- append(OPCODES[MAX_REPEAT_ONE])
- skip = len(code); append(0)
- append(av[0])
- append(av[1])
+ emit(OPCODES[MAX_REPEAT_ONE])
+ skip = len(code); emit(0)
+ emit(av[0])
+ emit(av[1])
_compile(code, av[2], flags)
- append(OPCODES[SUCCESS])
+ emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
- append(OPCODES[op])
- skip = len(code); append(0)
- append(av[0])
- append(av[1])
+ emit(OPCODES[op])
+ skip = len(code); emit(0)
+ emit(av[0])
+ emit(av[1])
_compile(code, av[2], flags)
- append(OPCODES[SUCCESS])
+ emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is SUBPATTERN:
group = av[0]
if group:
- append(OPCODES[MARK])
- append((group-1)*2)
+ emit(OPCODES[MARK])
+ emit((group-1)*2)
_compile(code, av[1], flags)
if group:
- append(OPCODES[MARK])
- append((group-1)*2+1)
+ emit(OPCODES[MARK])
+ emit((group-1)*2+1)
else:
raise ValueError, ("unsupported operand type", op)
def compile(p, flags=0):
- # convert pattern list to internal format
+ # internal: convert pattern list to internal format
if type(p) in (type(""), type(u"")):
import sre_parse
pattern = p
@@ -185,18 +162,14 @@
else:
pattern = None
flags = p.pattern.flags | flags
- code = Code()
+ code = []
_compile(code, p.data, flags)
code.append(OPCODES[SUCCESS])
- data = code.todata()
- if 0: # debugging
- print
- print "-" * 68
- import sre_disasm
- sre_disasm.disasm(data)
- print "-" * 68
+ # FIXME: <fl> get rid of this limitation
+ assert p.pattern.groups <= 100,\
+ "sorry, but this version only supports 100 named groups"
return _sre.compile(
pattern, flags,
- data,
+ array.array(WORDSIZE, code).tostring(),
p.pattern.groups-1, p.pattern.groupdict
)
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 21214ed..dba2afd 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -14,11 +14,12 @@
* 00-03-06 fl first alpha, sort of (0.5)
* 00-03-14 fl removed most compatibility stuff (0.6)
* 00-05-10 fl towards third alpha (0.8.2)
- * 00-05-13 fl added experimental cursor stuff (0.8.3)
+ * 00-05-13 fl added experimental scanner stuff (0.8.3)
* 00-05-27 fl final bug hunt (0.8.4)
* 00-06-21 fl less bugs, more taste (0.8.5)
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
+ * 00-06-29 fl fixed split, added more scanner features (0.9.2)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@@ -384,7 +385,7 @@
int i, count;
/* FIXME: this is a hack! */
- void* mark_copy[64];
+ void* mark_copy[SRE_MARK_SIZE];
void* mark = NULL;
TRACE(("%8d: enter\n", PTR(ptr)));
@@ -954,7 +955,7 @@
staticforward PyTypeObject Pattern_Type;
staticforward PyTypeObject Match_Type;
-staticforward PyTypeObject Cursor_Type;
+staticforward PyTypeObject Scanner_Type;
static PyObject *
_compile(PyObject* self_, PyObject* args)
@@ -1074,7 +1075,7 @@
state->lastmark = 0;
/* FIXME: dynamic! */
- for (i = 0; i < 64; i++)
+ for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL;
state->stack = NULL;
@@ -1176,15 +1177,15 @@
}
static PyObject*
-pattern_cursor(PatternObject* pattern, PyObject* args)
+pattern_scanner(PatternObject* pattern, PyObject* args)
{
/* create search state object */
- CursorObject* self;
+ ScannerObject* self;
PyObject* string;
/* create match object (with room for extra group marks) */
- self = PyObject_NEW(CursorObject, &Cursor_Type);
+ self = PyObject_NEW(ScannerObject, &Scanner_Type);
if (self == NULL)
return NULL;
@@ -1431,7 +1432,7 @@
{"split", (PyCFunction) pattern_split, 1},
{"findall", (PyCFunction) pattern_findall, 1},
/* experimental */
- {"cursor", (PyCFunction) pattern_cursor, 1},
+ {"scanner", (PyCFunction) pattern_scanner, 1},
{NULL, NULL}
};
@@ -1467,7 +1468,7 @@
statichere PyTypeObject Pattern_Type = {
PyObject_HEAD_INIT(NULL)
- 0, "Pattern", sizeof(PatternObject), 0,
+ 0, "SRE_Pattern", sizeof(PatternObject), 0,
(destructor)pattern_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)pattern_getattr, /*tp_getattr*/
@@ -1761,7 +1762,7 @@
statichere PyTypeObject Match_Type = {
PyObject_HEAD_INIT(NULL)
- 0, "Match",
+ 0, "SRE_Match",
sizeof(MatchObject), /* size of basic object */
sizeof(int), /* space for group item */
(destructor)match_dealloc, /*tp_dealloc*/
@@ -1770,10 +1771,10 @@
};
/* -------------------------------------------------------------------- */
-/* cursor methods (experimental) */
+/* scanner methods (experimental) */
static void
-cursor_dealloc(CursorObject* self)
+scanner_dealloc(ScannerObject* self)
{
state_fini(&self->state);
Py_DECREF(self->string);
@@ -1782,7 +1783,7 @@
}
static PyObject*
-cursor_match(CursorObject* self, PyObject* args)
+scanner_match(ScannerObject* self, PyObject* args)
{
SRE_STATE* state = &self->state;
PyObject* match;
@@ -1811,7 +1812,7 @@
static PyObject*
-cursor_search(CursorObject* self, PyObject* args)
+scanner_search(ScannerObject* self, PyObject* args)
{
SRE_STATE* state = &self->state;
PyObject* match;
@@ -1830,24 +1831,26 @@
match = pattern_new_match((PatternObject*) self->pattern,
state, self->string, status);
- if (status >= 0)
+ if (status == 0 || state->ptr == state->start)
+ state->start = (void*) ((char*) state->ptr + state->charsize);
+ else
state->start = state->ptr;
return match;
}
-static PyMethodDef cursor_methods[] = {
- {"match", (PyCFunction) cursor_match, 0},
- {"search", (PyCFunction) cursor_search, 0},
+static PyMethodDef scanner_methods[] = {
+ {"match", (PyCFunction) scanner_match, 0},
+ {"search", (PyCFunction) scanner_search, 0},
{NULL, NULL}
};
static PyObject*
-cursor_getattr(CursorObject* self, char* name)
+scanner_getattr(ScannerObject* self, char* name)
{
PyObject* res;
- res = Py_FindMethod(cursor_methods, (PyObject*) self, name);
+ res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
if (res)
return res;
@@ -1859,18 +1862,21 @@
return self->pattern;
}
+ if (!strcmp(name, "groups"))
+ return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups);
+
PyErr_SetString(PyExc_AttributeError, name);
return NULL;
}
-statichere PyTypeObject Cursor_Type = {
+statichere PyTypeObject Scanner_Type = {
PyObject_HEAD_INIT(NULL)
- 0, "Cursor",
- sizeof(CursorObject), /* size of basic object */
+ 0, "SRE_Scanner",
+ sizeof(ScannerObject), /* size of basic object */
0,
- (destructor)cursor_dealloc, /*tp_dealloc*/
+ (destructor)scanner_dealloc, /*tp_dealloc*/
0, /*tp_print*/
- (getattrfunc)cursor_getattr, /*tp_getattr*/
+ (getattrfunc)scanner_getattr, /*tp_getattr*/
};
static PyMethodDef _functions[] = {
@@ -1888,7 +1894,7 @@
{
/* Patch object types */
Pattern_Type.ob_type = Match_Type.ob_type =
- Cursor_Type.ob_type = &PyType_Type;
+ Scanner_Type.ob_type = &PyType_Type;
Py_InitModule("_" MODULE, _functions);
}
diff --git a/Modules/sre.h b/Modules/sre.h
index a7786ed..722f890 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -46,6 +46,9 @@
void* ptr;
} SRE_STACK;
+/* FIXME: <fl> shouldn't be a constant, really... */
+#define SRE_MARK_SIZE 200
+
typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
@@ -56,7 +59,7 @@
int charsize;
/* registers */
int lastmark;
- void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
+ void* mark[SRE_MARK_SIZE];
/* backtracking stack */
SRE_STACK* stack;
int stacksize;
@@ -66,11 +69,11 @@
} SRE_STATE;
typedef struct {
- /* search helper */
+ /* scanner (internal helper object) */
PyObject_HEAD
PyObject* pattern;
PyObject* string;
SRE_STATE state;
-} CursorObject;
+} ScannerObject;
#endif