Blame - Lib/regsub.py - platform/external/python/cpython3

blob: 9b8fae5854f17d11032ce3ce5b7649810deb1fec [file] [log] [blame]

Guido van Rossum	e7b146f	2000-02-04 15:28:42 +0000	[diff] [blame]	1	"""Regexp-based split and replace using the obsolete regex module.
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	2
Guido van Rossum	e7b146f	2000-02-04 15:28:42 +0000	[diff] [blame]	3	This module is only for backward compatibility. These operations
				4	are now provided by the new regular expression module, "re".
				5
				6	sub(pat, repl, str): replace first occurrence of pattern in string
				7	gsub(pat, repl, str): replace all occurrences of pattern in string
				8	split(str, pat, maxsplit): split string using pattern as delimiter
				9	splitx(str, pat, maxsplit): split string using pattern as delimiter plus
				10	return delimiters
				11	"""
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	12
Guido van Rossum	7292e92	2000-12-19 18:25:58 +0000	[diff] [blame]	13	import warnings
				14	warnings.warn("the regsub module is deprecated; please use re.sub()",
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	15	DeprecationWarning)
Guido van Rossum	7292e92	2000-12-19 18:25:58 +0000	[diff] [blame]	16
				17	# Ignore further deprecation warnings about this module
				18	warnings.filterwarnings("ignore", "", DeprecationWarning, __name__)
				19
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	20	import regex
				21
Skip Montanaro	0de6580	2001-02-15 22:15:14 +0000	[diff] [blame]	22	__all__ = ["sub","gsub","split","splitx","capwords"]
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	23
				24	# Replace first occurrence of pattern pat in string str by replacement
				25	# repl. If the pattern isn't found, the string is returned unchanged.
				26	# The replacement may contain references \digit to subpatterns and
				27	# escaped backslashes. The pattern may be a string or an already
				28	# compiled pattern.
				29
				30	def sub(pat, repl, str):
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	31	prog = compile(pat)
				32	if prog.search(str) >= 0:
				33	regs = prog.regs
				34	a, b = regs[0]
				35	str = str[:a] + expand(repl, regs, str) + str[b:]
				36	return str
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	37
				38
				39	# Replace all (non-overlapping) occurrences of pattern pat in string
				40	# str by replacement repl. The same rules as for sub() apply.
				41	# Empty matches for the pattern are replaced only when not adjacent to
				42	# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
				43
				44	def gsub(pat, repl, str):
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	45	prog = compile(pat)
				46	new = ''
				47	start = 0
				48	first = 1
				49	while prog.search(str, start) >= 0:
				50	regs = prog.regs
				51	a, b = regs[0]
				52	if a == b == start and not first:
				53	if start >= len(str) or prog.search(str, start+1) < 0:
				54	break
				55	regs = prog.regs
				56	a, b = regs[0]
				57	new = new + str[start:a] + expand(repl, regs, str)
				58	start = b
				59	first = 0
				60	new = new + str[start:]
				61	return new
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	62
				63
				64	# Split string str in fields separated by delimiters matching pattern
				65	# pat. Only non-empty matches for the pattern are considered, so e.g.
				66	# split('abc', '') returns ['abc'].
Guido van Rossum	a59d3e6	1996-08-08 18:39:18 +0000	[diff] [blame]	67	# The optional 3rd argument sets the number of splits that are performed.
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	68
def split(str, pat, maxsplit = 0):
    """Split str into fields separated by non-empty matches of pat.

    The delimiters are not included in the result.  A non-zero maxsplit
    limits the number of splits performed.
    """
    keep_delimiters = 0
    return intsplit(str, pat, maxsplit, keep_delimiters)
Guido van Rossum	a59d3e6	1996-08-08 18:39:18 +0000	[diff] [blame]	71
# Split string str into fields separated by delimiters matching pattern
# pat. Only non-empty matches for the pattern are considered, so e.g.
# splitx('abc', '') returns ['abc']. The delimiters are also included
# in the list.
# The optional 3rd argument sets the maximum number of splits that are
# performed.
				77
				78
				79	def splitx(str, pat, maxsplit = 0):
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	80	return intsplit(str, pat, maxsplit, 1)
				81
Guido van Rossum	a59d3e6	1996-08-08 18:39:18 +0000	[diff] [blame]	82	# Internal function used to implement split() and splitx().
				83
				84	def intsplit(str, pat, maxsplit, retain):
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	85	prog = compile(pat)
				86	res = []
				87	start = next = 0
				88	splitcount = 0
				89	while prog.search(str, next) >= 0:
				90	regs = prog.regs
				91	a, b = regs[0]
				92	if a == b:
				93	next = next + 1
				94	if next >= len(str):
				95	break
				96	else:
				97	res.append(str[start:a])
				98	if retain:
				99	res.append(str[a:b])
				100	start = next = b
				101	splitcount = splitcount + 1
				102	if (maxsplit and (splitcount >= maxsplit)):
				103	break
				104	res.append(str[start:])
				105	return res
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	106
				107
Guido van Rossum	4cc4ab1	1996-06-11 18:45:15 +0000	[diff] [blame]	108	# Capitalize words split using a pattern
				109
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Capitalize each word of str, where pat matches the word separators.

    The separators themselves are preserved unchanged.
    """
    pieces = splitx(str, pat)
    # splitx() alternates word, delimiter, word, ...; the words sit at
    # the even positions.
    idx = 0
    while idx < len(pieces):
        pieces[idx] = pieces[idx].capitalize()
        idx = idx + 2
    return "".join(pieces)
Guido van Rossum	4cc4ab1	1996-06-11 18:45:15 +0000	[diff] [blame]	115
				116
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	117	# Internal subroutines:
				118	# compile(pat): compile a pattern, caching already compiled patterns
				119	# expand(repl, regs, str): expand \digit escapes in replacement string
				120
				121
				122	# Manage a cache of compiled regular expressions.
Barry Warsaw	b67a25c	1997-02-18 18:52:55 +0000	[diff] [blame]	123	#
				124	# If the pattern is a string a compiled version of it is returned. If
				125	# the pattern has been used before we return an already compiled
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	126	# version from the cache; otherwise we compile it now and save the
Barry Warsaw	b67a25c	1997-02-18 18:52:55 +0000	[diff] [blame]	127	# compiled version in the cache, along with the syntax it was compiled
				128	# with. Instead of a string, a compiled regular expression can also
				129	# be passed.
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	130
				131	cache = {}
				132
				133	def compile(pat):
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	134	if type(pat) != type(''):
				135	return pat # Assume it is a compiled regex
				136	key = (pat, regex.get_syntax())
Raymond Hettinger	54f0222	2002-06-01 14:18:47 +0000	[diff] [blame]	137	if key in cache:
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	138	prog = cache[key] # Get it from the cache
				139	else:
				140	prog = cache[key] = regex.compile(pat)
				141	return prog
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	142
				143
def clear_cache():
    """Forget all cached compiled patterns.

    The module-level cache name is rebound to a fresh empty dictionary;
    the old dictionary object itself is left untouched.
    """
    global cache
    cache = dict()
Barry Warsaw	b67a25c	1997-02-18 18:52:55 +0000	[diff] [blame]	147
				148
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	149	# Expand \digit in the replacement.
				150	# Each occurrence of \digit is replaced by the substring of str
				151	# indicated by regs[digit]. To include a literal \ in the
				152	# replacement, double it; other \ escapes are left unchanged (i.e.
				153	# the \ and the following character are both copied).
				154
				155	def expand(repl, regs, str):
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	156	if '\\' not in repl:
				157	return repl
				158	new = ''
				159	i = 0
				160	ord0 = ord('0')
				161	while i < len(repl):
				162	c = repl[i]; i = i+1
				163	if c != '\\' or i >= len(repl):
				164	new = new + c
				165	else:
				166	c = repl[i]; i = i+1
				167	if '0' <= c <= '9':
				168	a, b = regs[ord(c)-ord0]
				169	new = new + str[a:b]
				170	elif c == '\\':
				171	new = new + c
				172	else:
				173	new = new + '\\' + c
				174	return new
Guido van Rossum	7a461e5	1992-09-20 21:41:09 +0000	[diff] [blame]	175
				176
				177	# Test program, reads sequences "pat repl str" from stdin.
				178	# Optional argument specifies pattern used to split lines.
				179
				180	def test():
Tim Peters	0c9886d	2001-01-15 01:18:21 +0000	[diff] [blame]	181	import sys
				182	if sys.argv[1:]:
				183	delpat = sys.argv[1]
				184	else:
				185	delpat = '[ \t\n]+'
				186	while 1:
				187	if sys.stdin.isatty(): sys.stderr.write('--> ')
				188	line = sys.stdin.readline()
				189	if not line: break
				190	if line[-1] == '\n': line = line[:-1]
				191	fields = split(line, delpat)
				192	if len(fields) != 3:
				193	print 'Sorry, not three fields'
				194	print 'split:', `fields`
				195	continue
				196	[pat, repl, str] = split(line, delpat)
				197	print 'sub :', `sub(pat, repl, str)`
				198	print 'gsub:', `gsub(pat, repl, str)`