"""
Module difflib -- helpers for computing deltas between objects.

Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
    Use SequenceMatcher to return list of the best "good enough" matches.

Function context_diff(a, b):
    For two lists of strings, return a delta in context diff format.

Function ndiff(a, b):
    Return a delta: the difference between `a` and `b` (lists of strings).

Function restore(delta, which):
    Return one of the two sequences that generated an ndiff delta.

Function unified_diff(a, b):
    For two lists of strings, return a delta in unified diff format.

Class SequenceMatcher:
    A flexible class for comparing pairs of sequences of any type.

Class Differ:
    For producing human-readable deltas from sequences of lines of text.

Class HtmlDiff:
    For producing HTML side by side comparison with change highlights.
"""

__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ', 'IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
           'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']

from heapq import nlargest as _nlargest
from collections import namedtuple as _namedtuple
from types import GenericAlias

Match = _namedtuple('Match', 'a b size')

def _calculate_ratio(matches, length):
    if length:
        return 2.0 * matches / length
    return 1.0
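
# Editorial arithmetic (not part of the original module): for a="abcd" and
# b="bcde" the matching blocks cover 3 elements, so with a combined length
# of 8 the similarity is _calculate_ratio(3, 8) == 0.75 -- the same value
# the SequenceMatcher doctests below report.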

class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable. The basic
    algorithm predates, and is a little fancier than, an algorithm
    published in the late 1980's by Ratcliff and Obershelp under the
    hyperbolic name "gestalt pattern matching". The basic idea is to find
    the longest contiguous matching subsequence that contains no "junk"
    elements (R-O doesn't address junk). The same idea is then applied
    recursively to the pieces of the sequences to the left and to the right
    of the matching subsequence. This does not yield minimal edit
    sequences, but does tend to yield matches that "look right" to people.

    SequenceMatcher tries to compute a "human-friendly diff" between two
    sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the
    longest *contiguous* & junk-free matching subsequence. That's what
    catches peoples' eyes. The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff. This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files). That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences. As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print(round(s.ratio(), 3))
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print("a[%d] and b[%d] match for %d elements" % block)
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 21 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:29] b[17:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing: Basic R-O is cubic time worst case and quadratic time expected
    case. SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo=0, ahi=None, blo=0, bhi=None)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b='', autojunk=True):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk. None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk. For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared. By
        default, an empty string. The elements of a must be hashable. See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared. By
        default, an empty string. The elements of b must be hashable. See
        also .set_seqs() and .set_seq2().

        Optional arg autojunk should be set to False to disable the
        "automatic junk heuristic" that treats popular elements as junk
        (see module documentation for more information).
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk and popular elements do not appear
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE! Only __chain_b uses this. Use "in self.bjunk".
        # bjunk
        #      the items in b for which isjunk is True.
        # bpopular
        #      nonjunk items in b treated as junk by the heuristic (if used).

        self.isjunk = isjunk
        self.a = self.b = None
        self.autojunk = autojunk
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # b2j also does not contain entries for "popular" elements, meaning
    # elements that account for more than 1 + 1% of the total elements, and
    # when the sequence is reasonably large (>= 200 elements); this can
    # be viewed as an adaptive notion of semi-junk, and yields an enormous
    # speedup when, e.g., comparing program files with hundreds of
    # instances of "return NULL;" ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module! If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk. I.e., we don't call isjunk at all yet. Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}

        for i, elt in enumerate(b):
            indices = b2j.setdefault(elt, [])
            indices.append(i)

        # Purge junk elements
        self.bjunk = junk = set()
        isjunk = self.isjunk
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junk.add(elt)
            for elt in junk: # separate loop avoids separate list of keys
                del b2j[elt]

        # Purge popular elements that are not junk
        self.bpopular = popular = set()
        n = len(b)
        if self.autojunk and n >= 200:
            ntest = n // 100 + 1
            for elt, idxs in b2j.items():
                if len(idxs) > ntest:
                    popular.add(elt)
            for elt in popular: # ditto; as fast for 1% deletion
                del b2j[elt]
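
        # Editorial note (not original code), illustrating the autojunk
        # arithmetic above: for len(b) == 700, ntest == 700 // 100 + 1 == 8,
        # so any element appearing more than 8 times -- i.e. in more than
        # roughly 1% of b's positions -- is treated as popular and purged.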

    def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        By default it will find the longest match in the entirety of a and b.

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=0, b=4, size=5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block. Then that block is extended as
        far as possible by matching (only) junk elements on both sides. So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk. That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly. Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=1, b=0, size=4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        Match(a=0, b=0, size=0)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle. That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
        if ahi is None:
            ahi = len(a)
        if bhi is None:
            bhi = len(b)
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
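        # Editorial trace (not original code): for a="ab", b="ab" the loop
        # below proceeds as
        #   i=0: 'a' matches b[0], newj2len = {0: 1}, best = (0, 0, 1)
        #   i=1: 'b' matches b[1], k = j2len[0] + 1 == 2, best = (0, 0, 2)
        # i.e. j2len threads diagonal runs of equal elements through the DP.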
        nothing = []
        for i in range(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end. In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too. Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it. In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return Match(besti, bestj, bestsize)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
        i and in j. New in Python 2.5, it's also guaranteed that if
        (i, j, n) and (i', j', n') are adjacent triples in the list, and
        the second is not the last triple in the list, then i+n != i' or
        j+n != j'. IOW, adjacent triples never describe adjacent equal
        blocks.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> list(s.get_matching_blocks())
        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        la, lb = len(self.a), len(self.b)

        # This is most naturally expressed as a recursive algorithm, but
        # at least one user bumped into extreme use cases that exceeded
        # the recursion limit on their box. So, now we maintain a list
        # (`queue`) of blocks we still need to look at, and append partial
        # results to `matching_blocks` in a loop; the matches are sorted
        # at the end.
        queue = [(0, la, 0, lb)]
        matching_blocks = []
        while queue:
            alo, ahi, blo, bhi = queue.pop()
            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
            # a[alo:i] vs b[blo:j] unknown
            # a[i:i+k] same as b[j:j+k]
            # a[i+k:ahi] vs b[j+k:bhi] unknown
            if k:   # if k is 0, there was no matching block
                matching_blocks.append(x)
                if alo < i and blo < j:
                    queue.append((alo, i, blo, j))
                if i+k < ahi and j+k < bhi:
                    queue.append((i+k, ahi, j+k, bhi))
        matching_blocks.sort()

        # It's possible that we have adjacent equal blocks in the
        # matching_blocks list now. Starting with 2.5, this code was added
        # to collapse them.
        i1 = j1 = k1 = 0
        non_adjacent = []
        for i2, j2, k2 in matching_blocks:
            # Is this block adjacent to i1, j1, k1?
            if i1 + k1 == i2 and j1 + k1 == j2:
                # Yes, so collapse them -- this just increases the length of
                # the first block by the length of the second, and the first
                # block so lengthened remains the block to compare against.
                k1 += k2
            else:
                # Not adjacent. Remember the first block (k1==0 means it's
                # the dummy we started with), and make the second block the
                # new block to compare against.
                if k1:
                    non_adjacent.append((i1, j1, k1))
                i1, j1, k1 = i2, j2, k2
        if k1:
            non_adjacent.append((i1, j1, k1))

        non_adjacent.append( (la, lb, 0) )
        self.matching_blocks = list(map(Match._make, non_adjacent))
        return self.matching_blocks
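
    # Editorial example (not part of the original module): junk handling can
    # produce touching blocks that the collapsing pass above merges. With
    # blanks as junk,
    #   SequenceMatcher(lambda x: x == " ", "a b", "a b").get_matching_blocks()
    # internally finds (0, 0, 2) and (2, 2, 1), which collapse into
    # Match(a=0, b=0, size=3) followed by the Match(a=3, b=3, size=0) dummy.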

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant: we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size]. So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer
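
    # Editorial sketch (not part of the original module): because the opcode
    # j-ranges tile b from 0 to len(b), concatenating the b-side of every
    # opcode reconstructs b, e.g.
    #   s = SequenceMatcher(None, "qabxcd", "abycdf")
    #   ''.join("abycdf"[j1:j2] for _, _, _, j1, j2 in s.get_opcodes())
    # yields "abycdf" again; the a-side slices likewise tile "qabxcd".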

    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with up to n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = list(map(str, range(1,40)))
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        if not codes:
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1, j2))
        if group and not (len(group)==1 and group[0][0] == 'equal'):
            yield group
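
    # Editorial note (not part of the original module): n mirrors the amount
    # of context in classic diff output; unified_diff() and context_diff()
    # below both feed their n argument straight to get_grouped_opcodes(), so
    # the default of 3 here is what produces the familiar 3 lines of context.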

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = sum(triple[-1] for triple in self.get_matching_blocks())
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.__contains__, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return _calculate_ratio(min(la, lb), la + lb)
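
    # Editorial arithmetic (not part of the original module): the estimates
    # form a chain ratio() <= quick_ratio() <= real_quick_ratio(). For
    # "abcd" vs "bcde" (see the ratio() doctest): real_quick_ratio() is
    # 2.0 * min(4, 4) / 8 == 1.0, while the multiset bound of quick_ratio()
    # and the true ratio() both come out to 2.0 * 3 / 8 == 0.75 -- each
    # successively cheaper estimate is never smaller than the one before.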

    __class_getitem__ = classmethod(GenericAlias)


def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return. n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("Apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))

    # Move the best scorers to head of list
    result = _nlargest(n, result)
    # Strip scores for the best n matches
    return [x for score, x in result]
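
# Editorial note (not part of the original module): the guard above chains
# the bounds cheapest-first -- real_quick_ratio(), then quick_ratio(), then
# ratio() -- so `and` short-circuiting rejects clearly-dissimilar candidates
# before the expensive true ratio is ever computed; set_seq2(word) is called
# once because SequenceMatcher caches its analysis of the second sequence.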


def _keep_original_ws(s, tag_s):
    """Replace whitespace with the original whitespace characters in `s`"""
    return ''.join(
        c if tag_c == " " and c.isspace() else tag_c
        for c, tag_c in zip(s, tag_s)
    )
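
# Editorial example (not part of the original module):
#   _keep_original_ws('\tabcd', '  ^^ ') == '\t ^^ '
# The guide tag over the leading tab is a plain space, so the tab itself is
# kept and the '?' hint line under it stays visually aligned with the source.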


class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas. Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence. These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff. To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes on accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(keepends=True)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(keepends=True)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'. See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print(''.join(result), end="")
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#'). It is recommended
          to leave linejunk None; the underlying SequenceMatcher class has
          an adaptive notion of "noise" lines that's better than any static
          definition the author has ever been able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!). Use of IS_CHARACTER_JUNK is recommended.
        """

        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects. The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writelines()
        method of a file-like object.

        Example:

        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
        ...                                'ore\ntree\nemu\n'.splitlines(True))),
        ...       end="")
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError('unknown tag %r' % (tag,))

            yield from g

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in range(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            yield from g

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print(''.join(results), end="")
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in range(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError('unknown tag %r' % (tag,))
            yield from self._qformat(aelt, belt, atags, btags)
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        yield from g

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print(repr(line))
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """
        atags = _keep_original_ws(aline, atags).rstrip()
        btags = _keep_original_ws(bline, btags).rstrip()

        yield "- " + aline
        if atags:
            yield f"? {atags}\n"

        yield "+ " + bline
        if btags:
            yield f"? {btags}\n"

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element. The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread". So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd. The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread". The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private". I can live with that <wink>.

import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
    r"""
    Return True for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return True for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    return ch in ws


########################################################################
###  Unified Diff
########################################################################

def _format_range_unified(start, stop):
    'Convert range to the "ed" format'
    # Per the diff spec at http://www.unix.org/single_unix_specification/
    beginning = start + 1     # lines start numbering with one
    length = stop - start
    if length == 1:
        return '{}'.format(beginning)
    if not length:
        beginning -= 1        # empty ranges begin at line just before the range
    return '{},{}'.format(beginning, length)
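
# Editorial examples (not part of the original module):
#   _format_range_unified(0, 4) == '1,4'   # four lines starting at line 1
#   _format_range_unified(3, 4) == '4'     # one-line ranges omit the length
#   _format_range_unified(3, 3) == '3,0'   # empty range anchored just before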

def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context. The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline. This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times. Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
    ...             lineterm=''):
    ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
    --- Original        2005-01-26 23:30:50
    +++ Current         2010-04-02 10:20:52
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            started = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '+++ {}{}{}'.format(tofile, todate, lineterm)

        first, last = group[0], group[-1]
        file1_range = _format_range_unified(first[1], last[2])
        file2_range = _format_range_unified(first[3], last[4])
        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)

        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            if tag in {'replace', 'delete'}:
                for line in a[i1:i2]:
                    yield '-' + line
            if tag in {'replace', 'insert'}:
                for line in b[j1:j2]:
                    yield '+' + line


########################################################################
###  Context Diff
########################################################################

def _format_range_context(start, stop):
    'Convert range to the "ed" format'
    # Per the diff spec at http://www.unix.org/single_unix_specification/
    beginning = start + 1     # lines start numbering with one
    length = stop - start
    if not length:
        beginning -= 1        # empty ranges begin at line just before the range
    if length <= 1:
        return '{}'.format(beginning)
    return '{},{}'.format(beginning, beginning + length - 1)
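
# Editorial examples (not part of the original module): unlike the unified
# form above, this renders an inclusive start,end pair rather than a length:
#   _format_range_context(0, 4) == '1,4'   # lines 1 through 4
#   _format_range_context(3, 4) == '4'     # single-line range
#   _format_range_context(3, 3) == '3'     # empty range anchored just before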

# See http://www.unix.org/single_unix_specification/
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs are a compact way of showing line changes and a few
    lines of context. The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with *** or ---) are
    created with a trailing newline. This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The context diff format normally has a header for filenames and
    modification times. Any or all of these may be specified using
    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.
    If not specified, the strings default to blanks.

    Example:

    >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
    ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
    ...       end="")
    *** Original
    --- Current
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            started = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '--- {}{}{}'.format(tofile, todate, lineterm)

        first, last = group[0], group[-1]
        yield '***************' + lineterm

        file1_range = _format_range_context(first[1], last[2])
        yield '*** {} ****{}'.format(file1_range, lineterm)

        if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
            for tag, i1, i2, _, _ in group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefix[tag] + line

        file2_range = _format_range_context(first[3], last[4])
        yield '--- {} ----{}'.format(file2_range, lineterm)

        if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
            for tag, _, _, j1, j2 in group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefix[tag] + line

def _check_types(a, b, *args):
    # Checking types is weird, but the alternative is garbled output when
    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
    # without this check, passing filenames as bytes results in output like
    #   --- b'oldfile.txt'
    #   +++ b'newfile.txt'
    # because of how str.format() incorporates bytes objects.
    if a and not isinstance(a[0], str):
        raise TypeError('lines to compare must be str, not %s (%r)' %
                        (type(a[0]).__name__, a[0]))
    if b and not isinstance(b[0], str):
        raise TypeError('lines to compare must be str, not %s (%r)' %
                        (type(b[0]).__name__, b[0]))
    for arg in args:
        if not isinstance(arg, str):
            raise TypeError('all arguments must be str, not: %r' % (arg,))

def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
    r"""
    Compare `a` and `b`, two sequences of lines represented as bytes rather
    than str. This is a wrapper for `dfunc`, which is typically either
    unified_diff() or context_diff(). Inputs are losslessly converted to
    strings so that `dfunc` only has to worry about strings, and encoded
    back to bytes on return. This is necessary to compare files with
    unknown or inconsistent encoding. All other inputs (except `n`) must be
    bytes rather than str.
    """
    def decode(s):
        try:
            return s.decode('ascii', 'surrogateescape')
        except AttributeError as err:
            msg = ('all arguments must be bytes, not %s (%r)' %
                   (type(s).__name__, s))
            raise TypeError(msg) from err
    a = list(map(decode, a))
    b = list(map(decode, b))
    fromfile = decode(fromfile)
    tofile = decode(tofile)
    fromfiledate = decode(fromfiledate)
    tofiledate = decode(tofiledate)
    lineterm = decode(lineterm)

    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
    for line in lines:
        yield line.encode('ascii', 'surrogateescape')
| 1342 | |
| 1343 | def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): |
| 1344 | r""" |
| 1345 | Compare `a` and `b` (lists of strings); return a `Differ`-style delta. |
| 1346 | |
| 1347 | Optional keyword parameters `linejunk` and `charjunk` are for filter |
| 1348 | functions, or can be None: |
| 1349 | |
| 1350 | - linejunk: A function that should accept a single string argument and |
| 1351 | return true iff the string is junk. The default is None, and is |
| 1352 | recommended; the underlying SequenceMatcher class has an adaptive |
| 1353 | notion of "noise" lines. |
| 1354 | |
| 1355 | - charjunk: A function that accepts a character (string of length |
| 1356 | 1), and returns true iff the character is junk. The default is |
| 1357 | the module-level function IS_CHARACTER_JUNK, which filters out |
| 1358 | whitespace characters (a blank or tab; note: it's a bad idea to |
| 1359 | include newline in this!). |
| 1360 | |
| 1361 | Tools/scripts/ndiff.py is a command-line front-end to this function. |
| 1362 | |
| 1363 | Example: |
| 1364 | |
| 1365 | >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True), |
| 1366 | ... 'ore\ntree\nemu\n'.splitlines(keepends=True)) |
| 1367 | >>> print(''.join(diff), end="") |
| 1368 | - one |
| 1369 | ? ^ |
| 1370 | + ore |
| 1371 | ? ^ |
| 1372 | - two |
| 1373 | - three |
| 1374 | ? - |
| 1375 | + tree |
| 1376 | + emu |
| 1377 | """ |
| 1378 | return Differ(linejunk, charjunk).compare(a, b) |
| 1379 | |
| 1380 | def _mdiff(fromlines, tolines, context=None, linejunk=None, |
| 1381 | charjunk=IS_CHARACTER_JUNK): |
| 1382 | r"""Returns generator yielding marked up from/to side by side differences. |
| 1383 | |
| 1384 | Arguments: |
fromlines -- list of text lines to be compared to tolines
tolines -- list of text lines to be compared to fromlines
context -- number of context lines to display on each side of a
difference; if None, all from/to text lines will be generated.
| 1389 | linejunk -- passed on to ndiff (see ndiff documentation) |
| 1390 | charjunk -- passed on to ndiff (see ndiff documentation) |
| 1391 | |
This function returns an iterator which yields a tuple:
| 1393 | (from line tuple, to line tuple, boolean flag) |
| 1394 | |
| 1395 | from/to line tuple -- (line num, line text) |
| 1396 | line num -- integer or None (to indicate a context separation) |
| 1397 | line text -- original line text with following markers inserted: |
| 1398 | '\0+' -- marks start of added text |
| 1399 | '\0-' -- marks start of deleted text |
| 1400 | '\0^' -- marks start of changed text |
| 1401 | '\1' -- marks end of added/deleted/changed text |
| 1402 | |
| 1403 | boolean flag -- None indicates context separation, True indicates |
| 1404 | either "from" or "to" line contains a change, otherwise False. |
| 1405 | |
This function/iterator was originally developed to generate side by side
file differences for making HTML pages (see the HtmlDiff class for
example usage).
| 1409 | |
| 1410 | Note, this function utilizes the ndiff function to generate the side by |
| 1411 | side difference markup. Optional ndiff arguments may be passed to this |
| 1412 | function and they in turn will be passed to ndiff. |
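
For example (a small illustrative run; '\0^'...'\1' bracket the changed
characters in the second pair):

>>> results = list(_mdiff(['one\n', 'two\n'], ['one\n', 'too\n']))
>>> results[0]
((1, 'one\n'), (1, 'one\n'), False)
>>> results[1]
((2, 't\x00^w\x01o\n'), (2, 't\x00^o\x01o\n'), True)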
| 1413 | """ |
| 1414 | import re |
| 1415 | |
| 1416 | # regular expression for finding intraline change indices |
| 1417 | change_re = re.compile(r'(\++|\-+|\^+)') |
| 1418 | |
| 1419 | # create the difference iterator to generate the differences |
| 1420 | diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) |
| 1421 | |
| 1422 | def _make_line(lines, format_key, side, num_lines=[0,0]): |
| 1423 | """Returns line of text with user's change markup and line formatting. |
| 1424 | |
| 1425 | lines -- list of lines from the ndiff generator to produce a line of |
| 1426 | text from. When producing the line of text to return, the |
| 1427 | lines used are removed from this list. |
| 1428 | format_key -- '+' return first line in list with "add" markup around |
| 1429 | the entire line. |
| 1430 | '-' return first line in list with "delete" markup around |
| 1431 | the entire line. |
| 1432 | '?' return first line in list with add/delete/change |
| 1433 | intraline markup (indices obtained from second line) |
| 1434 | None return first line in list with no markup |
side -- index into the num_lines list (0=from,1=to)
| 1436 | num_lines -- from/to current line number. This is NOT intended to be a |
| 1437 | passed parameter. It is present as a keyword argument to |
| 1438 | maintain memory of the current line numbers between calls |
| 1439 | of this function. |
| 1440 | |
| 1441 | Note, this function is purposefully not defined at the module scope so |
| 1442 | that data it needs from its parent function (within whose context it |
| 1443 | is defined) does not need to be of module scope. |
| 1444 | """ |
| 1445 | num_lines[side] += 1 |
| 1446 | # Handle case where no user markup is to be added, just return line of |
| 1447 | # text with user's line format to allow for usage of the line number. |
| 1448 | if format_key is None: |
| 1449 | return (num_lines[side],lines.pop(0)[2:]) |
| 1450 | # Handle case of intraline changes |
| 1451 | if format_key == '?': |
| 1452 | text, markers = lines.pop(0), lines.pop(0) |
| 1453 | # find intraline changes (store change type and indices in tuples) |
| 1454 | sub_info = [] |
| 1455 | def record_sub_info(match_object,sub_info=sub_info): |
| 1456 | sub_info.append([match_object.group(1)[0],match_object.span()]) |
| 1457 | return match_object.group(1) |
| 1458 | change_re.sub(record_sub_info,markers) |
| 1459 | # process each tuple inserting our special marks that won't be |
| 1460 | # noticed by an xml/html escaper. |
| 1461 | for key,(begin,end) in reversed(sub_info): |
| 1462 | text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:] |
| 1463 | text = text[2:] |
| 1464 | # Handle case of add/delete entire line |
| 1465 | else: |
| 1466 | text = lines.pop(0)[2:] |
| 1467 | # if line of text is just a newline, insert a space so there is |
| 1468 | # something for the user to highlight and see. |
| 1469 | if not text: |
| 1470 | text = ' ' |
| 1471 | # insert marks that won't be noticed by an xml/html escaper. |
| 1472 | text = '\0' + format_key + text + '\1' |
# Return the line of text; first allow the user's line formatter to do
# its thing (such as adding the line number) then replace the special
# marks with the user's change markup.
| 1476 | return (num_lines[side],text) |
| 1477 | |
| 1478 | def _line_iterator(): |
| 1479 | """Yields from/to lines of text with a change indication. |
| 1480 | |
| 1481 | This function is an iterator. It itself pulls lines from a |
| 1482 | differencing iterator, processes them and yields them. When it can |
| 1483 | it yields both a "from" and a "to" line, otherwise it will yield one |
| 1484 | or the other. In addition to yielding the lines of from/to text, a |
| 1485 | boolean flag is yielded to indicate if the text line(s) have |
| 1486 | differences in them. |
| 1487 | |
| 1488 | Note, this function is purposefully not defined at the module scope so |
| 1489 | that data it needs from its parent function (within whose context it |
| 1490 | is defined) does not need to be of module scope. |
| 1491 | """ |
| 1492 | lines = [] |
| 1493 | num_blanks_pending, num_blanks_to_yield = 0, 0 |
| 1494 | while True: |
# Load up next 4 lines so we can look ahead, create a string which
# is a concatenation of the first character of each of the 4 lines
# so we can do some very readable comparisons.
| 1498 | while len(lines) < 4: |
| 1499 | lines.append(next(diff_lines_iterator, 'X')) |
| 1500 | s = ''.join([line[0] for line in lines]) |
| 1501 | if s.startswith('X'): |
| 1502 | # When no more lines, pump out any remaining blank lines so the |
| 1503 | # corresponding add/delete lines get a matching blank line so |
| 1504 | # all line pairs get yielded at the next level. |
| 1505 | num_blanks_to_yield = num_blanks_pending |
| 1506 | elif s.startswith('-?+?'): |
| 1507 | # simple intraline change |
| 1508 | yield _make_line(lines,'?',0), _make_line(lines,'?',1), True |
| 1509 | continue |
| 1510 | elif s.startswith('--++'): |
| 1511 | # in delete block, add block coming: we do NOT want to get |
| 1512 | # caught up on blank lines yet, just process the delete line |
| 1513 | num_blanks_pending -= 1 |
| 1514 | yield _make_line(lines,'-',0), None, True |
| 1515 | continue |
| 1516 | elif s.startswith(('--?+', '--+', '- ')): |
| 1517 | # in delete block and see an intraline change or unchanged line |
| 1518 | # coming: yield the delete line and then blanks |
| 1519 | from_line,to_line = _make_line(lines,'-',0), None |
| 1520 | num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0 |
| 1521 | elif s.startswith('-+?'): |
| 1522 | # intraline change |
| 1523 | yield _make_line(lines,None,0), _make_line(lines,'?',1), True |
| 1524 | continue |
| 1525 | elif s.startswith('-?+'): |
| 1526 | # intraline change |
| 1527 | yield _make_line(lines,'?',0), _make_line(lines,None,1), True |
| 1528 | continue |
| 1529 | elif s.startswith('-'): |
| 1530 | # delete FROM line |
| 1531 | num_blanks_pending -= 1 |
| 1532 | yield _make_line(lines,'-',0), None, True |
| 1533 | continue |
| 1534 | elif s.startswith('+--'): |
| 1535 | # in add block, delete block coming: we do NOT want to get |
| 1536 | # caught up on blank lines yet, just process the add line |
| 1537 | num_blanks_pending += 1 |
| 1538 | yield None, _make_line(lines,'+',1), True |
| 1539 | continue |
| 1540 | elif s.startswith(('+ ', '+-')): |
| 1541 | # will be leaving an add block: yield blanks then add line |
| 1542 | from_line, to_line = None, _make_line(lines,'+',1) |
| 1543 | num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0 |
| 1544 | elif s.startswith('+'): |
| 1545 | # inside an add block, yield the add line |
| 1546 | num_blanks_pending += 1 |
| 1547 | yield None, _make_line(lines,'+',1), True |
| 1548 | continue |
| 1549 | elif s.startswith(' '): |
# unchanged text, yield it to both sides; note the copy of `lines`
# passed to the first call, so both calls consume the same line
| 1551 | yield _make_line(lines[:],None,0),_make_line(lines,None,1),False |
| 1552 | continue |
| 1553 | # Catch up on the blank lines so when we yield the next from/to |
| 1554 | # pair, they are lined up. |
while num_blanks_to_yield < 0:
num_blanks_to_yield += 1
yield None,('','\n'),True
while num_blanks_to_yield > 0:
num_blanks_to_yield -= 1
yield ('','\n'),None,True
| 1561 | if s.startswith('X'): |
| 1562 | return |
| 1563 | else: |
| 1564 | yield from_line,to_line,True |
| 1565 | |
| 1566 | def _line_pair_iterator(): |
| 1567 | """Yields from/to lines of text with a change indication. |
| 1568 | |
| 1569 | This function is an iterator. It itself pulls lines from the line |
| 1570 | iterator. Its difference from that iterator is that this function |
| 1571 | always yields a pair of from/to text lines (with the change |
| 1572 | indication). If necessary it will collect single from/to lines |
until it has a matching from/to pair to yield.
| 1574 | |
| 1575 | Note, this function is purposefully not defined at the module scope so |
| 1576 | that data it needs from its parent function (within whose context it |
| 1577 | is defined) does not need to be of module scope. |
| 1578 | """ |
| 1579 | line_iterator = _line_iterator() |
| 1580 | fromlines,tolines=[],[] |
| 1581 | while True: |
| 1582 | # Collecting lines of text until we have a from/to pair |
while not fromlines or not tolines:
| 1584 | try: |
| 1585 | from_line, to_line, found_diff = next(line_iterator) |
| 1586 | except StopIteration: |
| 1587 | return |
| 1588 | if from_line is not None: |
| 1589 | fromlines.append((from_line,found_diff)) |
| 1590 | if to_line is not None: |
| 1591 | tolines.append((to_line,found_diff)) |
| 1592 | # Once we have a pair, remove them from the collection and yield it |
from_line, from_diff = fromlines.pop(0)
to_line, to_diff = tolines.pop(0)
yield (from_line,to_line,from_diff or to_diff)
| 1596 | |
| 1597 | # Handle case where user does not want context differencing, just yield |
| 1598 | # them up without doing anything else with them. |
| 1599 | line_pair_iterator = _line_pair_iterator() |
| 1600 | if context is None: |
| 1601 | yield from line_pair_iterator |
| 1602 | # Handle case where user wants context differencing. We must do some |
| 1603 | # storage of lines until we know for sure that they are to be yielded. |
| 1604 | else: |
| 1605 | context += 1 |
| 1606 | lines_to_write = 0 |
| 1607 | while True: |
| 1608 | # Store lines up until we find a difference, note use of a |
| 1609 | # circular queue because we only need to keep around what |
| 1610 | # we need for context. |
index, context_lines = 0, [None]*context
| 1612 | found_diff = False |
while found_diff is False:
| 1614 | try: |
| 1615 | from_line, to_line, found_diff = next(line_pair_iterator) |
| 1616 | except StopIteration: |
| 1617 | return |
| 1618 | i = index % context |
context_lines[i] = (from_line, to_line, found_diff)
| 1620 | index += 1 |
| 1621 | # Yield lines that we have collected so far, but first yield |
| 1622 | # the user's separator. |
| 1623 | if index > context: |
| 1624 | yield None, None, None |
| 1625 | lines_to_write = context |
| 1626 | else: |
| 1627 | lines_to_write = index |
| 1628 | index = 0 |
while lines_to_write:
i = index % context
index += 1
yield context_lines[i]
lines_to_write -= 1
| 1634 | # Now yield the context lines after the change |
| 1635 | lines_to_write = context-1 |
| 1636 | try: |
while lines_to_write:
| 1638 | from_line, to_line, found_diff = next(line_pair_iterator) |
| 1639 | # If another change within the context, extend the context |
| 1640 | if found_diff: |
| 1641 | lines_to_write = context-1 |
| 1642 | else: |
| 1643 | lines_to_write -= 1 |
| 1644 | yield from_line, to_line, found_diff |
| 1645 | except StopIteration: |
| 1646 | # Catch exception from next() and return normally |
| 1647 | return |
| 1648 | |
| 1649 | |
| 1650 | _file_template = """ |
| 1651 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" |
| 1652 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| 1653 | |
| 1654 | <html> |
| 1655 | |
| 1656 | <head> |
| 1657 | <meta http-equiv="Content-Type" |
| 1658 | content="text/html; charset=%(charset)s" /> |
| 1659 | <title></title> |
| 1660 | <style type="text/css">%(styles)s |
| 1661 | </style> |
| 1662 | </head> |
| 1663 | |
| 1664 | <body> |
| 1665 | %(table)s%(legend)s |
| 1666 | </body> |
| 1667 | |
| 1668 | </html>""" |
| 1669 | |
| 1670 | _styles = """ |
| 1671 | table.diff {font-family:Courier; border:medium;} |
| 1672 | .diff_header {background-color:#e0e0e0} |
| 1673 | td.diff_header {text-align:right} |
| 1674 | .diff_next {background-color:#c0c0c0} |
| 1675 | .diff_add {background-color:#aaffaa} |
| 1676 | .diff_chg {background-color:#ffff77} |
| 1677 | .diff_sub {background-color:#ffaaaa}""" |
| 1678 | |
| 1679 | _table_template = """ |
| 1680 | <table class="diff" id="difflib_chg_%(prefix)s_top" |
| 1681 | cellspacing="0" cellpadding="0" rules="groups" > |
| 1682 | <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> |
| 1683 | <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> |
| 1684 | %(header_row)s |
| 1685 | <tbody> |
| 1686 | %(data_rows)s </tbody> |
| 1687 | </table>""" |
| 1688 | |
| 1689 | _legend = """ |
| 1690 | <table class="diff" summary="Legends"> |
| 1691 | <tr> <th colspan="2"> Legends </th> </tr> |
| 1692 | <tr> <td> <table border="" summary="Colors"> |
| 1693 | <tr><th> Colors </th> </tr> |
| 1694 | <tr><td class="diff_add"> Added </td></tr> |
| 1695 | <tr><td class="diff_chg">Changed</td> </tr> |
| 1696 | <tr><td class="diff_sub">Deleted</td> </tr> |
| 1697 | </table></td> |
| 1698 | <td> <table border="" summary="Links"> |
| 1699 | <tr><th colspan="2"> Links </th> </tr> |
| 1700 | <tr><td>(f)irst change</td> </tr> |
| 1701 | <tr><td>(n)ext change</td> </tr> |
| 1702 | <tr><td>(t)op</td> </tr> |
| 1703 | </table></td> </tr> |
| 1704 | </table>""" |
| 1705 | |
class HtmlDiff:
| 1707 | """For producing HTML side by side comparison with change highlights. |
| 1708 | |
| 1709 | This class can be used to create an HTML table (or a complete HTML file |
| 1710 | containing the table) showing a side by side, line by line comparison |
| 1711 | of text with inter-line and intra-line change highlights. The table can |
| 1712 | be generated in either full or contextual difference mode. |
| 1713 | |
| 1714 | The following methods are provided for HTML generation: |
| 1715 | |
| 1716 | make_table -- generates HTML for a single side by side table |
| 1717 | make_file -- generates complete HTML file with a single side by side table |
| 1718 | |
See Tools/scripts/diff.py for an example usage of this class.
| 1720 | """ |
| 1721 | |
| 1722 | _file_template = _file_template |
| 1723 | _styles = _styles |
| 1724 | _table_template = _table_template |
| 1725 | _legend = _legend |
| 1726 | _default_prefix = 0 |
| 1727 | |
| 1728 | def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, |
| 1729 | charjunk=IS_CHARACTER_JUNK): |
| 1730 | """HtmlDiff instance initializer |
| 1731 | |
| 1732 | Arguments: |
| 1733 | tabsize -- tab stop spacing, defaults to 8. |
| 1734 | wrapcolumn -- column number where lines are broken and wrapped, |
| 1735 | defaults to None where lines are not wrapped. |
| 1736 | linejunk,charjunk -- keyword arguments passed into ndiff() (used by |
| 1737 | HtmlDiff() to generate the side by side HTML differences). See |
| 1738 | ndiff() documentation for argument default values and descriptions. |
| 1739 | """ |
| 1740 | self._tabsize = tabsize |
| 1741 | self._wrapcolumn = wrapcolumn |
| 1742 | self._linejunk = linejunk |
| 1743 | self._charjunk = charjunk |
| 1744 | |
| 1745 | def make_file(self, fromlines, tolines, fromdesc='', todesc='', |
| 1746 | context=False, numlines=5, *, charset='utf-8'): |
| 1747 | """Returns HTML file of side by side comparison with change highlights |
| 1748 | |
| 1749 | Arguments: |
| 1750 | fromlines -- list of "from" lines |
| 1751 | tolines -- list of "to" lines |
| 1752 | fromdesc -- "from" file column header string |
| 1753 | todesc -- "to" file column header string |
| 1754 | context -- set to True for contextual differences (defaults to False |
| 1755 | which shows full differences). |
| 1756 | numlines -- number of context lines. When context is set True, |
| 1757 | controls number of lines displayed before and after the change. |
| 1758 | When context is False, controls the number of lines to place |
| 1759 | the "next" link anchors before the next change (so click of |
| 1760 | "next" link jumps to just before the change). |
| 1761 | charset -- charset of the HTML document |
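
A minimal usage sketch (the return value is a complete HTML page):

>>> html = HtmlDiff().make_file(['one'], ['won'])
>>> '<table class="diff"' in html
True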
| 1762 | """ |
| 1763 | |
| 1764 | return (self._file_template % dict( |
| 1765 | styles=self._styles, |
| 1766 | legend=self._legend, |
| 1767 | table=self.make_table(fromlines, tolines, fromdesc, todesc, |
| 1768 | context=context, numlines=numlines), |
| 1769 | charset=charset |
| 1770 | )).encode(charset, 'xmlcharrefreplace').decode(charset) |
| 1771 | |
| 1772 | def _tab_newline_replace(self,fromlines,tolines): |
| 1773 | """Returns from/to line lists with tabs expanded and newlines removed. |
| 1774 | |
| 1775 | Instead of tab characters being replaced by the number of spaces |
| 1776 | needed to fill in to the next tab stop, this function will fill |
| 1777 | the space with tab characters. This is done so that the difference |
| 1778 | algorithms can identify changes in a file when tabs are replaced by |
| 1779 | spaces and vice versa. At the end of the HTML generation, the tab |
| 1780 | characters will be replaced with a nonbreakable space. |
| 1781 | """ |
| 1782 | def expand_tabs(line): |
| 1783 | # hide real spaces |
| 1784 | line = line.replace(' ','\0') |
| 1785 | # expand tabs into spaces |
| 1786 | line = line.expandtabs(self._tabsize) |
| 1787 | # replace spaces from expanded tabs back into tab characters |
| 1788 | # (we'll replace them with markup after we do differencing) |
| 1789 | line = line.replace(' ','\t') |
| 1790 | return line.replace('\0',' ').rstrip('\n') |
| 1791 | fromlines = [expand_tabs(line) for line in fromlines] |
| 1792 | tolines = [expand_tabs(line) for line in tolines] |
| 1793 | return fromlines,tolines |
| 1794 | |
| 1795 | def _split_line(self,data_list,line_num,text): |
| 1796 | """Builds list of text lines by splitting text lines at wrap point |
| 1797 | |
| 1798 | This function will determine if the input text line needs to be |
| 1799 | wrapped (split) into separate lines. If so, the first wrap point |
| 1800 | will be determined and the first line appended to the output |
| 1801 | text line list. This function is used recursively to handle |
| 1802 | the second part of the split line to further split it. |
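
For example (illustrative), with wrapcolumn=3 a six character line
splits once; the continuation line gets '>' instead of a line number:

>>> hd = HtmlDiff(wrapcolumn=3)
>>> out = []
>>> hd._split_line(out, 1, 'abcdef')
>>> out
[(1, 'abc'), ('>', 'def')]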
| 1803 | """ |
| 1804 | # if blank line or context separator, just add it to the output list |
| 1805 | if not line_num: |
| 1806 | data_list.append((line_num,text)) |
| 1807 | return |
| 1808 | |
# if line text doesn't need wrapping, just add it to the output list;
# each '\0' mark is followed by a one-character change key and closed
# by a '\1', so three characters per mark are markup rather than text
size = len(text)
max_len = self._wrapcolumn
if size <= max_len or (size - text.count('\0')*3) <= max_len:
data_list.append((line_num,text))
return
| 1815 | |
| 1816 | # scan text looking for the wrap point, keeping track if the wrap |
| 1817 | # point is inside markers |
| 1818 | i = 0 |
| 1819 | n = 0 |
| 1820 | mark = '' |
while n < max_len and i < size:
| 1822 | if text[i] == '\0': |
| 1823 | i += 1 |
| 1824 | mark = text[i] |
| 1825 | i += 1 |
| 1826 | elif text[i] == '\1': |
| 1827 | i += 1 |
| 1828 | mark = '' |
| 1829 | else: |
| 1830 | i += 1 |
| 1831 | n += 1 |
| 1832 | |
| 1833 | # wrap point is inside text, break it up into separate lines |
| 1834 | line1 = text[:i] |
| 1835 | line2 = text[i:] |
| 1836 | |
| 1837 | # if wrap point is inside markers, place end marker at end of first |
| 1838 | # line and start marker at beginning of second line because each |
| 1839 | # line will have its own table tag markup around it. |
| 1840 | if mark: |
| 1841 | line1 = line1 + '\1' |
| 1842 | line2 = '\0' + mark + line2 |
| 1843 | |
| 1844 | # tack on first line onto the output list |
| 1845 | data_list.append((line_num,line1)) |
| 1846 | |
| 1847 | # use this routine again to wrap the remaining text |
| 1848 | self._split_line(data_list,'>',line2) |
| 1849 | |
| 1850 | def _line_wrapper(self,diffs): |
| 1851 | """Returns iterator that splits (wraps) mdiff text lines""" |
| 1852 | |
| 1853 | # pull from/to data and flags from mdiff iterator |
| 1854 | for fromdata,todata,flag in diffs: |
| 1855 | # check for context separators and pass them through |
| 1856 | if flag is None: |
| 1857 | yield fromdata,todata,flag |
| 1858 | continue |
| 1859 | (fromline,fromtext),(toline,totext) = fromdata,todata |
| 1860 | # for each from/to line split it at the wrap column to form |
| 1861 | # list of text lines. |
| 1862 | fromlist,tolist = [],[] |
| 1863 | self._split_line(fromlist,fromline,fromtext) |
| 1864 | self._split_line(tolist,toline,totext) |
| 1865 | # yield from/to line in pairs inserting blank lines as |
| 1866 | # necessary when one side has more wrapped lines |
| 1867 | while fromlist or tolist: |
| 1868 | if fromlist: |
| 1869 | fromdata = fromlist.pop(0) |
| 1870 | else: |
| 1871 | fromdata = ('',' ') |
| 1872 | if tolist: |
| 1873 | todata = tolist.pop(0) |
| 1874 | else: |
| 1875 | todata = ('',' ') |
| 1876 | yield fromdata,todata,flag |
| 1877 | |
| 1878 | def _collect_lines(self,diffs): |
| 1879 | """Collects mdiff output into separate lists |
| 1880 | |
| 1881 | Before storing the mdiff from/to data into a list, it is converted |
| 1882 | into a single line of text with HTML markup. |
| 1883 | """ |
| 1884 | |
| 1885 | fromlist,tolist,flaglist = [],[],[] |
| 1886 | # pull from/to data and flags from mdiff style iterator |
| 1887 | for fromdata,todata,flag in diffs: |
| 1888 | try: |
| 1889 | # store HTML markup of the lines into the lists |
| 1890 | fromlist.append(self._format_line(0,flag,*fromdata)) |
| 1891 | tolist.append(self._format_line(1,flag,*todata)) |
| 1892 | except TypeError: |
| 1893 | # exceptions occur for lines where context separators go |
| 1894 | fromlist.append(None) |
| 1895 | tolist.append(None) |
| 1896 | flaglist.append(flag) |
| 1897 | return fromlist,tolist,flaglist |
| 1898 | |
| 1899 | def _format_line(self,side,flag,linenum,text): |
| 1900 | """Returns HTML markup of "from" / "to" text lines |
| 1901 | |
| 1902 | side -- 0 or 1 indicating "from" or "to" text |
| 1903 | flag -- indicates if difference on line |
| 1904 | linenum -- line number (used for line number column) |
| 1905 | text -- line text to be marked up |
| 1906 | """ |
| 1907 | try: |
| 1908 | linenum = '%d' % linenum |
| 1909 | id = ' id="%s%s"' % (self._prefix[side],linenum) |
| 1910 | except TypeError: |
| 1911 | # handle blank lines where linenum is '>' or '' |
| 1912 | id = '' |
# replace those things that would get confused with HTML symbols
text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")

# make spaces non-breaking so they don't get compressed or line wrapped
text = text.replace(' ','&nbsp;').rstrip()
| 1918 | |
| 1919 | return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \ |
| 1920 | % (id,linenum,text) |
| 1921 | |
| 1922 | def _make_prefix(self): |
| 1923 | """Create unique anchor prefixes""" |
| 1924 | |
| 1925 | # Generate a unique anchor prefix so multiple tables |
| 1926 | # can exist on the same HTML page without conflicts. |
| 1927 | fromprefix = "from%d_" % HtmlDiff._default_prefix |
| 1928 | toprefix = "to%d_" % HtmlDiff._default_prefix |
| 1929 | HtmlDiff._default_prefix += 1 |
| 1930 | # store prefixes so line format method has access |
| 1931 | self._prefix = [fromprefix,toprefix] |
| 1932 | |
| 1933 | def _convert_flags(self,fromlist,tolist,flaglist,context,numlines): |
| 1934 | """Makes list of "next" links""" |
| 1935 | |
| 1936 | # all anchor names will be generated using the unique "to" prefix |
| 1937 | toprefix = self._prefix[1] |
| 1938 | |
| 1939 | # process change flags, generating middle column of next anchors/links |
| 1940 | next_id = ['']*len(flaglist) |
| 1941 | next_href = ['']*len(flaglist) |
| 1942 | num_chg, in_change = 0, False |
| 1943 | last = 0 |
| 1944 | for i,flag in enumerate(flaglist): |
| 1945 | if flag: |
| 1946 | if not in_change: |
| 1947 | in_change = True |
| 1948 | last = i |
| 1949 | # at the beginning of a change, drop an anchor a few lines |
| 1950 | # (the context lines) before the change for the previous |
| 1951 | # link |
i = max(0, i-numlines)
| 1953 | next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg) |
| 1954 | # at the beginning of a change, drop a link to the next |
| 1955 | # change |
| 1956 | num_chg += 1 |
| 1957 | next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % ( |
| 1958 | toprefix,num_chg) |
| 1959 | else: |
| 1960 | in_change = False |
| 1961 | # check for cases where there is no content to avoid exceptions |
| 1962 | if not flaglist: |
| 1963 | flaglist = [False] |
| 1964 | next_id = [''] |
| 1965 | next_href = [''] |
| 1966 | last = 0 |
| 1967 | if context: |
fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
| 1969 | tolist = fromlist |
| 1970 | else: |
fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
| 1972 | # if not a change on first line, drop a link |
| 1973 | if not flaglist[0]: |
| 1974 | next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix |
| 1975 | # redo the last link to link to the top |
| 1976 | next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix) |
| 1977 | |
| 1978 | return fromlist,tolist,flaglist,next_href,next_id |
| 1979 | |
| 1980 | def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, |
| 1981 | numlines=5): |
| 1982 | """Returns HTML table of side by side comparison with change highlights |
| 1983 | |
| 1984 | Arguments: |
| 1985 | fromlines -- list of "from" lines |
| 1986 | tolines -- list of "to" lines |
| 1987 | fromdesc -- "from" file column header string |
| 1988 | todesc -- "to" file column header string |
| 1989 | context -- set to True for contextual differences (defaults to False |
| 1990 | which shows full differences). |
| 1991 | numlines -- number of context lines. When context is set True, |
| 1992 | controls number of lines displayed before and after the change. |
| 1993 | When context is False, controls the number of lines to place |
| 1994 | the "next" link anchors before the next change (so click of |
| 1995 | "next" link jumps to just before the change). |
| 1996 | """ |
| 1997 | |
| 1998 | # make unique anchor prefixes so that multiple tables may exist |
| 1999 | # on the same page without conflict. |
| 2000 | self._make_prefix() |
| 2001 | |
| 2002 | # change tabs to spaces before it gets more difficult after we insert |
| 2003 | # markup |
| 2004 | fromlines,tolines = self._tab_newline_replace(fromlines,tolines) |
| 2005 | |
| 2006 | # create diffs iterator which generates side by side from/to data |
| 2007 | if context: |
| 2008 | context_lines = numlines |
| 2009 | else: |
| 2010 | context_lines = None |
| 2011 | diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, |
| 2012 | charjunk=self._charjunk) |
| 2013 | |
| 2014 | # set up iterator to wrap lines that exceed desired width |
| 2015 | if self._wrapcolumn: |
| 2016 | diffs = self._line_wrapper(diffs) |
| 2017 | |
| 2018 | # collect up from/to lines and flags into lists (also format the lines) |
| 2019 | fromlist,tolist,flaglist = self._collect_lines(diffs) |
| 2020 | |
| 2021 | # process change flags, generating middle column of next anchors/links |
| 2022 | fromlist,tolist,flaglist,next_href,next_id = self._convert_flags( |
| 2023 | fromlist,tolist,flaglist,context,numlines) |
| 2024 | |
| 2025 | s = [] |
| 2026 | fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \ |
| 2027 | '<td class="diff_next">%s</td>%s</tr>\n' |
| 2028 | for i in range(len(flaglist)): |
| 2029 | if flaglist[i] is None: |
# mdiff yields None on separator lines; skip the bogus ones
# generated for the first line
| 2032 | if i > 0: |
| 2033 | s.append(' </tbody> \n <tbody>\n') |
| 2034 | else: |
| 2035 | s.append( fmt % (next_id[i],next_href[i],fromlist[i], |
| 2036 | next_href[i],tolist[i])) |
| 2037 | if fromdesc or todesc: |
| 2038 | header_row = '<thead><tr>%s%s%s%s</tr></thead>' % ( |
| 2039 | '<th class="diff_next"><br /></th>', |
| 2040 | '<th colspan="2" class="diff_header">%s</th>' % fromdesc, |
| 2041 | '<th class="diff_next"><br /></th>', |
| 2042 | '<th colspan="2" class="diff_header">%s</th>' % todesc) |
| 2043 | else: |
| 2044 | header_row = '' |
| 2045 | |
| 2046 | table = self._table_template % dict( |
| 2047 | data_rows=''.join(s), |
| 2048 | header_row=header_row, |
| 2049 | prefix=self._prefix[1]) |
| 2050 | |
| 2051 | return table.replace('\0+','<span class="diff_add">'). \ |
| 2052 | replace('\0-','<span class="diff_sub">'). \ |
| 2053 | replace('\0^','<span class="diff_chg">'). \ |
| 2054 | replace('\1','</span>'). \ |
replace('\t','&nbsp;')
| 2056 | |
# The module-level import of re earlier in the file was only needed at
# definition time (for IS_LINE_JUNK's default argument); drop it from
# the module namespace now that the module is fully defined.
del re
| 2058 | |
| 2059 | def restore(delta, which): |
| 2060 | r""" |
| 2061 | Generate one of the two sequences that generated a delta. |
| 2062 | |
| 2063 | Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract |
| 2064 | lines originating from file 1 or 2 (parameter `which`), stripping off line |
| 2065 | prefixes. |
| 2066 | |
| 2067 | Examples: |
| 2068 | |
| 2069 | >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True), |
| 2070 | ... 'ore\ntree\nemu\n'.splitlines(keepends=True)) |
| 2071 | >>> diff = list(diff) |
| 2072 | >>> print(''.join(restore(diff, 1)), end="") |
| 2073 | one |
| 2074 | two |
| 2075 | three |
| 2076 | >>> print(''.join(restore(diff, 2)), end="") |
| 2077 | ore |
| 2078 | tree |
| 2079 | emu |
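
Values of `which` other than 1 or 2 are rejected:

>>> next(restore(diff, 3))
Traceback (most recent call last):
  ...
ValueError: unknown delta choice (must be 1 or 2): 3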
| 2080 | """ |
| 2081 | try: |
| 2082 | tag = {1: "- ", 2: "+ "}[int(which)] |
| 2083 | except KeyError: |
| 2084 | raise ValueError('unknown delta choice (must be 1 or 2): %r' |
| 2085 | % which) from None |
| 2086 | prefixes = (" ", tag) |
| 2087 | for line in delta: |
| 2088 | if line[:2] in prefixes: |
| 2089 | yield line[2:] |
| 2090 | |
| 2091 | def _test(): |
| 2092 | import doctest, difflib |
| 2093 | return doctest.testmod(difflib) |
| 2094 | |
| 2095 | if __name__ == "__main__": |
| 2096 | _test() |