Load all OpenType sub-structures lazily

Huge win for the subsetter and anyone else who doesn't need the entire
tables.  Subsetting a huge font (e.g. NotoSans-Regular.ttf) to a
small set (say, the main Cyrillic characters) is something like six
times faster now.  The bulk of the time was being spent blowing up the
GPOS kerning pairs and attach points.  Now we don't load those if
they won't be in the final subset.

Slight slowdown for use cases that need the entire table.
diff --git a/Lib/fontTools/ttLib/tables/otBase.py b/Lib/fontTools/ttLib/tables/otBase.py
index 6afb5c7..f752845 100644
--- a/Lib/fontTools/ttLib/tables/otBase.py
+++ b/Lib/fontTools/ttLib/tables/otBase.py
@@ -491,8 +491,8 @@
 class TableStack:
 	"""A stack of table dicts, working as a stack of namespaces so we can
 	retrieve values from (and store values to) tables higher up the stack."""
-	def __init__(self):
-		self.stack = []
+	def __init__(self, other=None):
+		self.stack = other.stack[:] if other else []
 	def push(self, table):
 		self.stack.append(table)
 	def pop(self):
@@ -537,9 +537,7 @@
 			# this guards against self.decompile NOT setting compileStatus to other than 1.
 			raise AttributeError, attr 
 		if self.compileStatus == 1:
-			# table.read() has been called, but table has not yet been decompiled
-			# This happens only for extension tables.
-			self.decompile(self.reader, self.font)
+			self.ensureDecompiled()
 			val = getattr(self, attr)
 			self.recurse -=1
 			return val
@@ -581,6 +579,12 @@
 		self.postRead(table, font)
 		del self.__rawTable  # succeeded, get rid of debugging info
 
+	def ensureDecompiled(self):
+		if self.compileStatus != 1:
+			return
+		self.decompile(self.reader, self.font, self.tableStack)
+		del self.reader, self.font, self.tableStack
+
 	def preCompile(self):
 		pass # used only by the LookupList class
 
@@ -626,6 +630,7 @@
 		self.__dict__.update(table)
 	
 	def preWrite(self, font):
+		self.ensureDecompiled()
 		return self.__dict__.copy()
 	
 	def toXML(self, xmlWriter, font, attrs=None):
@@ -673,6 +678,8 @@
 		if type(self) != type(other): return cmp(type(self), type(other))
 		if self.__class__ != other.__class__: return cmp(self.__class__, other.__class__)
 
+		self.ensureDecompiled()
+
 		return cmp(self.__dict__, other.__dict__)