Load all OpenType sub-structures lazily

Huge win for the subsetter and anyone else who doesn't need entire
tables.  Subsetting a huge font (e.g. NotoSans-Regular.ttf) to a
small set (say, the main Cyrillic characters) is something like six
times faster now.  The bulk of the time was being spent blowing up
the GPOS kerning pairs and attach points.  Now we don't load those if
they won't be in the final subset.

Slight slowdown for use cases that need the entire table.
diff --git a/Lib/fontTools/ttLib/tables/otBase.py b/Lib/fontTools/ttLib/tables/otBase.py
index 6afb5c7..f752845 100644
--- a/Lib/fontTools/ttLib/tables/otBase.py
+++ b/Lib/fontTools/ttLib/tables/otBase.py
@@ -491,8 +491,8 @@
 class TableStack:
 	"""A stack of table dicts, working as a stack of namespaces so we can
 	retrieve values from (and store values to) tables higher up the stack."""
-	def __init__(self):
-		self.stack = []
+	def __init__(self, other=None):
+		self.stack = other.stack[:] if other else []
 	def push(self, table):
 		self.stack.append(table)
 	def pop(self):
@@ -537,9 +537,7 @@
 			# this guards against self.decompile NOT setting compileStatus to other than 1.
 			raise AttributeError, attr 
 		if self.compileStatus == 1:
-			# table.read() has been called, but table has not yet been decompiled
-			# This happens only for extension tables.
-			self.decompile(self.reader, self.font)
+			self.ensureDecompiled()
 			val = getattr(self, attr)
 			self.recurse -=1
 			return val
@@ -581,6 +579,12 @@
 		self.postRead(table, font)
 		del self.__rawTable  # succeeded, get rid of debugging info
 
+	def ensureDecompiled(self):
+		if self.compileStatus != 1:
+			return
+		self.decompile(self.reader, self.font, self.tableStack)
+		del self.reader, self.font, self.tableStack
+
 	def preCompile(self):
 		pass # used only by the LookupList class
 
@@ -626,6 +630,7 @@
 		self.__dict__.update(table)
 	
 	def preWrite(self, font):
+		self.ensureDecompiled()
 		return self.__dict__.copy()
 	
 	def toXML(self, xmlWriter, font, attrs=None):
@@ -673,6 +678,8 @@
 		if type(self) != type(other): return cmp(type(self), type(other))
 		if self.__class__ != other.__class__: return cmp(self.__class__, other.__class__)
 
+		self.ensureDecompiled()
+
 		return cmp(self.__dict__, other.__dict__)
 
 
diff --git a/Lib/fontTools/ttLib/tables/otConverters.py b/Lib/fontTools/ttLib/tables/otConverters.py
index 11a8773..047babb 100644
--- a/Lib/fontTools/ttLib/tables/otConverters.py
+++ b/Lib/fontTools/ttLib/tables/otConverters.py
@@ -1,5 +1,6 @@
 from types import TupleType
 from fontTools.misc.textTools import safeEval
+from otBase import TableStack
 
 
 def buildConverters(tableSpec, tableNamespace):
@@ -169,7 +170,16 @@
 			return None
 		subReader = reader.getSubReader(offset)
 		table = self.tableClass()
-		table.decompile(subReader, font, tableStack)
+		# For now, we lazy-decompile all tables.  Perhaps we should
+		# use a more sophisticated heuristic here.
+		if 1:
+			# Lazy decompile
+			table.reader = subReader
+			table.font = font
+			table.compileStatus = 1
+			table.tableStack = TableStack(tableStack)
+		else:
+			table.decompile(subReader, font, tableStack)
 		return table
 	
 	def write(self, writer, font, tableStack, value, repeatIndex=None):
@@ -206,6 +216,7 @@
 		table.reader = subReader
 		table.font = font
 		table.compileStatus = 1
+		table.tableStack = TableStack(tableStack)
 		table.start = table.reader.offset
 		return table