cPickle.c:  Full support for the new LONG1 and LONG4.  Added comments.
Assorted code cleanups; e.g., sizeof(char) is 1 by definition, so there's
no need to do things like multiply by sizeof(char) in hairy malloc
arguments.  Fixed an undetected-overflow bug in readline_file().

longobject.c:  Fixed a really stupid bug in the new _PyLong_NumBits.

pickle.py:  Fixed stupid bug in save_long():  When proto is 2, it
wrote LONG1 or LONG4, but then forgot to return -- it fell through and
appended the proto 1 LONG opcode too.
Fixed equally stupid bugs in load_long1() and load_long4() that
cancelled out the save_long() bug:  they *returned* the unpickled long
instead of pushing it on the stack.  The return values were ignored.
Tests passed before only because save_long() pickled the long twice.

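Fixed bugs in encode_long():  the sign-extension branch sliced the long
x instead of its hex string, and the hex string is now zero-padded to
the expected number of nibbles before the sign check.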

Noted that decode_long() is quadratic-time despite our hopes,
because long(string, 16) is still quadratic-time in len(string).
It's hex() that's linear-time.  I don't know a way to make decode_long()
linear-time in Python, short of maybe transforming the 256's-complement
bytes into marshal's funky internal format, and letting marshal decode
that.  It would be more valuable to make long(string, 16) linear time.

pickletester.py:  Added a global "protocols" vector so tests can try
all the protocols in a sane way.  Changed test_ints() and test_unicode()
to do so.  Added a new test_long(), but the tail end of it is disabled
because it "takes forever" under pickle.py (but runs very quickly under
cPickle:  cPickle proto 2 for longs is linear-time).
diff --git a/Lib/pickle.py b/Lib/pickle.py
index 92b4802..ba0e38b 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -553,6 +553,7 @@
                 self.write(LONG1 + chr(n) + bytes)
             else:
                 self.write(LONG4 + pack("<i", n) + bytes)
+            return
         self.write(LONG + `obj` + '\n')
     dispatch[LongType] = save_long
 
@@ -1042,13 +1043,13 @@
     def load_long1(self):
         n = ord(self.read(1))
         bytes = self.read(n)
-        return decode_long(bytes)
+        self.append(decode_long(bytes))
     dispatch[LONG1] = load_long1
 
     def load_long4(self):
         n = mloads('i' + self.read(4))
         bytes = self.read(n)
-        return decode_long(bytes)
+        self.append(decode_long(bytes))
     dispatch[LONG4] = load_long4
 
     def load_float(self):
@@ -1404,24 +1405,31 @@
         njunkchars = 2 + ashex.endswith('L')
         nibbles = len(ashex) - njunkchars
         if nibbles & 1:
-            # need an even # of nibbles for unhexlify
+            # Extend to a full byte.
             nibbles += 1
         nbits = nibbles * 4
         x += 1L << nbits
         assert x > 0
         ashex = hex(x)
-        if x >> (nbits - 1) == 0:
+        njunkchars = 2 + ashex.endswith('L')
+        newnibbles = len(ashex) - njunkchars
+        if newnibbles < nibbles:
+            ashex = "0x" + "0" * (nibbles - newnibbles) + ashex[2:]
+        if int(ashex[2], 16) < 8:
             # "looks positive", so need a byte of sign bits
-            ashex = "0xff" + x[2:]
+            ashex = "0xff" + ashex[2:]
 
     if ashex.endswith('L'):
         ashex = ashex[2:-1]
     else:
         ashex = ashex[2:]
-    assert len(ashex) & 1 == 0
+    assert len(ashex) & 1 == 0, (x, ashex)
     binary = _binascii.unhexlify(ashex)
     return binary[::-1]
 
+# XXX OOPS!  This is still quadratic-time.  While hex(n) is linear-time
+# XXX in the # of digits in n, long(s, 16) is still quadratic-time
+# XXX in len(s).
 def decode_long(data):
     r"""Decode a long from a two's complement little-endian binary string.
 
@@ -1445,7 +1453,7 @@
     if nbytes == 0:
         return 0L
     ashex = _binascii.hexlify(data[::-1])
-    n = long(ashex, 16)
+    n = long(ashex, 16) # quadratic time
     if data[-1] >= '\x80':
         n -= 1L << (nbytes * 8)
     return n
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 8211dcf..6615307 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -1,6 +1,11 @@
 import unittest
 from test.test_support import TestFailed, have_unicode, TESTFN
 
+# Tests that try a number of pickle protocols should have a
+#     for proto in protocols:
+# kind of outer loop.  Bump the 3 to 4 if/when protocol 3 is invented.
+protocols = range(3)
+
 class C:
     def __cmp__(self, other):
         return cmp(self.__dict__, other.__dict__)
@@ -28,6 +33,9 @@
 class use_metaclass(object):
     __metaclass__ = metaclass
 
+# DATA and BINDATA are the protocol 0 and protocol 1 pickles of the object
+# returned by create_data().
+
 # break into multiple strings to avoid confusing font-lock-mode
 DATA = """(lp1
 I0
@@ -210,20 +218,22 @@
         def test_unicode(self):
             endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
                         unicode('<\n>'),  unicode('<\\>')]
-            for u in endcases:
-                p = self.dumps(u)
-                u2 = self.loads(p)
-                self.assertEqual(u2, u)
+            for proto in protocols:
+                for u in endcases:
+                    p = self.dumps(u, proto)
+                    u2 = self.loads(p)
+                    self.assertEqual(u2, u)
 
     def test_ints(self):
         import sys
-        n = sys.maxint
-        while n:
-            for expected in (-n, n):
-                s = self.dumps(expected)
-                n2 = self.loads(s)
-                self.assertEqual(expected, n2)
-            n = n >> 1
+        for proto in protocols:
+            n = sys.maxint
+            while n:
+                for expected in (-n, n):
+                    s = self.dumps(expected, proto)
+                    n2 = self.loads(s)
+                    self.assertEqual(expected, n2)
+                n = n >> 1
 
     def test_maxint64(self):
         maxint64 = (1L << 63) - 1
@@ -235,6 +245,34 @@
         data = 'I' + str(maxint64) + 'JUNK\n.'
         self.assertRaises(ValueError, self.loads, data)
 
+    def test_long(self):
+        for proto in protocols:
+            # 256 bytes is where LONG4 begins
+            for nbits in 1, 8, 8*254, 8*255, 8*256, 8*257:
+                nbase = 1L << nbits
+                for npos in nbase-1, nbase, nbase+1:
+                    for n in npos, -npos:
+                        pickle = self.dumps(n, proto)
+                        got = self.loads(pickle)
+                        self.assertEqual(n, got)
+        # Try a monster.  This is quadratic-time in protos 0 & 1, so don't
+        # bother with those.
+        # XXX Damn.  pickle.py is still quadratic-time here, due to
+        # XXX long(string, 16).  cPickle runs this in an eyeblink, but I
+        # XXX gave up waiting for pickle.py to get beyond "loading".  Giving
+        # XXX up for now.
+        return
+        print "building long"
+        nbase = long("deadbeeffeedface", 16)
+        nbase += nbase << 1000000
+        for n in nbase, -nbase:
+            print "dumping"
+            p = self.dumps(n, 2)
+            print "loading"
+            got = self.loads(p)
+            print "checking"
+            self.assertEqual(n, got)
+
     def test_reduce(self):
         pass