Add LZO1X algorithm to the kernel

This is a hybrid version of the patch to add the LZO1X compression
algorithm to the kernel.  Nitin and myself have merged the best parts of
the various patches to form this version which we're both happy with (and
are jointly signing off).

The performance of this version is equivalent to the original minilzo code
it was based on.  Bytecode comparisons have also been made on ARM, i386 and
x86_64 with favourable results.

There are several users of LZO lined up including jffs2, crypto and reiser4
since its much faster than zlib.

Signed-off-by: Nitin Gupta <nitingupta910@gmail.com>
Signed-off-by: Richard Purdie <rpurdie@openedhand.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/include/linux/lzo.h b/include/linux/lzo.h
new file mode 100644
index 0000000..582d8b7
--- /dev/null
+++ b/include/linux/lzo.h
@@ -0,0 +1,44 @@
+#ifndef __LZO_H__
+#define __LZO_H__
+/*
+ *  LZO Public Kernel Interface
+ *  A mini subset of the LZO real-time data compression library
+ *
+ *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ *  The full LZO package can be found at:
+ *  http://www.oberhumer.com/opensource/lzo/
+ *
+ *  Changed for kernel use by:
+ *  Nitin Gupta <nitingupta910@gmail.com>
+ *  Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define LZO1X_MEM_COMPRESS	(16384 * sizeof(unsigned char *))
+#define LZO1X_1_MEM_COMPRESS	LZO1X_MEM_COMPRESS
+
+#define lzo1x_worst_compress(x) (x + (x / 64) + 16 + 3)
+
+/* This requires 'workmem' of size LZO1X_1_MEM_COMPRESS */
+int lzo1x_1_compress(const unsigned char *src, size_t src_len,
+			unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+/* safe decompression with overrun testing */
+int lzo1x_decompress_safe(const unsigned char *src, size_t src_len,
+			unsigned char *dst, size_t *dst_len);
+
+/*
+ * Return values (< 0 = Error)
+ */
+#define LZO_E_OK			0
+#define LZO_E_ERROR			(-1)
+#define LZO_E_OUT_OF_MEMORY		(-2)
+#define LZO_E_NOT_COMPRESSIBLE		(-3)
+#define LZO_E_INPUT_OVERRUN		(-4)
+#define LZO_E_OUTPUT_OVERRUN		(-5)
+#define LZO_E_LOOKBEHIND_OVERRUN	(-6)
+#define LZO_E_EOF_NOT_FOUND		(-7)
+#define LZO_E_INPUT_NOT_CONSUMED	(-8)
+#define LZO_E_NOT_YET_IMPLEMENTED	(-9)
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 2e7ae6b..3eb29d5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -64,6 +64,12 @@
 config ZLIB_DEFLATE
 	tristate
 
+config LZO_COMPRESS
+	tristate
+
+config LZO_DECOMPRESS
+	tristate
+
 #
 # Generic allocator support is selected if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index c8c8e20..d1b366b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -49,6 +49,8 @@
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
+obj-$(CONFIG_LZO_COMPRESS) += lzo/
+obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
 
 obj-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
diff --git a/lib/lzo/Makefile b/lib/lzo/Makefile
new file mode 100644
index 0000000..e764116
--- /dev/null
+++ b/lib/lzo/Makefile
@@ -0,0 +1,5 @@
+lzo_compress-objs := lzo1x_compress.o
+lzo_decompress-objs := lzo1x_decompress.o
+
+obj-$(CONFIG_LZO_COMPRESS) += lzo_compress.o
+obj-$(CONFIG_LZO_DECOMPRESS) += lzo_decompress.o
diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
new file mode 100644
index 0000000..c935f00
--- /dev/null
+++ b/lib/lzo/lzo1x_compress.c
@@ -0,0 +1,226 @@
+/*
+ *  LZO1X Compressor from MiniLZO
+ *
+ *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ *  The full LZO package can be found at:
+ *  http://www.oberhumer.com/opensource/lzo/
+ *
+ *  Changed for kernel use by:
+ *  Nitin Gupta <nitingupta910@gmail.com>
+ *  Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lzo.h>
+#include <asm/unaligned.h>
+#include "lzodefs.h"
+
+static noinline size_t
+_lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
+		unsigned char *out, size_t *out_len, void *wrkmem)
+{
+	const unsigned char * const in_end = in + in_len;
+	const unsigned char * const ip_end = in + in_len - M2_MAX_LEN - 5;
+	const unsigned char ** const dict = wrkmem;
+	const unsigned char *ip = in, *ii = ip;
+	const unsigned char *end, *m, *m_pos;
+	size_t m_off, m_len, dindex;
+	unsigned char *op = out;
+
+	ip += 4;
+
+	for (;;) {
+		dindex = ((0x21 * DX3(ip, 5, 5, 6)) >> 5) & D_MASK;
+		m_pos = dict[dindex];
+
+		if (m_pos < in)
+			goto literal;
+
+		if (ip == m_pos || (ip - m_pos) > M4_MAX_OFFSET)
+			goto literal;
+
+		m_off = ip - m_pos;
+		if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+			goto try_match;
+
+		dindex = (dindex & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f);
+		m_pos = dict[dindex];
+
+		if (m_pos < in)
+			goto literal;
+
+		if (ip == m_pos || (ip - m_pos) > M4_MAX_OFFSET)
+			goto literal;
+
+		m_off = ip - m_pos;
+		if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+			goto try_match;
+
+		goto literal;
+
+try_match:
+		if (get_unaligned((const unsigned short *)m_pos)
+				== get_unaligned((const unsigned short *)ip)) {
+			if (likely(m_pos[2] == ip[2]))
+					goto match;
+		}
+
+literal:
+		dict[dindex] = ip;
+		++ip;
+		if (unlikely(ip >= ip_end))
+			break;
+		continue;
+
+match:
+		dict[dindex] = ip;
+		if (ip != ii) {
+			size_t t = ip - ii;
+
+			if (t <= 3) {
+				op[-2] |= t;
+			} else if (t <= 18) {
+				*op++ = (t - 3);
+			} else {
+				size_t tt = t - 18;
+
+				*op++ = 0;
+				while (tt > 255) {
+					tt -= 255;
+					*op++ = 0;
+				}
+				*op++ = tt;
+			}
+			do {
+				*op++ = *ii++;
+			} while (--t > 0);
+		}
+
+		ip += 3;
+		if (m_pos[3] != *ip++ || m_pos[4] != *ip++
+				|| m_pos[5] != *ip++ || m_pos[6] != *ip++
+				|| m_pos[7] != *ip++ || m_pos[8] != *ip++) {
+			--ip;
+			m_len = ip - ii;
+
+			if (m_off <= M2_MAX_OFFSET) {
+				m_off -= 1;
+				*op++ = (((m_len - 1) << 5)
+						| ((m_off & 7) << 2));
+				*op++ = (m_off >> 3);
+			} else if (m_off <= M3_MAX_OFFSET) {
+				m_off -= 1;
+				*op++ = (M3_MARKER | (m_len - 2));
+				goto m3_m4_offset;
+			} else {
+				m_off -= 0x4000;
+
+				*op++ = (M4_MARKER | ((m_off & 0x4000) >> 11)
+						| (m_len - 2));
+				goto m3_m4_offset;
+			}
+		} else {
+			end = in_end;
+			m = m_pos + M2_MAX_LEN + 1;
+
+			while (ip < end && *m == *ip) {
+				m++;
+				ip++;
+			}
+			m_len = ip - ii;
+
+			if (m_off <= M3_MAX_OFFSET) {
+				m_off -= 1;
+				if (m_len <= 33) {
+					*op++ = (M3_MARKER | (m_len - 2));
+				} else {
+					m_len -= 33;
+					*op++ = M3_MARKER | 0;
+					goto m3_m4_len;
+				}
+			} else {
+				m_off -= 0x4000;
+				if (m_len <= M4_MAX_LEN) {
+					*op++ = (M4_MARKER
+						| ((m_off & 0x4000) >> 11)
+						| (m_len - 2));
+				} else {
+					m_len -= M4_MAX_LEN;
+					*op++ = (M4_MARKER
+						| ((m_off & 0x4000) >> 11));
+m3_m4_len:
+					while (m_len > 255) {
+						m_len -= 255;
+						*op++ = 0;
+					}
+
+					*op++ = (m_len);
+				}
+			}
+m3_m4_offset:
+			*op++ = ((m_off & 63) << 2);
+			*op++ = (m_off >> 6);
+		}
+
+		ii = ip;
+		if (unlikely(ip >= ip_end))
+			break;
+	}
+
+	*out_len = op - out;
+	return in_end - ii;
+}
+
+int lzo1x_1_compress(const unsigned char *in, size_t in_len, unsigned char *out,
+			size_t *out_len, void *wrkmem)
+{
+	const unsigned char *ii;
+	unsigned char *op = out;
+	size_t t;
+
+	if (unlikely(in_len <= M2_MAX_LEN + 5)) {
+		t = in_len;
+	} else {
+		t = _lzo1x_1_do_compress(in, in_len, op, out_len, wrkmem);
+		op += *out_len;
+	}
+
+	if (t > 0) {
+		ii = in + in_len - t;
+
+		if (op == out && t <= 238) {
+			*op++ = (17 + t);
+		} else if (t <= 3) {
+			op[-2] |= t;
+		} else if (t <= 18) {
+			*op++ = (t - 3);
+		} else {
+			size_t tt = t - 18;
+
+			*op++ = 0;
+			while (tt > 255) {
+				tt -= 255;
+				*op++ = 0;
+			}
+
+			*op++ = tt;
+		}
+		do {
+			*op++ = *ii++;
+		} while (--t > 0);
+	}
+
+	*op++ = M4_MARKER | 1;
+	*op++ = 0;
+	*op++ = 0;
+
+	*out_len = op - out;
+	return LZO_E_OK;
+}
+EXPORT_SYMBOL_GPL(lzo1x_1_compress);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZO1X-1 Compressor");
+
diff --git a/lib/lzo/lzo1x_decompress.c b/lib/lzo/lzo1x_decompress.c
new file mode 100644
index 0000000..9dc7056
--- /dev/null
+++ b/lib/lzo/lzo1x_decompress.c
@@ -0,0 +1,254 @@
+/*
+ *  LZO1X Decompressor from MiniLZO
+ *
+ *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ *  The full LZO package can be found at:
+ *  http://www.oberhumer.com/opensource/lzo/
+ *
+ *  Changed for kernel use by:
+ *  Nitin Gupta <nitingupta910@gmail.com>
+ *  Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lzo.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include "lzodefs.h"
+
+#define HAVE_IP(x, ip_end, ip) ((size_t)(ip_end - ip) < (x))
+#define HAVE_OP(x, op_end, op) ((size_t)(op_end - op) < (x))
+#define HAVE_LB(m_pos, out, op) (m_pos < out || m_pos >= op)
+
+#define COPY4(dst, src)	\
+		put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst))
+
+int lzo1x_decompress_safe(const unsigned char *in, size_t in_len,
+			unsigned char *out, size_t *out_len)
+{
+	const unsigned char * const ip_end = in + in_len;
+	unsigned char * const op_end = out + *out_len;
+	const unsigned char *ip = in, *m_pos;
+	unsigned char *op = out;
+	size_t t;
+
+	*out_len = 0;
+
+	if (*ip > 17) {
+		t = *ip++ - 17;
+		if (t < 4)
+			goto match_next;
+		if (HAVE_OP(t, op_end, op))
+			goto output_overrun;
+		if (HAVE_IP(t + 1, ip_end, ip))
+			goto input_overrun;
+		do {
+			*op++ = *ip++;
+		} while (--t > 0);
+		goto first_literal_run;
+	}
+
+	while ((ip < ip_end)) {
+		t = *ip++;
+		if (t >= 16)
+			goto match;
+		if (t == 0) {
+			if (HAVE_IP(1, ip_end, ip))
+				goto input_overrun;
+			while (*ip == 0) {
+				t += 255;
+				ip++;
+				if (HAVE_IP(1, ip_end, ip))
+					goto input_overrun;
+			}
+			t += 15 + *ip++;
+		}
+		if (HAVE_OP(t + 3, op_end, op))
+			goto output_overrun;
+		if (HAVE_IP(t + 4, ip_end, ip))
+			goto input_overrun;
+
+		COPY4(op, ip);
+		op += 4;
+		ip += 4;
+		if (--t > 0) {
+			if (t >= 4) {
+				do {
+					COPY4(op, ip);
+					op += 4;
+					ip += 4;
+					t -= 4;
+				} while (t >= 4);
+				if (t > 0) {
+					do {
+						*op++ = *ip++;
+					} while (--t > 0);
+				}
+			} else {
+				do {
+					*op++ = *ip++;
+				} while (--t > 0);
+			}
+		}
+
+first_literal_run:
+		t = *ip++;
+		if (t >= 16)
+			goto match;
+		m_pos = op - (1 + M2_MAX_OFFSET);
+		m_pos -= t >> 2;
+		m_pos -= *ip++ << 2;
+
+		if (HAVE_LB(m_pos, out, op))
+			goto lookbehind_overrun;
+
+		if (HAVE_OP(3, op_end, op))
+			goto output_overrun;
+		*op++ = *m_pos++;
+		*op++ = *m_pos++;
+		*op++ = *m_pos;
+
+		goto match_done;
+
+		do {
+match:
+			if (t >= 64) {
+				m_pos = op - 1;
+				m_pos -= (t >> 2) & 7;
+				m_pos -= *ip++ << 3;
+				t = (t >> 5) - 1;
+				if (HAVE_LB(m_pos, out, op))
+					goto lookbehind_overrun;
+				if (HAVE_OP(t + 3 - 1, op_end, op))
+					goto output_overrun;
+				goto copy_match;
+			} else if (t >= 32) {
+				t &= 31;
+				if (t == 0) {
+					if (HAVE_IP(1, ip_end, ip))
+						goto input_overrun;
+					while (*ip == 0) {
+						t += 255;
+						ip++;
+						if (HAVE_IP(1, ip_end, ip))
+							goto input_overrun;
+					}
+					t += 31 + *ip++;
+				}
+				m_pos = op - 1;
+				m_pos -= le16_to_cpu(get_unaligned(
+					(const unsigned short *)ip)) >> 2;
+				ip += 2;
+			} else if (t >= 16) {
+				m_pos = op;
+				m_pos -= (t & 8) << 11;
+
+				t &= 7;
+				if (t == 0) {
+					if (HAVE_IP(1, ip_end, ip))
+						goto input_overrun;
+					while (*ip == 0) {
+						t += 255;
+						ip++;
+						if (HAVE_IP(1, ip_end, ip))
+							goto input_overrun;
+					}
+					t += 7 + *ip++;
+				}
+				m_pos -= le16_to_cpu(get_unaligned(
+					(const unsigned short *)ip) >> 2);
+				ip += 2;
+				if (m_pos == op)
+					goto eof_found;
+				m_pos -= 0x4000;
+			} else {
+				m_pos = op - 1;
+				m_pos -= t >> 2;
+				m_pos -= *ip++ << 2;
+
+				if (HAVE_LB(m_pos, out, op))
+					goto lookbehind_overrun;
+				if (HAVE_OP(2, op_end, op))
+					goto output_overrun;
+
+				*op++ = *m_pos++;
+				*op++ = *m_pos;
+				goto match_done;
+			}
+
+			if (HAVE_LB(m_pos, out, op))
+				goto lookbehind_overrun;
+			if (HAVE_OP(t + 3 - 1, op_end, op))
+				goto output_overrun;
+
+			if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
+				COPY4(op, m_pos);
+				op += 4;
+				m_pos += 4;
+				t -= 4 - (3 - 1);
+				do {
+					COPY4(op, m_pos);
+					op += 4;
+					m_pos += 4;
+					t -= 4;
+				} while (t >= 4);
+				if (t > 0)
+					do {
+						*op++ = *m_pos++;
+					} while (--t > 0);
+			} else {
+copy_match:
+				*op++ = *m_pos++;
+				*op++ = *m_pos++;
+				do {
+					*op++ = *m_pos++;
+				} while (--t > 0);
+			}
+match_done:
+			t = ip[-2] & 3;
+			if (t == 0)
+				break;
+match_next:
+			if (HAVE_OP(t, op_end, op))
+				goto output_overrun;
+			if (HAVE_IP(t + 1, ip_end, ip))
+				goto input_overrun;
+
+			*op++ = *ip++;
+			if (t > 1) {
+				*op++ = *ip++;
+				if (t > 2)
+					*op++ = *ip++;
+			}
+
+			t = *ip++;
+		} while (ip < ip_end);
+	}
+
+	*out_len = op - out;
+	return LZO_E_EOF_NOT_FOUND;
+
+eof_found:
+	*out_len = op - out;
+	return (ip == ip_end ? LZO_E_OK :
+		(ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
+input_overrun:
+	*out_len = op - out;
+	return LZO_E_INPUT_OVERRUN;
+
+output_overrun:
+	*out_len = op - out;
+	return LZO_E_OUTPUT_OVERRUN;
+
+lookbehind_overrun:
+	*out_len = op - out;
+	return LZO_E_LOOKBEHIND_OVERRUN;
+}
+
+EXPORT_SYMBOL_GPL(lzo1x_decompress_safe);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZO1X Decompressor");
+
diff --git a/lib/lzo/lzodefs.h b/lib/lzo/lzodefs.h
new file mode 100644
index 0000000..b6d482c
--- /dev/null
+++ b/lib/lzo/lzodefs.h
@@ -0,0 +1,43 @@
+/*
+ *  lzodefs.h -- architecture, OS and compiler specific defines
+ *
+ *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ *  The full LZO package can be found at:
+ *  http://www.oberhumer.com/opensource/lzo/
+ *
+ *  Changed for kernel use by:
+ *  Nitin Gupta <nitingupta910@gmail.com>
+ *  Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define LZO_VERSION		0x2020
+#define LZO_VERSION_STRING	"2.02"
+#define LZO_VERSION_DATE	"Oct 17 2005"
+
+#define M1_MAX_OFFSET	0x0400
+#define M2_MAX_OFFSET	0x0800
+#define M3_MAX_OFFSET	0x4000
+#define M4_MAX_OFFSET	0xbfff
+
+#define M1_MIN_LEN	2
+#define M1_MAX_LEN	2
+#define M2_MIN_LEN	3
+#define M2_MAX_LEN	8
+#define M3_MIN_LEN	3
+#define M3_MAX_LEN	33
+#define M4_MIN_LEN	3
+#define M4_MAX_LEN	9
+
+#define M1_MARKER	0
+#define M2_MARKER	64
+#define M3_MARKER	32
+#define M4_MARKER	16
+
+#define D_BITS		14
+#define D_MASK		((1u << D_BITS) - 1)
+#define D_HIGH		((D_MASK >> 1) + 1)
+
+#define DX2(p, s1, s2)	(((((size_t)((p)[2]) << (s2)) ^ (p)[1]) \
+							<< (s1)) ^ (p)[0])
+#define DX3(p, s1, s2, s3)	((DX2((p)+1, s2, s3) << (s1)) ^ (p)[0])