sh: Provide sdivsi3/udivsi3/udivdi3 for sh64, kill off libgcc linking.

This moves in the necessary libgcc bits and kills off the libgcc linking
for sh64 kernels as well.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index d56889e..c59098d 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -177,10 +177,8 @@
 KBUILD_CPPFLAGS		+= $(cflags-y)
 KBUILD_AFLAGS		+= $(cflags-y)
 
-LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
-
 libs-$(CONFIG_SUPERH32)		:= arch/sh/lib/	$(libs-y)
-libs-$(CONFIG_SUPERH64)		:= arch/sh/lib64/ $(libs-y) $(LIBGCC)
+libs-$(CONFIG_SUPERH64)		:= arch/sh/lib64/ $(libs-y)
 
 PHONY += maketools FORCE
 
diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c
index ab7adaa..0d74d6b 100644
--- a/arch/sh/kernel/sh_ksyms_64.c
+++ b/arch/sh/kernel/sh_ksyms_64.c
@@ -76,7 +76,5 @@
 #define DECLARE_EXPORT(name) extern void name(void);EXPORT_SYMBOL(name)
 
 DECLARE_EXPORT(__sdivsi3);
-DECLARE_EXPORT(__sdivsi3_2);
-DECLARE_EXPORT(__muldi3);
 DECLARE_EXPORT(__udivsi3);
 DECLARE_EXPORT(__div_table);
diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
index 1d932e7..4bacb9e 100644
--- a/arch/sh/lib64/Makefile
+++ b/arch/sh/lib64/Makefile
@@ -12,3 +12,6 @@
 # Panic should really be compiled as PIC
 lib-y  := udelay.o c-checksum.o dbg.o panic.o memcpy.o memset.o \
 	  copy_user_memcpy.o copy_page.o clear_page.o strcpy.o strlen.o
+
+# Integer division helpers extracted from libgcc (which is no longer linked)
+lib-y	+= udivsi3.o udivdi3.o sdivsi3.o
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
new file mode 100644
index 0000000..6a800c6
--- /dev/null
+++ b/arch/sh/lib64/sdivsi3.S
@@ -0,0 +1,131 @@
+	.global	__sdivsi3
+	.section	.text..SHmedia32,"ax"
+	.align	2
+
+	/* inputs: r4 (dividend), r5 (divisor) */
+	/* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
+	/* result in r0; straight-line code, no check for a zero divisor */
+__sdivsi3:
+	ptb __div_table,tr0  /* tr0 = __div_table address, read back by gettr below */
+
+	nsb r5, r1           /* r1 = normalization shift count for the divisor */
+	shlld r5, r1, r25    /* normalize; [-2 ..1, 1..2) in s2.62 */
+	shari r25, 58, r21   /* extract 5(6) bit index (s2.4 with hole -1..1) */
+	/* bubble */
+	gettr tr0,r20        /* r20 = table base used by both ldx loads */
+	ldx.ub r20, r21, r19 /* u0.8 */
+	shari r25, 32, r25   /* normalize to s2.30 */
+	shlli r21, 1, r21    /* index * 2 = byte offset of the 16-bit constant */
+	muls.l r25, r19, r19 /* s2.38 */
+	ldx.w r20, r21, r21  /* s2.14 */
+	ptabs r18, tr0       /* tr0 = return target from r18; r18 is scratch from here on */
+	shari r19, 24, r19   /* truncate to s2.14 */
+	sub r21, r19, r19    /* some 11 bit inverse in s1.14 */
+	muls.l r19, r19, r21 /* u0.28 */
+	sub r63, r1, r1      /* r63 reads as zero, so r1 = -r1 */
+	addi r1, 92, r1      /* r1 = 92 - shift count: used by the final shard */
+	muls.l r25, r21, r18 /* s2.58 */
+	shlli r19, 45, r19   /* multiply by two and convert to s2.58 */
+	/* bubble */
+	sub r19, r18, r18
+	shari r18, 28, r18   /* some 22 bit inverse in s1.30 */
+	muls.l r18, r25, r0  /* s2.60 */
+	muls.l r18, r4, r25 /* s32.30 */
+	/* bubble */
+	shari r0, 16, r19   /* s-16.44 */
+	muls.l r19, r18, r19 /* s-16.74 */
+	shari r25, 63, r0   /* r0 = 0 or -1 (sign of the product in r25) */
+	shari r4, 14, r18   /* s19.-14 */
+	shari r19, 30, r19   /* s-16.44 */
+	muls.l r19, r18, r19 /* s15.30 */
+	xor r21, r0, r21    /* You could also use the constant 1 << 27. */
+	add r21, r25, r21
+	sub r21, r19, r21
+	shard r21, r1, r21  /* arithmetic shift by the count prepared in r1 */
+	sub r21, r0, r0     /* r0 = r21 - r0: the value handed back to the caller */
+	blink tr0, r63      /* return through tr0; link value discarded into r63 */
+	
+/* This table has been generated by divtab.c.
+Defects for bias -330:
+   Max defect: 6.081536e-07 at -1.000000e+00
+   Min defect: 2.849516e-08 at 1.030651e+00
+   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+   Defect at 1: 1.238659e-07
+   Defect at -2: 1.061708e-07 */
+
+	.balign 2	/* alignment for the 16-bit .word entries */
+	.type	__div_table,@object
+	.size	__div_table,128	/* 2 * (16*2 + 16 + 16) bytes; the label sits at the midpoint */
+/* negative division constants: 16-bit entries at __div_table-64 .. -34 */
+	.word	-16638
+	.word	-17135
+	.word	-17737
+	.word	-18433
+	.word	-19103
+	.word	-19751
+	.word	-20583
+	.word	-21383
+	.word	-22343
+	.word	-23353
+	.word	-24407
+	.word	-25582
+	.word	-26863
+	.word	-28382
+	.word	-29965
+	.word	-31800
+/* negative division factors: byte entries at __div_table-32 .. -17 */
+	.byte	66
+	.byte	70
+	.byte	75
+	.byte	81
+	.byte	87
+	.byte	93
+	.byte	101
+	.byte	109
+	.byte	119
+	.byte	130
+	.byte	142
+	.byte	156
+	.byte	172
+	.byte	192
+	.byte	214
+	.byte	241
+	.skip 16	/* no data for offsets -16 .. -1 (the index "hole" noted in __sdivsi3) */
+	.global	__div_table
+__div_table:	/* offset 0: __sdivsi3 indexes the table with signed offsets from here */
+	.skip 16	/* no data for offsets 0 .. 15 (the index "hole" noted in __sdivsi3) */
+/* positive division factors: byte entries at __div_table+16 .. +31 */
+	.byte	241
+	.byte	214
+	.byte	192
+	.byte	172
+	.byte	156
+	.byte	142
+	.byte	130
+	.byte	119
+	.byte	109
+	.byte	101
+	.byte	93
+	.byte	87
+	.byte	81
+	.byte	75
+	.byte	70
+	.byte	66
+/* positive division constants: 16-bit entries at __div_table+32 .. +62 */
+	.word	31801
+	.word	29966
+	.word	28383
+	.word	26864
+	.word	25583
+	.word	24408
+	.word	23354
+	.word	22344
+	.word	21384
+	.word	20584
+	.word	19752
+	.word	19104
+	.word	18434
+	.word	17738
+	.word	17136
+	.word	16639
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S
new file mode 100644
index 0000000..6895c02
--- /dev/null
+++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	__udivdi3	/* clobbers r0,r1,r4-r9,r19-r22,r25,tr0 */
+__udivdi3:	/* in: r2 = dividend, r3 = divisor; out: r2 = quotient; returns via r18 */
+	shlri r3,1,r4
+	nsb r4,r22	/* r22 = normalization shift count, taken from divisor >> 1 */
+	shlld r3,r22,r6	/* r6 = divisor shifted left by r22 */
+	shlri r6,49,r5
+	movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+	sub r21,r5,r1
+	mmulfx.w r1,r1,r4
+	mshflo.w r1,r63,r1
+	sub r63,r22,r20 // r63 == 64 % 64
+	mmulfx.w r5,r4,r4
+	pta large_divisor,tr0
+	addi r20,32,r9	/* r9 = 32 - r22; r9 > 0 selects the large_divisor path */
+	msub.w r1,r4,r1
+	madd.w r1,r1,r1
+	mmulfx.w r1,r1,r4
+	shlri r6,32,r7	/* r7 = upper 32 bits of the shifted divisor */
+	bgt/u r9,r63,tr0 // large_divisor
+	mmulfx.w r5,r4,r4
+	shlri r2,32+14,r19
+	addi r22,-31,r0
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r19,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	mulu.l r5,r3,r8
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	shlld r8,r0,r8
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r2,r8,r2
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+
+	shlri r2,22,r21
+	mulu.l r21,r1,r21
+	shlld r5,r0,r8	/* r8 accumulates the quotient from here on */
+	addi r20,30-22,r0
+	shlrd r21,r0,r21
+	mulu.l r21,r3,r5
+	add r8,r21,r8
+	mcmpgt.l r21,r63,r21 // See Note 1
+	addi r20,30,r0
+	mshfhi.l r63,r21,r21
+	sub r2,r5,r2
+	andc r2,r21,r2
+
+	/* small divisor: need a third divide step */
+	mulu.l r2,r1,r7
+	ptabs r18,tr0	/* tr0 = return target taken from r18 */
+	addi r2,1,r2
+	shlrd r7,r0,r7
+	mulu.l r7,r3,r5
+	add r8,r7,r8
+	sub r2,r3,r2
+	cmpgt r2,r5,r5	/* r5 = 1 if one more correction is needed, else 0 */
+	add r8,r5,r2	/* r2 = accumulated quotient + final correction */
+	/* could test r3 here to check for divide by zero.  */
+	blink tr0,r63	/* return; link value discarded into r63 */
+
+large_divisor:	/* taken when r9 = 32 - r22 > 0 */
+	mmulfx.w r5,r4,r4
+	shlrd r2,r9,r25	/* r25 = dividend shifted right by r9 */
+	shlri r25,32,r8
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r8,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	shlri r5,14-1,r8
+	mulu.l r8,r7,r5
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r25,r5,r25
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+
+	shlri r25,22,r21
+	mulu.l r21,r1,r21
+	pta no_lo_adj,tr0
+	addi r22,32,r0
+	shlri r21,40,r21
+	mulu.l r21,r7,r5
+	add r8,r21,r8
+	shlld r2,r0,r2
+	sub r25,r5,r25
+	bgtu/u r7,r25,tr0 // no_lo_adj
+	addi r8,1,r8	/* rest >= r7: bump the quotient and reduce the rest */
+	sub r25,r7,r25
+no_lo_adj:
+	mextr4 r2,r25,r2
+
+	/* large_divisor: only needs a few adjustments.  */
+	mulu.l r8,r6,r5
+	ptabs r18,tr0	/* tr0 = return target taken from r18 */
+	/* bubble */
+	cmpgtu r5,r2,r5	/* r5 = 1 if the estimate in r8 is one too large, else 0 */
+	sub r8,r5,r2	/* r2 = quotient estimate minus correction */
+	blink tr0,r63	/* return; link value discarded into r63 */
+	
+/* Note 1: Shifting the result of the second divide stage so that it
+   always fits into 32 bits, while still reducing the rest sufficiently,
+   would require a lot of instructions to get the shifts just right.  Using
+   the full 64 bit shift result to multiply with the divisor would require
+   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+   Fortunately, if the upper 32 bits of the shift result are nonzero, we
+   know that the rest after taking this partial result into account will
+   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
+   upper 32 bits of the partial result are nonzero.  */
diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S
new file mode 100644
index 0000000..e68120e
--- /dev/null
+++ b/arch/sh/lib64/udivsi3.S
@@ -0,0 +1,59 @@
+	.global	__udivsi3
+	.section	.text..SHmedia32,"ax"
+	.align	2
+
+/*
+   inputs: r4 (dividend), r5 (divisor; only its low 32 bits are read)
+   clobbered: r18,r19,r20,r21,r22,r25,tr0
+   result in r0 (quotient); no check for a zero divisor.
+ */
+__udivsi3:
+	addz.l r5,r63,r22	/* r22 = divisor, zero-extended from 32 bits */
+	nsb r22,r0	/* r0 = normalization shift count */
+	shlld r22,r0,r25
+	shlri r25,48,r25
+	movi 0xffffffffffffbb0c,r20 /* shift count equiv 76 */
+	sub r20,r25,r21
+	mmulfx.w r21,r21,r19
+	mshflo.w r21,r63,r21
+	ptabs r18,tr0	/* tr0 = return target from r18; r18 is scratch from here on */
+	mmulfx.w r25,r19,r19
+	sub r20,r0,r0	/* r0 = r20 - normalization shift: shlrd count used below */
+	/* bubble */
+	msub.w r21,r19,r19
+
+	/*
+	 * It would be nice for scheduling to do this add to r21 before
+	 * the msub.w, but we need a different value for r19 to keep
+	 * errors under control.
+	 */
+	addi r19,-2,r21
+	mulu.l r4,r21,r18
+	mmulfx.w r19,r19,r19
+	shlli r21,15,r21
+	shlrd r18,r0,r18	/* r18 = first quotient estimate */
+	mulu.l r18,r22,r20
+	mmacnfx.wl r25,r19,r21
+	/* bubble */
+	sub r4,r20,r25	/* r25 = dividend - estimate * divisor (the rest) */
+
+	mulu.l r25,r21,r19
+	addi r0,14,r0
+	/* bubble */
+	shlrd r19,r0,r19
+	mulu.l r19,r22,r20
+	add r18,r19,r18	/* fold the second-step estimate into the quotient */
+	/* bubble */
+	sub.l r25,r20,r25
+
+	mulu.l r25,r21,r19
+	addz.l r25,r63,r25
+	sub r25,r22,r25
+	shlrd r19,r0,r19
+	mulu.l r19,r22,r20
+	addi r25,1,r25
+	add r18,r19,r18	/* fold the third-step estimate into the quotient */
+
+	cmpgt r25,r20,r25	/* r25 = 1 if one more correction is needed, else 0 */
+	add.l r18,r25,r0	/* r0 = accumulated quotient + final correction */
+	blink tr0,r63	/* return; link value discarded into r63 */