Blame - arch/sh/lib/memcpy-sh4.S - kernel/msm-4.9

blob: 459fa92a7c5311b02c1e1f21c74536eb27432ed2 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* "memcpy" implementation of SuperH
				3	*
				4	* Copyright (C) 1999 Niibe Yutaka
				5	* Copyright (c) 2002 STMicroelectronics Ltd
				6	* Modified from memcpy.S and micro-optimised for SH4
				7	* Stuart Menefy (stuart.menefy@st.com)
				8	*
				9	*/
				10	#include <linux/linkage.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11
				12	/*
				13	* void *memcpy(void *dst, const void *src, size_t n);
				14	*
				15	* It is assumed that there is no overlap between src and dst.
				16	* If there is an overlap, then the results are undefined.
				17	*/
				18
				19	!
				20	! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
				21	!
				22
				23	! Size is 16 or greater, and may have trailing bytes
				24
				25	.balign 32
				26	.Lcase1:
				27	! Read a long word and write a long word at once
				28	! At the start of each iteration, r7 contains last long load
				29	add #-1,r5 ! 79 EX
				30	mov r4,r2 ! 5 MT (0 cycles latency)
				31	! r7 = source long word at r0+r5; r5 then steps back 4 for the loads inside the loop
				32	mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
				33	add #-4,r5 ! 50 EX
				34	! r2 = r4 + 7: the bound tested by cmp/hi in the long-word loop
				35	add #7,r2 ! 79 EX
				36	!
				37	#ifdef CONFIG_CPU_LITTLE_ENDIAN
				38	! 6 cycles, 4 bytes per iteration
				39	3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
				40	mov r7, r3 ! 5 MT (latency=0) ! RQPO
				41
				42	cmp/hi r2,r0 ! 57 MT
				43	shll16 r3 ! 103 EX
				44
				45	mov r1,r6 ! 5 MT (latency=0)
				46	shll8 r3 ! 102 EX ! Oxxx
				47
				48	shlr8 r6 ! 106 EX ! xNML
				49	mov r1, r7 ! 5 MT (latency=0)
				50
				51	or r6,r3 ! 82 EX ! ONML
				52	bt/s 3b ! 109 BR
				53	! delay slot of the bt/s above: the store executes whether or not the branch is taken
				54	mov.l r3,@-r0 ! 30 LS
				55	#else
				56	3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
				57	mov r7,r3 ! 5 MT (latency=0) ! OPQR
				58
				59	cmp/hi r2,r0 ! 57 MT
				60	shlr16 r3 ! 107 EX
				61
				62	shlr8 r3 ! 106 EX ! xxxO
				63	mov r1,r6 ! 5 MT (latency=0)
				64
				65	shll8 r6 ! 102 EX ! LMNx
				66	mov r1,r7 ! 5 MT (latency=0)
				67
				68	or r6,r3 ! 82 EX ! LMNO
				69	bt/s 3b ! 109 BR
				70	! delay slot of the bt/s above: the store executes whether or not the branch is taken
				71	mov.l r3,@-r0 ! 30 LS
				72	#endif
				73	! Finally, copy a byte at once, if necessary
				74	! r5 += 4 undoes the step taken before the loop; r2 becomes r4 + 1; nothing left if r0 == r4
				75	add #4,r5 ! 50 EX
				76	cmp/eq r4,r0 ! 54 MT
				77
				78	add #-6,r2 ! 50 EX
				79	bt 9f ! 109 BR
				80	! byte loop: T is set while r0 > r2 (unsigned), tested before the store pre-decrements r0
				81	8: cmp/hi r2,r0 ! 57 MT
				82	mov.b @(r0,r5),r1 ! 20 LS (latency=2)
				83
				84	bt/s 8b ! 109 BR
				85	! delay slot of the bt/s above
				86	mov.b r1,@-r0 ! 29 LS
				87	! both paths reach the return with r0 == r4
				88	9: rts
				89	nop
				90
				91
				92	!
				93	! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
				94	!
				95
				96	! Size is 16 or greater, and may have trailing bytes
				97
				98	.balign 32
				99	.Lcase3:
				100	! Read a long word and write a long word at once
				101	! At the start of each iteration, r7 contains last long load
				102	add #-3,r5 ! 79 EX
				103	mov r4,r2 ! 5 MT (0 cycles latency)
				104	! r7 = source long word at r0+r5; r5 then steps back 4 for the loads inside the loop
				105	mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
				106	add #-4,r5 ! 50 EX
				107	! r2 = r4 + 7: the bound tested by cmp/hi in the long-word loop
				108	add #7,r2 ! 79 EX
				109	!
				110	#ifdef CONFIG_CPU_LITTLE_ENDIAN
				111	! 6 cycles, 4 bytes per iteration
				112	3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
				113	mov r7, r3 ! 5 MT (latency=0) ! RQPO
				114
				115	cmp/hi r2,r0 ! 57 MT
				116	shll8 r3 ! 102 EX ! QPOx
				117
				118	mov r1,r6 ! 5 MT (latency=0)
				119	shlr16 r6 ! 107 EX
				120
				121	shlr8 r6 ! 106 EX ! xxxN
				122	mov r1, r7 ! 5 MT (latency=0)
				123
				124	or r6,r3 ! 82 EX ! QPON
				125	bt/s 3b ! 109 BR
				126	! delay slot of the bt/s above: the store executes whether or not the branch is taken
				127	mov.l r3,@-r0 ! 30 LS
				128	#else
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	129	3: mov r7,r3 ! OPQR
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	130	shlr8 r3 ! xOPQ
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	131	mov.l @(r0,r5),r7 ! KLMN
				132	mov r7,r6
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	133	shll16 r6
				134	shll8 r6 ! Nxxx
				135	or r6,r3 ! NOPQ
				136	cmp/hi r2,r0
				137	bt/s 3b
				138	mov.l r3,@-r0
				139	#endif
				140
				141	! Finally, copy a byte at once, if necessary
				142	! r5 += 6: together with the -3 and -4 above this leaves r5 one below its value on entry
				143	add #6,r5 ! 50 EX
				144	cmp/eq r4,r0 ! 54 MT
				145
				146	add #-6,r2 ! 50 EX
				147	bt 9f ! 109 BR
				148	! byte loop: T is set while r0 > r2 (unsigned), tested before the store pre-decrements r0
				149	8: cmp/hi r2,r0 ! 57 MT
				150	mov.b @(r0,r5),r1 ! 20 LS (latency=2)
				151
				152	bt/s 8b ! 109 BR
				153	! delay slot of the bt/s above
				154	mov.b r1,@-r0 ! 29 LS
				155	! both paths reach the return with r0 == r4
				156	9: rts
				157	nop
				158
				159	ENTRY(memcpy)
				160	! In: r4 = dst, r5 = src, r6 = n (see the prototype comment near the top of the file)
				161	! Calculate the invariants which will be used in the remainder
				162	! of the code:
				163	!
				164	! r4 --> [ ... ] DST [ ... ] SRC
				165	! [ ... ] [ ... ]
				166	! : :
				167	! r0 --> [ ... ] r0+r5 --> [ ... ]
				168	!
				169	!
				170
				171	! Short circuit the common case of src, dst and len being 32 bit aligned
				172	! and test for zero length move
				173	! r0 = r4 | r5 | r6, so a single tst #3 checks all three for long-word alignment
				174	mov r6, r0 ! 5 MT (0 cycle latency)
				175	or r4, r0 ! 82 EX
				176
				177	or r5, r0 ! 82 EX
				178	tst r6, r6 ! 86 MT
				179	! the tst #3 below sits in the bt/s delay slot; its T result is consumed by bt/s .Lcase00
				180	bt/s 99f ! 111 BR (zero len)
				181	tst #3, r0 ! 87 MT
				182	! r0 = r4 + r6, one past the end of the destination; the copy works downwards from here
				183	mov r4, r0 ! 5 MT (0 cycle latency)
				184	add r6, r0 ! 49 EX
				185
				186	mov #16, r1 ! 6 EX
				187	bt/s .Lcase00 ! 111 BR (aligned)
				188	! delay slot: r5 = r5 - r4, so that r0 + r5 tracks the source address
				189	sub r4, r5 ! 75 EX
				190
				191	! Arguments are not nicely long word aligned or zero len.
				192	! Check for small copies, and if so do a simple byte at a time copy.
				193	!
				194	! Deciding on an exact value of 'small' is not easy, as the point at which
				195	! using the optimised routines becomes worthwhile varies (these are the
				196	! cycle counts for different sizes using byte-at-a-time vs. optimised):
				197	! size byte-at-time long word byte
				198	! 16 42 39-40 46-50 50-55
				199	! 24 58 43-44 54-58 62-67
				200	! 36 82 49-50 66-70 80-85
				201	! However the penalty for getting it 'wrong' is much higher for long word
				202	! aligned data (and this is more common), so use a value of 16.
				203
				204	cmp/gt r6,r1 ! 56 MT
				205	! r5 is biased by -1 so that @(r0,r5) pairs with the pre-decrement byte store @-r0
				206	add #-1,r5 ! 50 EX
				207	bf/s 6f ! 108 BR (not small)
				208
				209	mov r5, r3 ! 5 MT (latency=0)
				210	shlr r6 ! 104 EX
				211	! the shlr above left bit 0 of the length in T: an even length enters the pair loop at 4
				212	mov.b @(r0,r5),r1 ! 20 LS (latency=2)
				213	bf/s 4f ! 111 BR
				214
				215	add #-1,r3 ! 50 EX
				216	tst r6, r6 ! 86 MT
				217	! odd length: store the byte already loaded; r6 == 0 means it was the only byte
				218	bt/s 98f ! 110 BR
				219	mov.b r1,@-r0 ! 29 LS
				220
				221	! 4 cycles, 2 bytes per iteration
				222	3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
				223
				224	4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
				225	dt r6 ! 67 EX
				226
				227	mov.b r1,@-r0 ! 29 LS
				228	bf/s 3b ! 111 BR
				229
				230	mov.b r2,@-r0 ! 29 LS
				231	98:
				232	rts
				233	nop
				234	! zero-length exit: r0 = r4 is set in the rts delay slot
				235	99: rts
				236	mov r4, r0
				237
				238	! Size is not small, so it's worthwhile looking for optimisations.
				239	! First align destination to a long word boundary.
				240	!
				241	! r5 = normal value -1
				242	! r3 = r0 & 3 (computed in the bt/s delay slot) = bytes to copy until r0 is long-word aligned
				243	6: tst #3, r0 ! 87 MT
				244	mov #3, r3 ! 6 EX
				245
				246	bt/s 2f ! 111 BR
				247	and r0,r3 ! 78 EX
				248
				249	! 3 cycles, 1 byte per iteration
				250	1: dt r3 ! 67 EX
				251	mov.b @(r0,r5),r1 ! 19 LS (latency=2)
				252
				253	add #-1, r6 ! 79 EX
				254	bf/s 1b ! 109 BR
				255
				256	mov.b r1,@-r0 ! 28 LS
				257	! remove the -1 bias from r5
				258	2: add #1, r5 ! 79 EX
				259
				260	! Now select the appropriate bulk transfer code based on relative
				261	! alignment of src and dst.
				262	! r0 is parked in r3 because tst #imm operates on r0 only; it is restored in the branch delay slots
				263	mov r0, r3 ! 5 MT (latency=0)
				264
				265	mov r5, r0 ! 5 MT (latency=0)
				266	tst #1, r0 ! 87 MT
				267
				268	bf/s 1f ! 111 BR
				269	mov #64, r7 ! 6 EX
				270
				271	! bit 0 clear
				272
				273	cmp/ge r7, r6 ! 55 MT
				274	! bt/s 2f branches on T from cmp/ge (r6 >= 64); the tst #2 in its delay slot feeds the next branch
				275	bt/s 2f ! 111 BR
				276	tst #2, r0 ! 87 MT
				277
				278	! small
				279	bt/s .Lcase0
				280	mov r3, r0
				281
				282	bra .Lcase2
				283	nop
				284
				285	! big
				286	2: bt/s .Lcase0b
				287	mov r3, r0
				288
				289	bra .Lcase2b
				290	nop
				291
				292	! bit 0 set
				293	1: tst #2, r0 ! 87 MT
				294
				295	bt/s .Lcase1
				296	mov r3, r0
				297
				298	bra .Lcase3
				299	nop
				300
				301
				302	!
				303	! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
				304	!
				305
				306	! src, dst and size are all long word aligned
				307	! size is non-zero
				308
				309	.balign 32
				310	.Lcase00:
				311	mov #64, r1 ! 6 EX
				312	mov r5, r3 ! 5 MT (latency=0)
				313	! T = (64 > r6); bf has no delay slot, so the shlr2 below only runs on the small path
				314	cmp/gt r6, r1 ! 56 MT
				315	add #-4, r5 ! 50 EX
				316
				317	bf .Lcase00b ! 108 BR (big loop)
				318	shlr2 r6 ! 105 EX
				319	! r6 is now the long-word count; shlr halves it and leaves the odd bit in T
				320	shlr r6 ! 104 EX
				321	mov.l @(r0, r5), r1 ! 21 LS (latency=2)
				322
				323	bf/s 4f ! 111 BR
				324	add #-8, r3 ! 50 EX
				325	! odd count: store the long word already loaded; r6 == 0 means it was the only one
				326	tst r6, r6 ! 86 MT
				327	bt/s 5f ! 110 BR
				328
				329	mov.l r1,@-r0 ! 30 LS
				330
				331	! 4 cycles, 2 long words per iteration
				332	3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
				333
				334	4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
				335	dt r6 ! 67 EX
				336
				337	mov.l r1, @-r0 ! 30 LS
				338	bf/s 3b ! 109 BR
				339
				340	mov.l r2, @-r0 ! 30 LS
				341
				342	5: rts
				343	nop
				344
				345
				346	! Size is 16 or greater and less than 64, but may have trailing bytes
				347
				348	.balign 32
				349	.Lcase0:
				350	add #-4, r5 ! 50 EX
				351	mov r4, r7 ! 5 MT (latency=0)
				352
				353	mov.l @(r0, r5), r1 ! 21 LS (latency=2)
				354	mov #4, r2 ! 6 EX
				355	! r7 = r4 + 11 is the loop bound; tst r2,r6 tests bit 2 of the remaining length
				356	add #11, r7 ! 50 EX
				357	tst r2, r6 ! 86 MT
				358
				359	mov r5, r3 ! 5 MT (latency=0)
				360	bt/s 4f ! 111 BR
				361	! delay slot: r3 = r5 - 4; on fall-through (bit 2 set) one long word is stored first
				362	add #-4, r3 ! 50 EX
				363	mov.l r1,@-r0 ! 30 LS
				364
				365	! 4 cycles, 2 long words per iteration
				366	3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
				367
				368	4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
				369	cmp/hi r7, r0
				370
				371	mov.l r1, @-r0 ! 30 LS
				372	bt/s 3b ! 109 BR
				373
				374	mov.l r2, @-r0 ! 30 LS
				375
				376	! Copy the final 0-3 bytes
				377	! r5 += 3 leaves r5 one below its value on entry, matching the pre-decrement byte store
				378	add #3,r5 ! 50 EX
				379
				380	cmp/eq r0, r4 ! 54 MT
				381	add #-10, r7 ! 50 EX
				382	! r7 is now r4 + 1, the bound for the byte loop
				383	bt 9f ! 110 BR
				384
				385	! 3 cycles, 1 byte per iteration
				386	1: mov.b @(r0,r5),r1 ! 19 LS
				387	cmp/hi r7,r0 ! 57 MT
				388
				389	bt/s 1b ! 111 BR
				390	mov.b r1,@-r0 ! 28 LS
				391
				392	9: rts
				393	nop
				394
				395	! Size is at least 64 bytes, so will be going round the big loop at least once.
				396	!
				397	! r2 = rounded up r4
				398	! r3 = rounded down r0
				399
				400	.balign 32
				401	.Lcase0b:
				402	add #-4, r5 ! 50 EX
				403	! .Lcase00 branches to the label below with r5 already reduced by 4
				404	.Lcase00b:
				405	mov r0, r3 ! 5 MT (latency=0)
				406	mov #(~0x1f), r1 ! 6 EX
				407
				408	and r1, r3 ! 78 EX
				409	mov r4, r2 ! 5 MT (latency=0)
				410
				411	cmp/eq r3, r0 ! 54 MT
				412	add #0x1f, r2 ! 50 EX
				413	! skip the lead-in if r0 is already a multiple of 32; the delay slot completes r2 = (r4 + 31) & ~31
				414	bt/s 1f ! 110 BR
				415	and r1, r2 ! 78 EX
				416
				417	! copy initial words until cache line aligned
				418
				419	mov.l @(r0, r5), r1 ! 21 LS (latency=2)
				420	tst #4, r0 ! 87 MT
				421
				422	mov r5, r6 ! 5 MT (latency=0)
				423	add #-4, r6 ! 50 EX
				424	! r3 becomes (r0 & ~31) + 8: the pair loop exits when r0 equals it before its two stores
				425	bt/s 4f ! 111 BR
				426	add #8, r3 ! 50 EX
				427	! bit 2 of r0 set: if bits 3-4 are clear, one store (delay slot) reaches the 32-byte boundary
				428	tst #0x18, r0 ! 87 MT
				429
				430	bt/s 1f ! 109 BR
				431	mov.l r1,@-r0 ! 30 LS
				432
				433	! 4 cycles, 2 long words per iteration
				434	3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
				435
				436	4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
				437	cmp/eq r3, r0 ! 54 MT
				438
				439	mov.l r1, @-r0 ! 30 LS
				440	bf/s 3b ! 109 BR
				441
				442	mov.l r7, @-r0 ! 30 LS
				443
				444	! Copy the cache line aligned blocks
				445	!
				446	! In use: r0, r2, r4, r5
				447	! Scratch: r1, r3, r6, r7
				448	!
				449	! We could do this with the four scratch registers, but if src
				450	! and dest hit the same cache line, this will thrash, so make
				451	! use of additional registers.
				452	!
				453	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
				454	! r5: src (was r0+r5)
				455	! r1: dest (was r0)
				456	! this can be reversed at the end, so we don't need to save any extra
				457	! state.
				458	!
				459	1: mov.l r8, @-r15 ! 30 LS
				460	add r0, r5 ! 49 EX
				461
				462	mov.l r9, @-r15 ! 30 LS
				463	mov r0, r1 ! 5 MT (latency=0)
				464
				465	mov.l r10, @-r15 ! 30 LS
				466	add #-0x1c, r5 ! 50 EX
				467
				468	mov.l r11, @-r15 ! 30 LS
				469
				470	! 16 cycles, 32 bytes per iteration
				471	2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
				472	add #-0x20, r1 ! 50 EX
				473	mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
				474	mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
				475	mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
				476	mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
				477	mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
				478	mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
				479	mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
				480	movca.l r0,@r1 ! 40 LS (latency=3-7)
				481	mov.l r3,@(0x04,r1) ! 33 LS
				482	mov.l r6,@(0x08,r1) ! 33 LS
				483	mov.l r7,@(0x0c,r1) ! 33 LS
				484
				485	mov.l r8,@(0x10,r1) ! 33 LS
				486	add #-0x20, r5 ! 50 EX
				487
				488	mov.l r9,@(0x14,r1) ! 33 LS
				489	cmp/eq r2,r1 ! 54 MT
				490
				491	mov.l r10,@(0x18,r1) ! 33 LS
				492	bf/s 2b ! 109 BR
				493
				494	mov.l r11,@(0x1c,r1) ! 33 LS
				495	! rebuild the invariant (r0 = dest, r5 = src - dest) while popping r11..r8
				496	mov r1, r0 ! 5 MT (latency=0)
				497
				498	mov.l @r15+, r11 ! 15 LS
				499	sub r1, r5 ! 75 EX
				500
				501	mov.l @r15+, r10 ! 15 LS
				502	cmp/eq r4, r0 ! 54 MT
				503
				504	bf/s 1f ! 109 BR
				505	mov.l @r15+, r9 ! 15 LS
				506	! the rts below has the r8 pop at label 1 as its delay slot, so both paths restore r8
				507	rts
				508	1: mov.l @r15+, r8 ! 15 LS
				509	sub r4, r1 ! 75 EX (len remaining)
				510
				511	! number of trailing bytes is non-zero
				512	!
				513	! invariants restored (r5 already decremented by 4)
				514	! also r1=num bytes remaining
				515
				516	mov #4, r2 ! 6 EX
				517	mov r4, r7 ! 5 MT (latency=0)
				518
				519	add #0x1c, r5 ! 50 EX (back to -4)
				520	cmp/hs r2, r1 ! 58 MT
				521	! fewer than 4 bytes left: go straight to the byte tail (delay slot sets r7 = r4 + 11)
				522	bf/s 5f ! 108 BR
				523	add #11, r7 ! 50 EX
				524
				525	mov.l @(r0, r5), r6 ! 21 LS (latency=2)
				526	tst r2, r1 ! 86 MT
				527
				528	mov r5, r3 ! 5 MT (latency=0)
				529	bt/s 4f ! 111 BR
				530
				531	add #-4, r3 ! 50 EX
				532	cmp/hs r2, r1 ! 58 MT
				533
				534	bt/s 5f ! 111 BR
				535	mov.l r6,@-r0 ! 30 LS
				536
				537	! 4 cycles, 2 long words per iteration
				538	3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
				539
				540	4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
				541	cmp/hi r7, r0
				542
				543	mov.l r6, @-r0 ! 30 LS
				544	bt/s 3b ! 109 BR
				545
				546	mov.l r2, @-r0 ! 30 LS
				547
				548	! Copy the final 0-3 bytes
				549
				550	5: cmp/eq r0, r4 ! 54 MT
				551	add #-10, r7 ! 50 EX
				552	! bt has no delay slot: the add #3,r5 below only runs when bytes remain
				553	bt 9f ! 110 BR
				554	add #3,r5 ! 50 EX
				555
				556	! 3 cycles, 1 byte per iteration
				557	1: mov.b @(r0,r5),r1 ! 19 LS
				558	cmp/hi r7,r0 ! 57 MT
				559
				560	bt/s 1b ! 111 BR
				561	mov.b r1,@-r0 ! 28 LS
				562
				563	9: rts
				564	nop
				565
				566	!
				567	! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
				568	!
				569
				570	.balign 32
				571	.Lcase2:
				572	! Size is 16 or greater and less than 64, but may have trailing bytes
				573	! r5 and r6 are the offsets for the two 16-bit loads per iteration; r2 = r4 + 7 bounds the loop
				574	2: mov r5, r6 ! 5 MT (latency=0)
				575	add #-2,r5 ! 50 EX
				576
				577	mov r4,r2 ! 5 MT (latency=0)
				578	add #-4,r6 ! 50 EX
				579
				580	add #7,r2 ! 50 EX
				581	3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
				582
				583	mov.w @(r0,r6),r3 ! 20 LS (latency=2)
				584	cmp/hi r2,r0 ! 57 MT
				585
				586	mov.w r1,@-r0 ! 29 LS
				587	bt/s 3b ! 111 BR
				588
				589	mov.w r3,@-r0 ! 29 LS
				590	! whatever is left is finished by the code at label 10 (which expects r5 = invariant - 2)
				591	bra 10f
				592	nop
				593
				594
				595	.balign 32
				596	.Lcase2b:
				597	! Size is at least 64 bytes, so will be going round the big loop at least once.
				598	!
				599	! r2 = rounded up r4
				600	! r3 = rounded down r0
				601
				602	mov r0, r3 ! 5 MT (latency=0)
				603	mov #(~0x1f), r1 ! 6 EX
				604
				605	and r1, r3 ! 78 EX
				606	mov r4, r2 ! 5 MT (latency=0)
				607
				608	cmp/eq r3, r0 ! 54 MT
				609	add #0x1f, r2 ! 50 EX
				610	! T from the cmp/eq above survives the add; bt/s skips the lead-in when r0 is already a multiple of 32
				611	add #-2, r5 ! 50 EX
				612	bt/s 1f ! 110 BR
				613	and r1, r2 ! 78 EX
				614
				615	! Copy a short word one at a time until we are cache line aligned
				616	! Normal values: r0, r2, r3, r4
				617	! Unused: r1, r6, r7
				618	! Mod: r5 (=r5-2)
				619	!
				620	add #2, r3 ! 50 EX
				621	! the loop ends when r0 == r3 at the compare, i.e. the final store leaves r0 on the 32-byte boundary
				622	2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
				623	cmp/eq r3,r0 ! 54 MT
				624
				625	bf/s 2b ! 111 BR
				626
				627	mov.w r1,@-r0 ! 29 LS
				628
				629	! Copy the cache line aligned blocks
				630	!
				631	! In use: r0, r2, r4, r5 (=r5-2)
				632	! Scratch: r1, r3, r6, r7
				633	!
				634	! We could do this with the four scratch registers, but if src
				635	! and dest hit the same cache line, this will thrash, so make
				636	! use of additional registers.
				637	!
				638	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
				639	! r5: src (was r0+r5)
				640	! r1: dest (was r0)
				641	! this can be reversed at the end, so we don't need to save any extra
				642	! state.
				643	!
				644	1: mov.l r8, @-r15 ! 30 LS
				645	add r0, r5 ! 49 EX
				646
				647	mov.l r9, @-r15 ! 30 LS
				648	mov r0, r1 ! 5 MT (latency=0)
				649
				650	mov.l r10, @-r15 ! 30 LS
				651	add #-0x1e, r5 ! 50 EX
				652
				653	mov.l r11, @-r15 ! 30 LS
				654	! r12 is saved as well: each iteration performs nine loads (into r0, r3 and r6-r12)
				655	mov.l r12, @-r15 ! 30 LS
				656
				657	! 17 cycles, 32 bytes per iteration
				658	#ifdef CONFIG_CPU_LITTLE_ENDIAN
				659	2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
				660	add #-0x20, r1 ! 50 EX
				661
				662	mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
				663
				664	mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
				665	shll16 r0 ! 103 EX JI..
				666
				667	mov.l @r5+, r7 ! 15 LS (latency=2)
				668	xtrct r3, r0 ! 48 EX LKJI
				669
				670	mov.l @r5+, r8 ! 15 LS (latency=2)
				671	xtrct r6, r3 ! 48 EX PONM
				672
				673	mov.l @r5+, r9 ! 15 LS (latency=2)
				674	xtrct r7, r6 ! 48 EX
				675
				676	mov.l @r5+, r10 ! 15 LS (latency=2)
				677	xtrct r8, r7 ! 48 EX
				678
				679	mov.l @r5+, r11 ! 15 LS (latency=2)
				680	xtrct r9, r8 ! 48 EX
				681
				682	mov.w @r5+, r12 ! 15 LS (latency=2)
				683	xtrct r10, r9 ! 48 EX
				684
				685	movca.l r0,@r1 ! 40 LS (latency=3-7)
				686	xtrct r11, r10 ! 48 EX
				687
				688	mov.l r3, @(0x04,r1) ! 33 LS
				689	xtrct r12, r11 ! 48 EX
				690
				691	mov.l r6, @(0x08,r1) ! 33 LS
				692
				693	mov.l r7, @(0x0c,r1) ! 33 LS
				694	! the post-increment loads advanced r5 by 0x20; -0x40 moves it to the next lower 32-byte block
				695	mov.l r8, @(0x10,r1) ! 33 LS
				696	add #-0x40, r5 ! 50 EX
				697
				698	mov.l r9, @(0x14,r1) ! 33 LS
				699	cmp/eq r2,r1 ! 54 MT
				700
				701	mov.l r10, @(0x18,r1) ! 33 LS
				702	bf/s 2b ! 109 BR
				703
				704	mov.l r11, @(0x1c,r1) ! 33 LS
				705	#else
				706	2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
				707	add #-2, r5 ! 50 EX
				708
				709	mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
				710	add #-4, r1 ! 50 EX
				711
				712	mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
				713	shll16 r0 ! 103 EX
				714
				715	mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
				716	xtrct r3, r0 ! 48 EX
				717
				718	mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
				719	xtrct r6, r3 ! 48 EX
				720
				721	mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
				722	xtrct r7, r6 ! 48 EX
				723
				724	mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
				725	xtrct r8, r7 ! 48 EX
				726
				727	mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
				728	xtrct r9, r8 ! 48 EX
				729
Nobuhiro Iwamatsu	c7afb7e	2006-09-27 17:50:03 +0900	[diff] [blame]	730	mov.l @(0x00,r5), r12 ! 18 LS (latency=2)
				731	xtrct r10, r9 ! 48 EX
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	732
				733	movca.l r0,@r1 ! 40 LS (latency=3-7)
				734	add #-0x1c, r1 ! 50 EX
				735
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	736	mov.l r3, @(0x18,r1) ! 33 LS
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	737	xtrct r11, r10 ! 48 EX
				738
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	739	mov.l r6, @(0x14,r1) ! 33 LS
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	740	xtrct r12, r11 ! 48 EX
				741
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	742	mov.l r7, @(0x10,r1) ! 33 LS
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	743
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	744	mov.l r8, @(0x0c,r1) ! 33 LS
				745	add #-0x1e, r5 ! 50 EX
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	746
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	747	mov.l r9, @(0x08,r1) ! 33 LS
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	748	cmp/eq r2,r1 ! 54 MT
				749
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	750	mov.l r10, @(0x04,r1) ! 33 LS
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	751	bf/s 2b ! 109 BR
				752
Hideo Saito	e08b954	2008-05-15 13:28:46 +0900	[diff] [blame]	753	mov.l r11, @(0x00,r1) ! 33 LS
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	754	#endif
				755	! pop r12..r8 in the reverse order of the pushes while rebuilding r0 = dest, r5 = src - dest
				756	mov.l @r15+, r12
				757	mov r1, r0 ! 5 MT (latency=0)
				758
				759	mov.l @r15+, r11 ! 15 LS
				760	sub r1, r5 ! 75 EX
				761
				762	mov.l @r15+, r10 ! 15 LS
				763	cmp/eq r4, r0 ! 54 MT
				764
				765	bf/s 1f ! 109 BR
				766	mov.l @r15+, r9 ! 15 LS
				767	! the rts below has the r8 pop at label 1 as its delay slot, so both paths restore r8
				768	rts
				769	1: mov.l @r15+, r8 ! 15 LS
				770
				771	add #0x1e, r5 ! 50 EX
				772
				773	! Finish off a short word at a time
				774	! r5 must be invariant - 2
				775	10: mov r4,r2 ! 5 MT (latency=0)
				776	add #1,r2 ! 50 EX
				777	! skip the word loop unless r0 > r4 + 1; the delay slot raises r2 to r4 + 3 for the loop test
				778	cmp/hi r2, r0 ! 57 MT
				779	bf/s 1f ! 109 BR
				780
				781	add #2, r2 ! 50 EX
				782
				783	3: mov.w @(r0,r5),r1 ! 20 LS
				784	cmp/hi r2,r0 ! 57 MT
				785
				786	bt/s 3b ! 109 BR
				787
				788	mov.w r1,@-r0 ! 29 LS
				789	1:
				790	! 9b below resolves to the nearest earlier '9:' label (an rts/nop pair); add #1,r5 is the bt/s delay slot
				791	!
				792	! Finally, copy the last byte if necessary
				793	cmp/eq r4,r0 ! 54 MT
				794	bt/s 9b
				795	add #1,r5
				796	mov.b @(r0,r5),r1
				797	rts
				798	mov.b r1,@-r0
				799