Blame - lib/openssl/crypto/rc4/asm/rc4-ia64.pl - kernel/lk

blob: 49cd5b5e6945a16fd3f67d343d028864174d06c8 [file] [log] [blame]

Kinson Chik	a8fa74c	2011-07-29 11:33:41 -0700	[diff] [blame^]	1	#!/usr/bin/env perl
				2	#
				3	# ====================================================================
				4	# Written by David Mosberger <David.Mosberger@acm.org> based on the
				5	# Itanium optimized Crypto code which was released by HP Labs at
				6	# http://www.hpl.hp.com/research/linux/crypto/.
				7	#
				8	# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
				9	#
				10	# Permission is hereby granted, free of charge, to any person obtaining
				11	# a copy of this software and associated documentation files (the
				12	# "Software"), to deal in the Software without restriction, including
				13	# without limitation the rights to use, copy, modify, merge, publish,
				14	# distribute, sublicense, and/or sell copies of the Software, and to
				15	# permit persons to whom the Software is furnished to do so, subject to
				16	# the following conditions:
				17	#
				18	# The above copyright notice and this permission notice shall be
				19	# included in all copies or substantial portions of the Software.
				20
				21	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				22	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				23	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				24	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
				25	# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
				26	# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
				27	# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
				28
				29
				30
				31	# This is a little helper program which generates a software-pipelined
				32	# loop for RC4 encryption. The basic algorithm looks like this:
				33	#
				34	# for (counter = 0; counter < len; ++counter)
				35	# {
				36	# in = inp[counter];
				37	# SI = S[I];
				38	# J = (SI + J) & 0xff;
				39	# SJ = S[J];
				40	# T = (SI + SJ) & 0xff;
				41	# S[I] = SJ, S[J] = SI;
				42	# ST = S[T];
				43	# outp[counter] = in ^ ST;
				44	# I = (I + 1) & 0xff;
				45	# }
				46	#
				47	# Pipelining this loop isn't easy, because the stores to the S[] array
				48	# need to be observed in the right order. The loop generated by the
				49	# code below has the following pipeline diagram:
				50	#
				51	#      cycle
				52	#     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
				53	# iter
				54	#   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
				55	#   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
				56	#   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
				57	#
				58	# where:
				59	# LDI = load of S[I]
				60	# LDJ = load of S[J]
				61	# SWP = swap of S[I] and S[J]
				62	# LDT = load of S[T]
				63	#
				64	# Note that in the above diagram, the major trouble-spot is that LDI
				65	# of the 2nd iteration is performed BEFORE the SWP of the first
				66	# iteration. Fortunately, this is easy to detect (I of the 1st
				67	# iteration will be equal to J of the 2nd iteration) and when this
				68	# happens, we simply forward the proper value from the 1st iteration
				69	# to the 2nd one. The proper value in this case is simply the value
				70	# of S[I] from the first iteration (thanks to the fact that SWP
				71	# simply swaps the contents of S[I] and S[J]).
				72	#
				73	# Another potential trouble-spot is in cycle 7, where SWP of the 1st
				74	# iteration issues at the same time as the LDI of the 3rd iteration.
				75	# However, thanks to IA-64 execution semantics, this can be taken
				76	# care of simply by placing LDI later in the instruction-group than
				77	# SWP. IA-64 CPUs will automatically forward the value if they
				78	# detect that the SWP and LDI are accessing the same memory-location.
				79
				80	# The core-loop that can be pipelined then looks like this (annotated
				81	# with McKinley/Madison issue port & latency numbers, assuming L1
				82	# cache hits for the most part):
				83
				84	# operation: instruction: issue-ports: latency
				85	# ------------------ ----------------------------- ------------- -------
				86
				87	# Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
				88	# shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
				89	# I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
				90	# ;;
				91	# SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
				92	# ;;
				93	# cmp.eq.unc pBypass = I, J * after J is valid!
				94	# J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
				95	# (pBypass) br.cond.spnt Bypass
				96	# ;;
				97	# ---------------------------------------------------------------------------------------
				98	# J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
				99	# ;;
				100	# shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
				101	# ;;
				102	# SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
				103	# ;;
				104	# ---------------------------------------------------------------------------------------
				105	# T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
				106	# ;;
				107	# T = T & 0xff zxt1 T = T I0, I1 1 cyc
				108	# S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
				109	# S[J] = SI st8 [Jptr] = SI M2-M3
				110	# ;;
				111	# shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
				112	# ;;
				113	# ---------------------------------------------------------------------------------------
				114	# T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
				115	# ;;
				116	# data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
				117	# ;;
				118	# *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
				119	# ;;
				120	# ---------------------------------------------------------------------------------------
				121
				122	# There are several points worth making here:
				123
				124	# - Note that due to the bypass/forwarding-path, the first two
				125	# phases of the loop are strangely mingled together. In
				126	# particular, note that the first stage of the pipeline is
				127	# using the value of "J", as calculated by the second stage.
				128	# - Each bundle-pair will have exactly 6 instructions.
				129	# - Pipelined, the loop can execute in 3 cycles/iteration and
				130	# 4 stages. However, McKinley/Madison can issue "st1" to
				131	# the same bank at a rate of at most one per 4 cycles. Thus,
				132	# instead of storing each byte, we accumulate them in a word
				133	# and then write them back at once with a single "st8" (this
				134	# implies that the setup code needs to ensure that the output
				135	# buffer is properly aligned, if need be, by encoding the
				136	# first few bytes separately).
				137	# - There is no space for a "br.ctop" instruction. For this
				138	# reason we can't use modulo-loop support in IA-64 and have
				139	# to do a traditional, purely software-pipelined loop.
				140	# - We can't replace any of the remaining "add/zxt1" pairs with
				141	# "padd1" because the latency for that instruction is too high
				142	# and would push the loop to the point where more bypasses
				143	# would be needed, which we don't have space for.
				144	# - The above loop runs at around 3.26 cycles/byte, or roughly
				145	# 440 MByte/sec on a 1.5GHz Madison. This is well below the
				146	# system bus bandwidth and hence with judicious use of
				147	# "lfetch" this loop can run at (almost) peak speed even when
				148	# the input and output data reside in memory. The
				149	# max. latency that can be tolerated is (PREFETCH_DISTANCE *
				150	# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
				151	# least) 1-ahead prefetching of 128 byte cache-lines. Note
				152	# that we do NOT prefetch into L1, since that would only
				153	# interfere with the S[] table values stored there. This is
				154	# acceptable because there is a 10 cycle latency between
				155	# load and first use of the input data.
				156	# - We use a branch to out-of-line bypass-code because of cycle-pressure:
				157	# we calculate the next J, check for the need to activate the
				158	# bypass path, and activate the bypass path ALL IN THE SAME
				159	# CYCLE. If we didn't have these constraints, we could do
				160	# the bypass with a simple conditional move instruction.
				161	# Fortunately, the bypass paths get activated relatively
				162	# infrequently, so the extra branches don't cost all that much
				163	# (about 0.04 cycles/byte, measured on a 16396 byte file with
				164	# random input data).
				165	#
				166
					# Shape of the generated unrolled loop.
				167	$phases = 4; # number of stages/phases in the pipelined-loop
				168	$unroll_count = 6; # number of times we unrolled it
					# One bit per pipeline phase. emit_body() tests these bits in its
					# $p argument to decide which phase's instructions to emit; the bit
					# positions line up with pPhase[0..3] in the emitted assembly.
				169	$pComI = (1 << 0);
				170	$pComJ = (1 << 1);
				171	$pComT = (1 << 2);
				172	$pOut = (1 << 3);
				173
					# Sizes of the rotating register arrays. emit_body() reduces iteration
					# numbers modulo these counts, and they match the array sizes passed
					# to the .rotr directive in the emitted function prologue.
				174	$NData = 4;
				175	$NIP = 3;
				176	$NJP = 2;
				177	$NI = 2;
				178	$NSI = 3;
				179	$NSJ = 2;
				180	$NT = 2;
				181	$NOutWord = 2;
				182
				183	#
				184	# $threshold is the minimum length before we attempt to use the
				185	# big software-pipelined loop. It MUST be greater-or-equal
				186	# to:
				187	# PHASES * (UNROLL_COUNT + 1) + 7
				188	#
				189	# The "+ 7" comes from the fact we may have to encode up to
				190	# 7 bytes separately before the output pointer is aligned.
				191	#
					# The value used below triples the pipelined part of that bound, so
					# the requirement is met with margin. It is interpolated into the
					# "cmp.gtu pSmall" test that diverts short inputs to .rc4Remainder.
				192	$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
				193
					# I(\$buffer, $format, @args): append one instruction line to a buffer.
					#   \$buffer - reference to the scalar that accumulates generated text
					#   $format  - sprintf format for the instruction
					#   @args    - values substituted into $format
					# The line is indented with two tabs and terminated with a newline.
				194	sub I {
				195	local *code = shift; # alias $code to the caller's buffer for this call only
				196	local $format = shift;
				197	$code .= sprintf ("\t\t".$format."\n", @_);
				198	}
				199
					# P(\$buffer, $format, @args): like I(), but the formatted line is
					# appended without any leading indentation. emit_body() uses it for
					# labels and for the predicated bypass branch.
				200	sub P {
				201	local *code = shift; # alias $code to the caller's buffer for this call only
				202	local $format = shift;
				203	$code .= sprintf ($format."\n", @_);
				204	}
				205
					# STOP(\$buffer): append a ";;" line (the IA-64 instruction-group stop)
					# to the buffer, separating one cycle's instructions from the next.
				206	sub STOP {
				207	local *code = shift; # alias $code to the caller's buffer for this call only
				208	$code .=<<___;
				209	;;
				210	___
				211	}
				212
					# emit_body(\$code, \$bypass, $iteration, $p): emit one three-cycle slot
					# of the unrolled, software-pipelined loop.
					#   \$code     - buffer receiving the in-line instructions
					#   \$bypass   - buffer receiving the out-of-line bypass code
					#   $iteration - index of this slot, counted from 0 across the
					#                prologue, loop body and epilogue
					#   $p         - bitmask of active phases ($pComI, $pComJ, $pComT, $pOut)
					# Each instruction is emitted only if the phase it belongs to is set
					# in $p, which is how the same routine produces the pipeline fill
					# (prologue), steady state (body) and drain (epilogue).
				213	sub emit_body {
				214	local *c = shift;
				215	local *bypass = shift;
				216	local ($iteration, $p) = @_;
				217
					# Iteration numbers of the values handled by this slot and the three
					# slots before it; they are reduced modulo the $N* register counts.
				218	local $i0 = $iteration;
				219	local $i1 = $iteration - 1;
				220	local $i2 = $iteration - 2;
				221	local $i3 = $iteration - 3;
					# Output-word indices and the byte position inside the output word
					# for the byte that leaves the pipeline in this slot.
				222	local $iw0 = ($iteration - 3) / 8;
				223	local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
				224	local $byte_num = ($iteration - 3) % 8;
				225	local $label = $iteration + 1;
					# Note: despite its name, $pAny is true only when all four phase
					# bits are set; it gates the explicit "{ .mmi" / "}" bundle braces.
				226	local $pAny = ($p & 0xf) == 0xf;
					# The bypass check is emitted whenever a new S[I] is loaded, except
					# in the very first slot.
				227	local $pByp = (($p & $pComI) && ($iteration > 0));
				228
				229	$c.=<<___;
				230	//////////////////////////////////////////////////
				231	___
				232
					# No phase bit set: only store the low 4 bytes of the pending output
					# word (shifted down first on big-endian hosts) and return.
				233	if (($p & 0xf) == 0) {
				234	$c.="#ifdef HOST_IS_BIG_ENDIAN\n";
				235	&I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
				236	$iw1 % $NOutWord, $iw1 % $NOutWord);
				237	$c.="#endif\n";
				238	&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
				239	return;
				240	}
				241
				242	# Cycle 0
				243	&I(\$c, "{ .mmi") if ($pAny);
				244	&I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
				245	&I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
				246	&I(\$c, "zxt1 J = J") if ($p & $pComJ);
				247	&I(\$c, "}") if ($pAny);
				248	&I(\$c, "{ .mmi") if ($pAny);
				249	&I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
				250	&I(\$c, "add T[%u] = SI[%u], SJ[%u]",
				251	$i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
				252	&I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
				253	&I(\$c, "}") if ($pAny);
				254	&STOP(\$c);
				255
				256	# Cycle 1
					# The two SKEY stores (the swap) are emitted before the LKEY of SI so
					# that the load sits later in the instruction group than the stores.
				257	&I(\$c, "{ .mmi") if ($pAny);
				258	&I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
				259	&I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
				260	&I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
				261	&I(\$c, "}") if ($pAny);
				262	&I(\$c, "{ .mmi") if ($pAny);
				263	&I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
				264	&I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
				265	&I(\$c, "xor Data[%u] = Data[%u], T[%u]",
				266	$i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
				267	&I(\$c, "}") if ($pAny);
				268	&STOP(\$c);
				269
				270	# Cycle 2
					# J is updated, compared against the previous I, and the bypass
					# branch is issued, all within this one cycle.
				271	&I(\$c, "{ .mmi") if ($pAny);
				272	&I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
				273	&I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
				274	&I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
				275	$iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
				276	&I(\$c, "}") if ($pAny);
				277	&I(\$c, "{ .mmb") if ($pAny);
				278	&I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
				279	&I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
				280	&P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
				281	&I(\$c, "}") if ($pAny);
				282	&STOP(\$c);
				283
					# Landing point for the out-of-line bypass code emitted further down.
				284	&P(\$c, ".rc4Resume%u:", $label) if ($pByp);
					# After the pipeline has filled, every slot whose output byte index
					# is 0 writes out an 8-byte output word.
				285	if ($byte_num == 0 && $iteration >= $phases) {
				286	&I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
				287	$iw1 % $NOutWord) if ($p & $pOut);
					# Last slot of the unrolled body: issue the prefetches for the next
					# pass and the counted loop branch back to .rc4Loop.
				288	if ($iteration == (1 + $unroll_count) * $phases - 1) {
				289	if ($unroll_count == 6) {
				290	&I(\$c, "mov OutWord[%u] = OutWord[%u]",
				291	$iw1 % $NOutWord, $iw0 % $NOutWord);
				292	}
				293	&I(\$c, "lfetch.nt1 [InPrefetch], %u",
				294	$unroll_count * $phases);
				295	&I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
				296	$unroll_count * $phases);
				297	&I(\$c, "br.cloop.sptk.few .rc4Loop");
				298	}
				299	}
				300
					# Out-of-line bypass code, collected in $bypass: take back the SI
					# value just added to J, add the previous iteration's SI instead,
					# copy that value into the current SI register, then branch back
					# to the resume label.
				301	if ($pByp) {
				302	&P(\$bypass, ".rc4Bypass%u:", $label);
				303	&I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
				304	&I(\$bypass, "nop 0");
				305	&I(\$bypass, "nop 0");
				306	&I(\$bypass, ";;");
				307	&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
				308	&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
				309	&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
				310	&I(\$bypass, ";;");
				311	}
				312	}
				313
				314	$code=<<___;
				315	.ident \"rc4-ia64.s, version 3.0\"
				316	.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
				317
				318	#define LCSave r8
				319	#define PRSave r9
				320
				321	/* Inputs become invalid once rotation begins! */
				322
				323	#define StateTable in0
				324	#define DataLen in1
				325	#define InputBuffer in2
				326	#define OutputBuffer in3
				327
				328	#define KTable r14
				329	#define J r15
				330	#define InPtr r16
				331	#define OutPtr r17
				332	#define InPrefetch r18
				333	#define OutPrefetch r19
				334	#define One r20
				335	#define LoopCount r21
				336	#define Remainder r22
				337	#define IFinal r23
				338	#define EndPtr r24
				339
				340	#define tmp0 r25
				341	#define tmp1 r26
				342
				343	#define pBypass p6
				344	#define pDone p7
				345	#define pSmall p8
				346	#define pAligned p9
				347	#define pUnaligned p10
				348
				349	#define pComputeI pPhase[0]
				350	#define pComputeJ pPhase[1]
				351	#define pComputeT pPhase[2]
				352	#define pOutput pPhase[3]
				353
				354	#define RetVal r8
				355	#define L_OK p7
				356	#define L_NOK p8
				357
				358	#define _NINPUTS 4
				359	#define _NOUTPUT 0
				360
				361	#define _NROTATE 24
				362	#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
				363
				364	#ifndef SZ
				365	# define SZ 4 // this must be set to sizeof(RC4_INT)
				366	#endif
				367
				368	#if SZ == 1
				369	# define LKEY ld1
				370	# define SKEY st1
				371	# define KEYADDR(dst, i) add dst = i, KTable
				372	#elif SZ == 2
				373	# define LKEY ld2
				374	# define SKEY st2
				375	# define KEYADDR(dst, i) shladd dst = i, 1, KTable
				376	#elif SZ == 4
				377	# define LKEY ld4
				378	# define SKEY st4
				379	# define KEYADDR(dst, i) shladd dst = i, 2, KTable
				380	#else
				381	# define LKEY ld8
				382	# define SKEY st8
				383	# define KEYADDR(dst, i) shladd dst = i, 3, KTable
				384	#endif
				385
				386	#if defined(_HPUX_SOURCE) && !defined(_LP64)
				387	# define ADDP addp4
				388	#else
				389	# define ADDP add
				390	#endif
				391
				392	/* Define a macro for the bit number of the n-th byte: */
				393
				394	#if defined(_HPUX_SOURCE) \|\| defined(B_ENDIAN)
				395	# define HOST_IS_BIG_ENDIAN
				396	# define BYTE_POS(n) (56 - (8 * (n)))
				397	#else
				398	# define BYTE_POS(n) (8 * (n))
				399	#endif
				400
				401	/*
				402	We must perform the first phase of the pipeline explicitly since
				403	we will always load from the stable the first time. The br.cexit
				404	will never be taken since regardless of the number of bytes because
				405	the epilogue count is 4.
				406	*/
				407	/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
				408	assembler failed on original macro with syntax error. <appro> */
				409	#define MODSCHED_RC4_PROLOGUE \\
				410	{ \\
				411	ld1 Data[0] = [InPtr], 1; \\
				412	add IFinal = 1, I[1]; \\
				413	KEYADDR(IPr[0], I[1]); \\
				414	} ;; \\
				415	{ \\
				416	LKEY SI[0] = [IPr[0]]; \\
				417	mov pr.rot = 0x10000; \\
				418	mov ar.ec = 4; \\
				419	} ;; \\
				420	{ \\
				421	add J = J, SI[0]; \\
				422	zxt1 I[0] = IFinal; \\
				423	br.cexit.spnt.few .+16; /* never taken */ \\
				424	} ;;
				425	#define MODSCHED_RC4_LOOP(label) \\
				426	label: \\
				427	{ .mmi; \\
				428	(pComputeI) ld1 Data[0] = [InPtr], 1; \\
				429	(pComputeI) add IFinal = 1, I[1]; \\
				430	(pComputeJ) zxt1 J = J; \\
				431	}{ .mmi; \\
				432	(pOutput) LKEY T[1] = [T[1]]; \\
				433	(pComputeT) add T[0] = SI[2], SJ[1]; \\
				434	(pComputeI) KEYADDR(IPr[0], I[1]); \\
				435	} ;; \\
				436	{ .mmi; \\
				437	(pComputeT) SKEY [IPr[2]] = SJ[1]; \\
				438	(pComputeT) SKEY [JP[1]] = SI[2]; \\
				439	(pComputeT) zxt1 T[0] = T[0]; \\
				440	}{ .mmi; \\
				441	(pComputeI) LKEY SI[0] = [IPr[0]]; \\
				442	(pComputeJ) KEYADDR(JP[0], J); \\
				443	(pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
				444	} ;; \\
				445	{ .mmi; \\
				446	(pComputeJ) LKEY SJ[0] = [JP[0]]; \\
				447	(pOutput) xor Data[3] = Data[3], T[1]; \\
				448	nop 0x0; \\
				449	}{ .mmi; \\
				450	(pComputeT) KEYADDR(T[0], T[0]); \\
				451	(pBypass) mov SI[0] = SI[1]; \\
				452	(pComputeI) zxt1 I[0] = IFinal; \\
				453	} ;; \\
				454	{ .mmb; \\
				455	(pOutput) st1 [OutPtr] = Data[3], 1; \\
				456	(pComputeI) add J = J, SI[0]; \\
				457	br.ctop.sptk.few label; \\
				458	} ;;
				459
				460	.text
				461
				462	.align 32
				463
				464	.type RC4, \@function
				465	.global RC4
				466
				467	.proc RC4
				468	.prologue
				469
				470	RC4:
				471	{
				472	.mmi
				473	alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
				474
				475	.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
				476	OutWord[2]
				477	.rotp pPhase[4]
				478
				479	ADDP InPrefetch = 0, InputBuffer
				480	ADDP KTable = 0, StateTable
				481	}
				482	{
				483	.mmi
				484	ADDP InPtr = 0, InputBuffer
				485	ADDP OutPtr = 0, OutputBuffer
				486	mov RetVal = r0
				487	}
				488	;;
				489	{
				490	.mmi
				491	lfetch.nt1 [InPrefetch], 0x80
				492	ADDP OutPrefetch = 0, OutputBuffer
				493	}
				494	{ // Return 0 if the input length is nonsensical
				495	.mib
				496	ADDP StateTable = 0, StateTable
				497	cmp.ge.unc L_NOK, L_OK = r0, DataLen
				498	(L_NOK) br.ret.sptk.few rp
				499	}
				500	;;
				501	{
				502	.mib
				503	cmp.eq.or L_NOK, L_OK = r0, InPtr
				504	cmp.eq.or L_NOK, L_OK = r0, OutPtr
				505	nop 0x0
				506	}
				507	{
				508	.mib
				509	cmp.eq.or L_NOK, L_OK = r0, StateTable
				510	nop 0x0
				511	(L_NOK) br.ret.sptk.few rp
				512	}
				513	;;
				514	LKEY I[1] = [KTable], SZ
				515	/* Prefetch the state-table. It contains 256 elements of size SZ */
				516
				517	#if SZ == 1
				518	ADDP tmp0 = 1*128, StateTable
				519	#elif SZ == 2
				520	ADDP tmp0 = 3*128, StateTable
				521	ADDP tmp1 = 2*128, StateTable
				522	#elif SZ == 4
				523	ADDP tmp0 = 7*128, StateTable
				524	ADDP tmp1 = 6*128, StateTable
				525	#elif SZ == 8
				526	ADDP tmp0 = 15*128, StateTable
				527	ADDP tmp1 = 14*128, StateTable
				528	#endif
				529	;;
				530	#if SZ >= 8
				531	lfetch.fault.nt1 [tmp0], -256 // 15
				532	lfetch.fault.nt1 [tmp1], -256;;
				533	lfetch.fault.nt1 [tmp0], -256 // 13
				534	lfetch.fault.nt1 [tmp1], -256;;
				535	lfetch.fault.nt1 [tmp0], -256 // 11
				536	lfetch.fault.nt1 [tmp1], -256;;
				537	lfetch.fault.nt1 [tmp0], -256 // 9
				538	lfetch.fault.nt1 [tmp1], -256;;
				539	#endif
				540	#if SZ >= 4
				541	lfetch.fault.nt1 [tmp0], -256 // 7
				542	lfetch.fault.nt1 [tmp1], -256;;
				543	lfetch.fault.nt1 [tmp0], -256 // 5
				544	lfetch.fault.nt1 [tmp1], -256;;
				545	#endif
				546	#if SZ >= 2
				547	lfetch.fault.nt1 [tmp0], -256 // 3
				548	lfetch.fault.nt1 [tmp1], -256;;
				549	#endif
				550	{
				551	.mii
				552	lfetch.fault.nt1 [tmp0] // 1
				553	add I[1]=1,I[1];;
				554	zxt1 I[1]=I[1]
				555	}
				556	{
				557	.mmi
				558	lfetch.nt1 [InPrefetch], 0x80
				559	lfetch.excl.nt1 [OutPrefetch], 0x80
				560	.save pr, PRSave
				561	mov PRSave = pr
				562	} ;;
				563	{
				564	.mmi
				565	lfetch.excl.nt1 [OutPrefetch], 0x80
				566	LKEY J = [KTable], SZ
				567	ADDP EndPtr = DataLen, InPtr
				568	} ;;
				569	{
				570	.mmi
				571	ADDP EndPtr = -1, EndPtr // Make it point to
				572	// last data byte.
				573	mov One = 1
				574	.save ar.lc, LCSave
				575	mov LCSave = ar.lc
				576	.body
				577	} ;;
				578	{
				579	.mmb
				580	sub Remainder = 0, OutPtr
				581	cmp.gtu pSmall, p0 = $threshold, DataLen
				582	(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
				583	// big loop.
				584	} ;;
				585	{
				586	.mmi
				587	and Remainder = 0x7, Remainder
				588	;;
				589	cmp.eq pAligned, pUnaligned = Remainder, r0
				590	nop 0x0
				591	} ;;
				592	{
				593	.mmb
				594	.pred.rel "mutex",pUnaligned,pAligned
				595	(pUnaligned) add Remainder = -1, Remainder
				596	(pAligned) sub Remainder = EndPtr, InPtr
				597	(pAligned) br.cond.dptk.many .rc4Aligned
				598	} ;;
				599	{
				600	.mmi
				601	nop 0x0
				602	nop 0x0
				603	mov.i ar.lc = Remainder
				604	}
				605
				606	/* Do the initial few bytes via the compact, modulo-scheduled loop
				607	until the output pointer is 8-byte-aligned. */
				608
				609	MODSCHED_RC4_PROLOGUE
				610	MODSCHED_RC4_LOOP(.RC4AlignLoop)
				611
				612	{
				613	.mib
				614	sub Remainder = EndPtr, InPtr
				615	zxt1 IFinal = IFinal
				616	clrrrb // Clear CFM.rrb.pr so
				617	;; // next "mov pr.rot = N"
				618	// does the right thing.
				619	}
				620	{
				621	.mmi
				622	mov I[1] = IFinal
				623	nop 0x0
				624	nop 0x0
				625	} ;;
				626
				627
				628	.rc4Aligned:
				629
				630	/*
				631	Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
				632	*/
				633
				634	{
				635	.mlx
				636	add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
				637	movl Remainder = 0xaaaaaaaaaaaaaaab
				638	} ;;
				639	{
				640	.mmi
				641	setf.sig f6 = LoopCount // M2, M3 6 cyc
				642	setf.sig f7 = Remainder // M2, M3 6 cyc
				643	nop 0x0
				644	} ;;
				645	{
				646	.mfb
				647	nop 0x0
				648	xmpy.hu f6 = f6, f7
				649	nop 0x0
				650	} ;;
				651	{
				652	.mmi
				653	getf.sig LoopCount = f6;; // M2 5 cyc
				654	nop 0x0
				655	shr.u LoopCount = LoopCount, 4
				656	} ;;
				657	{
				658	.mmi
				659	nop 0x0
				660	nop 0x0
				661	mov.i ar.lc = LoopCount
				662	} ;;
				663
				664	/* Now comes the unrolled loop: */
				665
				666	.rc4Prologue:
				667	___
				668
				669	$iteration = 0;
				670
				671	# Generate the prologue:
				672	$predicates = 1;
				673	for ($i = 0; $i < $phases; ++$i) {
				674	&emit_body (\$code, \$bypass, $iteration++, $predicates);
				675	$predicates = ($predicates << 1) \| 1;
				676	}
				677
				678	$code.=<<___;
				679	.rc4Loop:
				680	___
				681
				682	# Generate the body:
				683	for ($i = 0; $i < $unroll_count*$phases; ++$i) {
				684	&emit_body (\$code, \$bypass, $iteration++, $predicates);
				685	}
				686
				687	$code.=<<___;
				688	.rc4Epilogue:
				689	___
				690
				691	# Generate the epilogue:
				692	for ($i = 0; $i < $phases; ++$i) {
				693	$predicates <<= 1;
				694	&emit_body (\$code, \$bypass, $iteration++, $predicates);
				695	}
				696
				697	$code.=<<___;
				698	{
				699	.mmi
				700	lfetch.nt1 [EndPtr] // fetch line with last byte
				701	mov IFinal = I[1]
				702	nop 0x0
				703	}
				704
				705	.rc4Remainder:
				706	{
				707	.mmi
				708	sub Remainder = EndPtr, InPtr // Calculate
				709	// # of bytes
				710	// left - 1
				711	nop 0x0
				712	nop 0x0
				713	} ;;
				714	{
				715	.mib
				716	cmp.eq pDone, p0 = -1, Remainder // done already?
				717	mov.i ar.lc = Remainder
				718	(pDone) br.cond.dptk.few .rc4Complete
				719	}
				720
				721	/* Do the remaining bytes via the compact, modulo-scheduled loop */
				722
				723	MODSCHED_RC4_PROLOGUE
				724	MODSCHED_RC4_LOOP(.RC4RestLoop)
				725
				726	.rc4Complete:
				727	{
				728	.mmi
				729	add KTable = -SZ, KTable
				730	add IFinal = -1, IFinal
				731	mov ar.lc = LCSave
				732	} ;;
				733	{
				734	.mii
				735	SKEY [KTable] = J,-SZ
				736	zxt1 IFinal = IFinal
				737	mov pr = PRSave, 0x1FFFF
				738	} ;;
				739	{
				740	.mib
				741	SKEY [KTable] = IFinal
				742	add RetVal = 1, r0
				743	br.ret.sptk.few rp
				744	} ;;
				745	___
				746
				747	# Last but not least, emit the code for the bypass-code of the unrolled loop:
					# $bypass was filled by emit_body(); appending it here places the
					# bypass stubs after the function's final br.ret and before .endp.
				748
				749	$code.=$bypass;
				750
				751	$code.=<<___;
				752	.endp RC4
				753	___
				754
					# Write the complete generated assembly to standard output.
				755	print $code;