Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Optimized version of the ip_fast_csum() function |
| 3 | * Used for calculating IP header checksum |
| 4 | * |
| 5 | * Return: 16bit checksum, complemented |
| 6 | * |
| 7 | * Inputs: |
| 8 | * in0: address of buffer to checksum (char *) |
| 9 | * in1: length of the buffer in 32-bit words (int); 5 means a 20-byte header |
| 10 | * |
| 11 | * Copyright (C) 2002 Intel Corp. |
| 12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> |
| 13 | */ |
| 14 | |
| 15 | #include <asm/asmmacro.h> |
| 16 | |
| 17 | /* |
| 18 | * Since this function is most likely called with buf aligned on a 4-byte |
| 19 | * boundary and 20 bytes in length, we can execute rather quickly compared |
| 20 | * with calling the generic do_csum, which has a lot of overhead in |
| 21 | * handling various alignments and sizes. However, because no constraints |
| 22 | * are placed on the function's input arguments, cases where buf is not 4-byte |
| 23 | * aligned or the size is not 20 bytes are handled by the generic do_csum function. |
| 24 | */ |
| 25 | |
/* Register aliases used by the code below. */
#define in0	r32		/* first argument:  buffer address */
#define in1	r33		/* second argument: length in 32-bit words */
#define ret0	r8		/* return value register */
| 29 | |
/*
 * ip_fast_csum(in0 = buf, in1 = len)
 *
 *	in0:	address of the buffer to checksum
 *	in1:	length of the buffer in 32-bit words
 *	ret0:	one's-complement of the folded 16-bit sum, zero-extended
 *		(bits 16..63 of the result are always clear)
 *
 * Fast path (in1 == 5 and in0 is 4-byte aligned): the five 32-bit words are
 * loaded through two interleaved pointers, summed in 64 bits, folded to
 * 16 bits and complemented.  No register frame is allocated on this path.
 *
 * Every other case branches to .generic, which allocates a frame, calls
 * do_csum(buf, len * 4) and complements its result.
 *
 * Scratch registers written here: r9, r14, r15, r20-r24, p6, p7.
 */
GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
	and	r14=3,in0	// is it aligned on 4-byte?
	add	r15=4,in0	// second source pointer
	;;
	// Merge the alignment test into the size test:
	//   p6 = (len != 5) || (buf & 3)	-> take the generic path
	//   p7 = (len == 5) && !(buf & 3)	-> take the fast path
	cmp.ne.or.andcm p6,p7=r14,r0
	;;
	// The first two loads are predicated on p7 and issued in the same
	// instruction group as the branch to the generic path.
(p7)	ld4	r20=[in0],8	// word 0, in0  -> word 2
(p7)	ld4	r21=[r15],8	// word 1, r15  -> word 3
(p6)	br.spnt	.generic
	;;
	ld4	r22=[in0],8	// word 2, in0  -> word 4
	ld4	r23=[r15],8	// word 3
	;;
	ld4	r24=[in0]	// word 4
	add	r20=r20,r21	// word0 + word1
	add	r22=r22,r23	// word2 + word3
	;;
	add	r20=r20,r22
	;;
	add	r20=r20,r24	// r20 = 64-bit sum of all five words
	;;
	// Fold the sum into 16 bits: add the bits above bit 15 back into the
	// low 16 bits, three times so that every carry is absorbed.
	shr.u	ret0=r20,16	// now need to add the carry
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16	// add carry again
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	mov	r9=0xffff	// mask for the 16-bit result
	;;
	// ret0 = 0xffff & ~r20: complement only the low 16 bits so that the
	// upper 48 bits of the return value stay clear.
	andcm	ret0=r9,r20
	.restore sp		// reset frame state
	br.ret.sptk.many b0
	;;

.generic:
	.prologue
	.save ar.pfs, r35
	alloc	r35=ar.pfs,2,2,2,0	// 2 inputs, 2 locals (r34, r35), 2 outputs
	.save rp, r34
	mov	r34=b0			// keep the return address across the call
	.body
	dep.z	out1=in1,2,30		// out1 = in1 << 2: word count -> byte count
	mov	out0=in0
	;;
	br.call.sptk.many b0=do_csum
	;;
	mov	r9=0xffff		// same 16-bit mask as on the fast path
	;;
	andcm	ret0=r9,ret0		// ret0 = 0xffff & ~do_csum(buf, len * 4)
	mov	ar.pfs=r35
	mov	b0=r34
	br.ret.sptk.many b0
END(ip_fast_csum)