I found that asm version of csum_partial_copy_from_user() introduced
in e9e016815f264227b6260f77ca84f1c43cf8b9bd was less effective.
For csum_partial_copy_from_user() case, "both_aligned" 8-word copy/sum
loop block is skipped to handle LOAD failure properly, and 4-word
copy/sum block is not loop, thus we will loop at ineffective
"less_than_4units" block.
This patch re-arrange register usages so that t0-t7 can be used in
"both_aligned" loop. This makes "both_aligned" loop can be used for
copy_from_user case too. This patch also cleanup codes around entry
point.
Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
---
arch/mips/lib/csum_partial.S | 74 ++++++++++++++++---------------------------
include/asm-mips/checksum.h | 12 +++---
2 files changed, 35 insertions(+), 51 deletions(-)
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index ec0744d..26ac0f8 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -291,8 +291,8 @@ #define dst a1
#define len a2
#define psum a3
#define sum v0
-#define odd t5
-#define errptr t6
+#define odd t8
+#define errptr t9
/*
* The exception handler for loads requires that:
@@ -376,30 +376,20 @@ #define ADDRMASK (NBYTES-1)
.set noat
-LEAF(csum_partial_copy_nocheck)
- move AT, zero
- b __csum_partial_copy
- move errptr, zero
-FEXPORT(__csum_partial_copy_from_user)
- b __csum_partial_copy_user
- PTR_ADDU AT, src, len /* See (1) above. */
-FEXPORT(__csum_and_copy_to_user)
- move AT, zero
-__csum_partial_copy_user:
+LEAF(__csum_partial_copy_user)
+ PTR_ADDU AT, src, len /* See (1) above. */
#ifdef CONFIG_64BIT
move errptr, a4
#else
lw errptr, 16(sp)
#endif
-__csum_partial_copy:
+FEXPORT(csum_partial_copy_nocheck)
move sum, zero
move odd, zero
/*
* Note: dst & src may be unaligned, len may be 0
* Temps
*/
-#define rem t8
-
/*
* The "issue break"s below are very approximate.
* Issue delays for dcache fills will perturb the schedule, as will
@@ -422,51 +412,45 @@ #define rem t8
both_aligned:
SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
beqz t0, cleanup_both_aligned # len < 8*NBYTES
- and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
- /*
- * We can not do this loop if LOAD might fail, otherwize
- * l_exc_copy can not calclate sum correctly.
- * AT==0 means LOAD should not fail.
- */
- bnez AT, cleanup_both_aligned
nop
+ SUB len, 8*NBYTES # subtract here for bgez loop
.align 4
1:
- LOAD t0, UNIT(0)(src)
- LOAD t1, UNIT(1)(src)
- LOAD t2, UNIT(2)(src)
- LOAD t3, UNIT(3)(src)
+EXC( LOAD t0, UNIT(0)(src), l_exc)
+EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
+EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
+EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
+EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
+EXC( LOAD t5, UNIT(5)(src), l_exc_copy)
+EXC( LOAD t6, UNIT(6)(src), l_exc_copy)
+EXC( LOAD t7, UNIT(7)(src), l_exc_copy)
SUB len, len, 8*NBYTES
- LOAD t4, UNIT(4)(src)
- LOAD t7, UNIT(5)(src)
+ ADD src, src, 8*NBYTES
EXC( STORE t0, UNIT(0)(dst), s_exc)
ADDC(sum, t0)
EXC( STORE t1, UNIT(1)(dst), s_exc)
ADDC(sum, t1)
- LOAD t0, UNIT(6)(src)
- LOAD t1, UNIT(7)(src)
- ADD src, src, 8*NBYTES
- ADD dst, dst, 8*NBYTES
-EXC( STORE t2, UNIT(-6)(dst), s_exc)
+EXC( STORE t2, UNIT(2)(dst), s_exc)
ADDC(sum, t2)
-EXC( STORE t3, UNIT(-5)(dst), s_exc)
+EXC( STORE t3, UNIT(3)(dst), s_exc)
ADDC(sum, t3)
-EXC( STORE t4, UNIT(-4)(dst), s_exc)
+EXC( STORE t4, UNIT(4)(dst), s_exc)
ADDC(sum, t4)
-EXC( STORE t7, UNIT(-3)(dst), s_exc)
+EXC( STORE t5, UNIT(5)(dst), s_exc)
+ ADDC(sum, t5)
+EXC( STORE t6, UNIT(6)(dst), s_exc)
+ ADDC(sum, t6)
+EXC( STORE t7, UNIT(7)(dst), s_exc)
ADDC(sum, t7)
-EXC( STORE t0, UNIT(-2)(dst), s_exc)
- ADDC(sum, t0)
-EXC( STORE t1, UNIT(-1)(dst), s_exc)
- .set reorder
- ADDC(sum, t1)
- bne len, rem, 1b
- .set noreorder
+ bgez len, 1b
+ ADD dst, dst, 8*NBYTES
+ ADD len, 8*NBYTES # revert len (see above)
/*
- * len == rem == the number of bytes left to copy < 8*NBYTES
+ * len == the number of bytes left to copy < 8*NBYTES
*/
cleanup_both_aligned:
+#define rem t7
beqz len, done
sltu t0, len, 4*NBYTES
bnez t0, less_than_4units
@@ -729,4 +713,4 @@ s_exc:
li v1, -EFAULT
jr ra
sw v1, (errptr)
- END(csum_partial_copy_nocheck)
+ END(__csum_partial_copy_user)
diff --git a/include/asm-mips/checksum.h b/include/asm-mips/checksum.h
index 6596fe6..84b0ace 100644
--- a/include/asm-mips/checksum.h
+++ b/include/asm-mips/checksum.h
@@ -29,10 +29,8 @@ #include <asm/uaccess.h>
*/
__wsum csum_partial(const void *buff, int len, __wsum sum);
-__wsum __csum_partial_copy_from_user(const void __user *src, void *dst,
- int len, __wsum sum, int *err_ptr);
-__wsum __csum_and_copy_to_user(const void *src, void __user *dst,
- int len, __wsum sum, int *err_ptr);
+__wsum __csum_partial_copy_user(const void __user *src, void __user *dst,
+ int len, __wsum sum, int *err_ptr);
/*
* this is a new version of the above that records errors it finds in *errp,
@@ -43,7 +41,8 @@ __wsum csum_partial_copy_from_user(const
__wsum sum, int *err_ptr)
{
might_sleep();
- return __csum_partial_copy_from_user(src, dst, len, sum, err_ptr);
+ return __csum_partial_copy_user(src, (void __user *)dst,
+ len, sum, err_ptr);
}
/*
@@ -56,7 +55,8 @@ __wsum csum_and_copy_to_user(const void
{
might_sleep();
if (access_ok(VERIFY_WRITE, dst, len))
- return __csum_and_copy_to_user(src, dst, len, sum, err_ptr);
+ return __csum_partial_copy_user((const void __user *)src, dst,
+ len, sum, err_ptr);
if (len)
*err_ptr = -EFAULT;
|