I found that the asm version of csum_partial_copy_from_user() introduced
in commit e9e016815f264227b6260f77ca84f1c43cf8b9bd is less efficient than
it could be.
For the csum_partial_copy_from_user() case, the "both_aligned" 8-word
copy/sum loop block is skipped so that a LOAD failure can be handled
properly. So we should iterate over the 4-word copy/sum block in that
case; otherwise we will loop in the inefficient "less_than_4units" block.
Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
---
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index ec0744d..0d6e9ae 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -488,8 +488,11 @@ EXC( STORE t2, UNIT(2)(dst), s_exc)
ADDC(sum, t2)
EXC( STORE t3, UNIT(3)(dst), s_exc)
ADDC(sum, t3)
- beqz len, done
+ /* If we skipped both_aligned 8-word loop, iterate here */
+ bnez AT, cleanup_both_aligned
ADD dst, dst, 4*NBYTES
+ beqz len, done
+ nop
less_than_4units:
/*
* rem = len % NBYTES
|