Ralf,
The R4000/R4400 has a coprocessor 0 hazard when a P-cache operation is
less than two non-load, non-cache instructions apart from a store to the
same line. For processors without a secondary cache, the code in pg-r4k.c
currently issues a Create Dirty Exclusive D-cache operation and then
immediately executes consecutive stores to the same line, therefore
fulfilling the conditions for the hazard.
The following patch changes the problematic operations to be performed on
the cache line following the one to be written immediately. It is safe to
do so, because the cache operations are only a performance hint and are
not required for data coherency. However, it is essential not to run past
the end of the page, so the trailing area of the page is excluded from
these cache operations, similarly to what has already been done for
prefetching.
Actually, I'd like to optimize the functions a bit further, specifically
to avoid multiple cacheops to the same line (if you don't mind), but
currently I'd like to apply this change to ensure correct operation. As I
have no non-SC R4000/R4400 system, this is untested, but perhaps studying
the problem covered by the -scache patch sent previously will show if the
hazard is indeed avoided.
The patch also increases the buffers a bit for three reasons:
1. copy_page_array is already too small for the 128-byte S-cache line
case. ;-)
2. The trail for non-SC R4000/R4400 increases buffer consumption and I was
too lazy to calculate the requirements.
3. The planned optimization will likely require a little bit more space as
well.
BTW, I was unable to reproduce your instruction count calculation for the
prefetch case; other results seem OK.
OK to apply?
Maciej
--
+ Maciej W. Rozycki, Technical University of Gdansk, Poland +
+--------------------------------------------------------------+
+ e-mail: macro@ds2.pg.gda.pl, PGP key available +
patch-mips-2.4.24-pre2-20040116-mips-pg-r4k-hazard-7
diff -up --recursive --new-file
linux-mips-2.4.24-pre2-20040116.macro/arch/mips/mm/pg-r4k.c
linux-mips-2.4.24-pre2-20040116/arch/mips/mm/pg-r4k.c
--- linux-mips-2.4.24-pre2-20040116.macro/arch/mips/mm/pg-r4k.c 2004-01-26
16:05:38.000000000 +0000
+++ linux-mips-2.4.24-pre2-20040116/arch/mips/mm/pg-r4k.c 2004-01-26
16:06:21.000000000 +0000
@@ -34,7 +34,7 @@
* With prefetching, 16 byte strides 0xa0 bytes
*/
-static unsigned int clear_page_array[0xa0 / 4];
+static unsigned int clear_page_array[0x100 / 4];
void clear_page(void * page) __attribute__((alias("clear_page_array")));
@@ -46,7 +46,7 @@ void clear_page(void * page) __attribute
* R4600 v2.0: 0x84 bytes
* With prefetching, 16 byte strides 0xb8 bytes
*/
-static unsigned int copy_page_array[0xb8 / 4];
+static unsigned int copy_page_array[0x100 / 4];
void copy_page(void *to, void *from) __attribute__((alias("copy_page_array")));
@@ -159,7 +159,7 @@ static inline void build_cdex(void)
mi.c_format.rs = 4; /* $a0 */
mi.c_format.c_op = 3; /* Create Dirty Exclusive */
mi.c_format.cache = 1; /* Data Cache */
- mi.c_format.simmediate = store_offset;
+ mi.c_format.simmediate = store_offset + cpu_dcache_line_size();
*epc++ = mi.word;
}
@@ -300,6 +300,8 @@ static inline void build_jr_ra(void)
void __init build_clear_page(void)
{
+ int lead_size, loop_size;
+
epc = (unsigned int *) &clear_page_array;
if (cpu_has_prefetch) {
@@ -316,7 +318,20 @@ void __init build_clear_page(void)
}
}
- build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_clear :
0));
+ if (cpu_has_prefetch)
+ lead_size = PAGE_SIZE - pref_offset_clear;
+ else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+ loop_size = 4;
+ if (cpu_has_64bit_registers)
+ loop_size *= 2;
+ loop_size *= 8;
+ if (loop_size < cpu_dcache_line_size())
+ loop_size = cpu_dcache_line_size();
+ lead_size = PAGE_SIZE - loop_size;
+ } else
+ lead_size = PAGE_SIZE;
+
+ build_addiu_a2_a0(lead_size);
if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
*epc++ = 0x40026000; /* mfc0 $v0, $12 */
@@ -354,8 +369,8 @@ dest = epc;
build_bne(dest);
build_store_reg(0);
- if (cpu_has_prefetch && pref_offset_clear) {
- build_addiu_a2_a0(pref_offset_clear);
+ if (lead_size < PAGE_SIZE) {
+ build_addiu_a2_a0(PAGE_SIZE - lead_size);
dest = epc;
__build_store_reg(0);
__build_store_reg(0);
@@ -383,9 +398,26 @@ dest = epc;
void __init build_copy_page(void)
{
+ int lead_size, loop_size;
+
epc = (unsigned int *) &copy_page_array;
- build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_copy :
0));
+ if (cpu_has_prefetch)
+ lead_size = PAGE_SIZE - pref_offset_copy;
+ else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+ loop_size = 4;
+#ifdef CONFIG_MIPS64
+ loop_size *= 2;
+#endif
+ loop_size *= 8;
+ if (loop_size < cpu_dcache_line_size())
+ loop_size = cpu_dcache_line_size();
+ lead_size = PAGE_SIZE - loop_size;
+ } else
+ lead_size = PAGE_SIZE;
+
+ build_addiu_a2_a0(lead_size);
+
if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
*epc++ = 0x40026000; /* mfc0 $v0, $12 */
@@ -440,8 +472,8 @@ dest = epc;
build_bne(dest);
build_store_reg(11);
- if (cpu_has_prefetch && pref_offset_copy) {
- build_addiu_a2_a0(pref_offset_copy);
+ if (lead_size < PAGE_SIZE) {
+ build_addiu_a2_a0(PAGE_SIZE - lead_size);
dest = epc;
__build_load_reg( 8);
__build_load_reg( 9);
diff -up --recursive --new-file
linux-mips-2.4.24-pre2-20040116.macro/arch/mips64/mm/pg-r4k.c
linux-mips-2.4.24-pre2-20040116/arch/mips64/mm/pg-r4k.c
--- linux-mips-2.4.24-pre2-20040116.macro/arch/mips64/mm/pg-r4k.c
2004-01-26 16:05:38.000000000 +0000
+++ linux-mips-2.4.24-pre2-20040116/arch/mips64/mm/pg-r4k.c 2004-01-26
16:06:21.000000000 +0000
@@ -34,7 +34,7 @@
* With prefetching, 16 byte strides 0xa0 bytes
*/
-static unsigned int clear_page_array[0xa0 / 4];
+static unsigned int clear_page_array[0x100 / 4];
void clear_page(void * page) __attribute__((alias("clear_page_array")));
@@ -46,7 +46,7 @@ void clear_page(void * page) __attribute
* R4600 v2.0: 0x84 bytes
* With prefetching, 16 byte strides 0xb8 bytes
*/
-static unsigned int copy_page_array[0xb8 / 4];
+static unsigned int copy_page_array[0x100 / 4];
void copy_page(void *to, void *from) __attribute__((alias("copy_page_array")));
@@ -159,7 +159,7 @@ static inline void build_cdex(void)
mi.c_format.rs = 4; /* $a0 */
mi.c_format.c_op = 3; /* Create Dirty Exclusive */
mi.c_format.cache = 1; /* Data Cache */
- mi.c_format.simmediate = store_offset;
+ mi.c_format.simmediate = store_offset + cpu_dcache_line_size();
*epc++ = mi.word;
}
@@ -300,6 +300,8 @@ static inline void build_jr_ra(void)
void __init build_clear_page(void)
{
+ int lead_size, loop_size;
+
epc = (unsigned int *) &clear_page_array;
if (cpu_has_prefetch) {
@@ -316,7 +318,20 @@ void __init build_clear_page(void)
}
}
- build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_clear :
0));
+ if (cpu_has_prefetch)
+ lead_size = PAGE_SIZE - pref_offset_clear;
+ else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+ loop_size = 4;
+ if (cpu_has_64bit_registers)
+ loop_size *= 2;
+ loop_size *= 8;
+ if (loop_size < cpu_dcache_line_size())
+ loop_size = cpu_dcache_line_size();
+ lead_size = PAGE_SIZE - loop_size;
+ } else
+ lead_size = PAGE_SIZE;
+
+ build_addiu_a2_a0(lead_size);
if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
*epc++ = 0x40026000; /* mfc0 $v0, $12 */
@@ -354,8 +369,8 @@ dest = epc;
build_bne(dest);
build_store_reg(0);
- if (cpu_has_prefetch && pref_offset_clear) {
- build_addiu_a2_a0(pref_offset_clear);
+ if (lead_size < PAGE_SIZE) {
+ build_addiu_a2_a0(PAGE_SIZE - lead_size);
dest = epc;
__build_store_reg(0);
__build_store_reg(0);
@@ -383,9 +398,26 @@ dest = epc;
void __init build_copy_page(void)
{
+ int lead_size, loop_size;
+
epc = (unsigned int *) &copy_page_array;
- build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_copy :
0));
+ if (cpu_has_prefetch)
+ lead_size = PAGE_SIZE - pref_offset_copy;
+ else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+ loop_size = 4;
+#ifdef CONFIG_MIPS64
+ loop_size *= 2;
+#endif
+ loop_size *= 8;
+ if (loop_size < cpu_dcache_line_size())
+ loop_size = cpu_dcache_line_size();
+ lead_size = PAGE_SIZE - loop_size;
+ } else
+ lead_size = PAGE_SIZE;
+
+ build_addiu_a2_a0(lead_size);
+
if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
*epc++ = 0x40026000; /* mfc0 $v0, $12 */
@@ -440,8 +472,8 @@ dest = epc;
build_bne(dest);
build_store_reg(11);
- if (cpu_has_prefetch && pref_offset_copy) {
- build_addiu_a2_a0(pref_offset_copy);
+ if (lead_size < PAGE_SIZE) {
+ build_addiu_a2_a0(PAGE_SIZE - lead_size);
dest = epc;
__build_load_reg( 8);
__build_load_reg( 9);
|