From macro@ds2.pg.gda.pl  Wed Jan 30 15:56:35 2002
Received: (uucp@localhost) by guadalquivir.fnet.fr (8.8.8/97.02.12/Guadalquivir); id PAA08537; Wed, 30 Jan 2002 15:56:35 +0100 (MET)
Received-Date: Wed, 30 Jan 2002 15:56:35 +0100 (MET)
Received: from delta.ds2.pg.gda.pl(213.192.72.1)
 via SMTP by guadalquivir.fnet.fr, id smtpd008528; Wed Jan 30 15:56:16 2002
Received: from localhost by delta.ds2.pg.gda.pl (8.9.3/8.9.3) with SMTP id PAA08257;
	Wed, 30 Jan 2002 15:56:15 +0100 (MET)
Date: Wed, 30 Jan 2002 15:56:14 +0100 (MET)
From: "Maciej W. Rozycki" <macro@ds2.pg.gda.pl>
To: Ralf Baechle <ralf@uni-koblenz.de>
cc: linux-mips@fnet.fr, linux-mips@oss.sgi.com
Subject: [patch] linux 2.4.17: An mb() rework
Message-ID: <Pine.GSO.3.96.1020130151155.2880D-100000@delta.ds2.pg.gda.pl>
Organization: Technical University of Gdansk
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII
Content-Length: 4180
Lines: 142

Hello,

 The current implementation of mb() and friends is broken.  It depends on
a model-specific behaviour of the R4000/R4400 CPU and does not guarantee
preserving the desired semantics over the whole MIPS family.  What's
worse, recently I've identified a case it doesn't work at all on an
R4400SC CPU -- values written to an i/o memory resource were not committed
to the device even after executing over 40 subsequent instructions. 

 Here is a patch that implements mb() in an "architecturally defined" way. 
Since "sync" acts as a reordering barrier and an uncached load cannot be
prefetched or postponed, this implementation assures a correct
serialization on every MIPS processor, regardless of its implementation
details. 

 For MIPS I CPUs, which do not support the "sync" instruction, a lone
uncached load is defined to provide a serialization itself, thus the
implementation is correct for these CPUs as well. 

 The patch looks more complicated than it should, but it's a part of a
wbflush() clean-up and a few of these helper submacros are really needed
(and others are defined for consistency).

 Tested successfully on an R3400 and an R4400SC CPU on DECstation systems. 

 If anyone has any comments, then please speak up.  Otherwise, Ralf,
please apply.

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

patch-mips-2.4.17-20020111-mb-4
diff -up --recursive --new-file linux-mips-2.4.17-20020111.macro/arch/mips/config.in linux-mips-2.4.17-20020111/arch/mips/config.in
--- linux-mips-2.4.17-20020111.macro/arch/mips/config.in	Mon Jan  7 05:27:13 2002
+++ linux-mips-2.4.17-20020111/arch/mips/config.in	Sat Jan 26 02:35:35 2002
@@ -363,6 +363,12 @@ else
       fi
    fi
 fi
+if [ "$CONFIG_CPU_R3000" = "y" -o \
+     "$CONFIG_CPU_TX39XX" = "y" ]; then
+   define_bool CONFIG_CPU_HAS_SYNC n
+else
+   define_bool CONFIG_CPU_HAS_SYNC y
+fi
 endmenu
 
 mainmenu_option next_comment
diff -up --recursive --new-file linux-mips-2.4.17-20020111.macro/include/asm-mips/system.h linux-mips-2.4.17-20020111/include/asm-mips/system.h
--- linux-mips-2.4.17-20020111.macro/include/asm-mips/system.h	Thu Dec 13 05:28:09 2001
+++ linux-mips-2.4.17-20020111/include/asm-mips/system.h	Wed Jan 30 02:17:18 2002
@@ -18,9 +18,12 @@
 
 #include <linux/config.h>
 #include <asm/sgidefs.h>
-#include <asm/ptrace.h>
+
 #include <linux/kernel.h>
 
+#include <asm/addrspace.h>
+#include <asm/ptrace.h>
+
 __asm__ (
 	".macro\t__sti\n\t"
 	".set\tpush\n\t"
@@ -170,27 +173,57 @@ extern void __global_restore_flags(unsig
 /*
  * These are probably defined overly paranoid ...
  */
+#ifdef CONFIG_CPU_HAS_SYNC
+#define __sync()	__asm__ __volatile__("sync" : : : "memory")
+#else
+#define __sync()	do { } while(0)
+#endif
+
+#define __fast_wmb()	do { } while(0)
+#define __fast_rmb()			\
+	__asm__ __volatile__(		\
+		".set	push\n\t"	\
+		".set	noreorder\n\t"	\
+		"lw	$0,%0\n\t"	\
+		"nop\n\t"		\
+		".set	pop"		\
+		: /* no output */	\
+		: "m" (*(int *)KSEG1)	\
+		: "memory")
+#define __fast_mb()	__fast_rmb()
+
+#define fast_wmb()			\
+	do {				\
+		__sync();		\
+		__fast_wmb();		\
+	} while (0)
+#define fast_rmb()			\
+	do {				\
+		__sync();		\
+		__fast_rmb();		\
+	} while (0)
+#define fast_mb()			\
+	do {				\
+		__sync();		\
+		__fast_mb();		\
+	} while (0)
+
 #ifdef CONFIG_CPU_HAS_WB
 
 #include <asm/wbflush.h>
-#define rmb()	do { } while(0)
-#define wmb()	wbflush()
-#define mb()	wbflush()
+#define wmb()		fast_wmb()
+#define rmb()				\
+	do {				\
+		__sync();		\
+		wbflush();		\
+	} while (0)
+#define mb()		rmb()
 
 #else /* CONFIG_CPU_HAS_WB  */
 
-#define mb()						\
-__asm__ __volatile__(					\
-	"# prevent instructions being moved around\n\t"	\
-	".set\tnoreorder\n\t"				\
-	"# 8 nops to fool the R4400 pipeline\n\t"	\
-	"nop;nop;nop;nop;nop;nop;nop;nop\n\t"		\
-	".set\treorder"					\
-	: /* no output */				\
-	: /* no input */				\
-	: "memory")
-#define rmb() mb()
-#define wmb() mb()
+#define wmb()		fast_wmb()
+#define rmb()		fast_rmb()
+#define mb()		fast_mb()
 
 #endif /* CONFIG_CPU_HAS_WB  */
 

From macro@ds2.pg.gda.pl  Thu Jan 31 13:18:28 2002
Received: (uucp@localhost) by guadalquivir.fnet.fr (8.8.8/97.02.12/Guadalquivir); id NAA20227; Thu, 31 Jan 2002 13:18:28 +0100 (MET)
Received-Date: Thu, 31 Jan 2002 13:18:28 +0100 (MET)
Received: from delta.ds2.pg.gda.pl(213.192.72.1)
 via SMTP by guadalquivir.fnet.fr, id smtpd020223; Thu Jan 31 13:18:23 2002
Received: from localhost by delta.ds2.pg.gda.pl (8.9.3/8.9.3) with SMTP id NAA08560;
	Thu, 31 Jan 2002 13:17:39 +0100 (MET)
Date: Thu, 31 Jan 2002 13:17:39 +0100 (MET)
From: "Maciej W. Rozycki" <macro@ds2.pg.gda.pl>
To: Jason Gunthorpe <jgg@debian.org>
cc: linux-mips@fnet.fr, linux-mips@oss.sgi.com
Subject: Re: [patch] linux 2.4.17: An mb() rework
In-Reply-To: <Pine.LNX.3.96.1020130123109.11192A-100000@wakko.deltatee.com>
Message-ID: <Pine.GSO.3.96.1020131115837.5578A-100000@delta.ds2.pg.gda.pl>
Organization: Technical University of Gdansk
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII
Content-Length: 3199
Lines: 65

On Wed, 30 Jan 2002, Jason Gunthorpe wrote:

> I was under the impression that the mb() functions existed to maintain
> order of memory access and were not defined as flushes per say - so is
> the time delay a concern (perhaps sticking a sync before ERET is
> appropriate?)

 Well, other architectures seem to define the macros such that wmb() is a
pure ordering barrier (to assure strong ordering between writes) and rmb()
is a flush (since reads are synchronous by their nature this implies all
other transactions have to be finished before).

 Putting a "sync" before "eret" certainly doesn't work.  The case I've
identified is mask_and_ack() in an interrupt handler.  It masks an active
IRQ line in an external controller so that handle_IRQ_event() can unmask
interrupts in the CPU (this implies mask_and_ack() is synchronous).  But
due to the current lack of proper synchronization, the write isn't
executed until after the __sti() there.  As a result for almost every IRQ
routed through the external controler there is a spurious one signalled
shortly afterwards.  There is still a long path from here to an "eret". 

> I spent some time talking with the Sandcraft people about memory barrier
> issues, and it turns out that at least on the SR71000 (and in most cases
> the RM7K) the order of SysAD transactions will always match the order of
> the instruction stream, but all writes are posted and all reads are split
> - that is the CPU can execute two back to back uncached loads and several
> back to back uncached stores without stalling the pipeline, or getting the
> IO's out of order. Adding sync's and uncached loads only slows things down
> for these chips. 

 Ok, what about assuring a value written to an I/O memory resource has
actually been commited?  That's what rmb() is for.  Since the CPU is
strongly-orderes wmb() (a "sync") should be indistinguishable from a
"nop". 

> I understand this is because the CPUs have a single load/store unit and do
> not do out of order execution. Many older/embedded MIPS designs probably
> have a similar configuration, they could likely also run with out the
> syncs. 

 Putting a wmb() or not should be the matter of requirements of specific
drivers.  Wasting a "nop" (effectively) is surely a justifiable price for
system's consistency.

> So - could you add something like CONFIG_IN_ORDER_IO which would nullify
> the syncs for these processors?

 Hmm, the option seems to exist already, namely CONFIG_NONCOHERENT_IO, but
is it really worth making the third variation to save a single "nop",
especially as barriers are relatively rare? I'll do it, if it is.

> BTW, does anyone know what CONFIG_WB is ment to mean? The CPU has a write
> buffer that does not preserve order? 

 Certain DECstation models have a write-back buffer that needs to be
handled explicitly.  For example rmb() is "1: bc0f 1b" for the R3220 WB
chip.  Wmb() is null, certainly, as the buffer is strongly-ordered.  See
arch/mips/dec/wbflush.c for details.

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

From macro@ds2.pg.gda.pl  Thu Jan 31 22:50:45 2002
Received: (uucp@localhost) by guadalquivir.fnet.fr (8.8.8/97.02.12/Guadalquivir); id WAA27257; Thu, 31 Jan 2002 22:50:45 +0100 (MET)
Received-Date: Thu, 31 Jan 2002 22:50:45 +0100 (MET)
Received: from delta.ds2.pg.gda.pl(213.192.72.1)
 via SMTP by guadalquivir.fnet.fr, id smtpd027253; Thu Jan 31 22:50:41 2002
Received: from localhost by delta.ds2.pg.gda.pl (8.9.3/8.9.3) with SMTP id WAA08755;
	Thu, 31 Jan 2002 22:50:21 +0100 (MET)
Date: Thu, 31 Jan 2002 22:50:20 +0100 (MET)
From: "Maciej W. Rozycki" <macro@ds2.pg.gda.pl>
Reply-To: "Maciej W. Rozycki" <macro@ds2.pg.gda.pl>
To: Jason Gunthorpe <jgg@debian.org>
cc: linux-mips@fnet.fr, linux-mips@oss.sgi.com
Subject: Re: [patch] linux 2.4.17: An mb() rework
In-Reply-To: <Pine.LNX.3.96.1020131110531.13418A-100000@wakko.deltatee.com>
Message-ID: <Pine.GSO.3.96.1020131215446.579H-100000@delta.ds2.pg.gda.pl>
Organization: Technical University of Gdansk
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII
Content-Length: 5890
Lines: 146

On Thu, 31 Jan 2002, Jason Gunthorpe wrote:

> >  Well, other architectures seem to define the macros such that wmb() is a
> > pure ordering barrier (to assure strong ordering between writes) and rmb()
> > is a flush (since reads are synchronous by their nature this implies all
> > other transactions have to be finished before).
> 
> Yes, I've noticed the wmb bit, not sure about rmb - it is hard to tell
> without knowing the gritty details of the unusual asm ops used :< sparc64 
> for instance seems to define them purely as ordering barriers.

 Weird.

> A little more qualification would be that:
>     write(...,device);   // Disable int
>     wmb()
>     enable_ints();
> Is expected to have a potential spurious interrupt. But, perhaps this is

 Of course -- this only assures any write within "enable_ints()" won't
happen before "write(...,device)".  Actual writes may happen much later,
e.g. when conditions on the bus allow. 

> OK:
>     outl(...,device);
>     wmb();
>     enable_ints();
> This is consistant with how the PCI spec discusses ordering/etc and
> barriers are frequently used in the PCI drivers. Looking at the i386 code
> this is what I would expect to see.

 Hmm, the DECstation is not a PCI machine.  And the external interrupt
controller can be treated as a part of the chipset -- writes to it doesn't
go through any kind of an external bus.  It's possible that the chipset
acts as a write-back buffer extension for the CPU's internal buffer.

> >  Putting a "sync" before "eret" certainly doesn't work.  The case I've
> > identified is mask_and_ack() in an interrupt handler.  It masks an active
> > IRQ line in an external controller so that handle_IRQ_event() can unmask
> 
> Yeah, Ok, this is fairly normal. You have the unique case were using a
> sync will solve your problem. Sync only fixes this if the target address
> is in the CPU's system controller, it does not work for, say, PCI devices.

 A "sync" is required since otherwise a write can bypass a read.  For CPUs
for which it doesn't happen, a "sync" is effectively a "nop", so it
doesn't really matter.

 Note that I don't need a "sync" specifically (the code works for me
either way), since for the R4400 a write cannot bypass a read, but it's
still needed for correctness. 

> IMHO it is better to manage this sort of flushing explicitly in all
> drivers, PCI drivers already do a read from device, drivers for system
> controller devices could do that, or use a special sync macro.. That way
> it is visible in the code and not relying on an arch/cpu-specific wmb() 
> side effect. 

 Hmm, wmb() is pretty clearly defined.  The current implementation does
not enforce strict ordering and is thus incorrect.  Note that the R4400
manual explicitly states a cached write can bypass an uncached one, hence
the CPU may exploit weak ordering under certain circumstances.  The "sync" 
instruction was specifically defined to avoid such a risk.

> If you are using a UP mips that has strongly ordered blocking reads and
> writes I'd think that rmb/wmb should only be asm("":::"memory"). If your
> driver needs to do syncs for non-ordering reasons it should be doing
> syncs. 

 Nope, it's not always strongly ordered -- see the R4400 manual.  Also
note that rmb() is defined as asm("lock; addl $0,0(%%esp)":::"memory") for
the i386, to make sure all transactions are completed even in the UP mode,
even though the i386 is strongly ordered.

> No, a sync will still empty the write buffer. It may halt the pipeline for
> many (~80 perhaps) cycles while posted write data is dumped to the system
> controller.

 Then it's an implementation bug.  For a CPU in the UP mode "sync" is only
defined to prevent write and read reordering -- there is nothing about
flushing.

> Regrettably, the SysAD bus that alot of mips chips use does not allow a
> non-posted write, so in that case 'sync' is not going to commit an I/O
> write as you would expect, it just moves it into a write buffer on the
> system controller or on a PCI bridge. (This is not what PCI defines for IO
> accesses, I don't know of any really elegent way to fix it though..)

 That's a correct implementation -- see the "sync" definition.

> I don't think rmb is generaly defined to flush IO writes.. It isn't used
> very often at all in the drives/net and drivers/scsi stuff..

 If a driver doesn't need it then it doesn't use it.  It may be buggy as
well, e.g. due to being i386-centric.

> Er, no, CONFIG_NONCOHERENT_IO is for caches that are not coherent with the
> IO subsystem.

 I haven't investigated the option much, I admit.

> While we are on this topic, does anyone know what the correct linux way
> to turn off an interrupt at a PCI device is? I've seen this in a few
> of the drivers:
> 
>     writeb(0,base+INT_ENABLE);
>     readb(base+INT_ENABLE);    // Flush the write through the PCI bus
>     wmb();
>     ints_on();
> 
> Or should it be:
> 
>     writeb(0,base+INT_ENABLE);
>     wmb();
>     readb(base+INT_ENABLE);    // Flush the write through the PCI bus
>     rmb();
>     ints_on();
> 
> Which makes more sense..

 I would do it as:

	writeb(0,base+INT_ENABLE);
	mb();
	readb(base+INT_ENABLE);	// Flush the write through the PCI bus
	rmb();
	ints_on();

You need an "mb()", since you are changing the access type, so you need to
synchronize both kinds.

> If the latter is correct then your patch needs to define rmb as:
>    'lw $1,KSEG'
>    'addiu $0, $1, 0x0000'
> Otherwise MIPS chips that do not support blocking uncached memory reads
> will not work right. (SR71000 is such a chip)

 I don't understand what the purpose of the above code is, except that it
wastes a cycle.  Please elaborate. 

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

