For multi-node Loongson-3 (NUMA configuration), r4k_blast_scache() can only flush Node-0's scache. So add r4k_blast_scache_node(), which uses (CAC_BASE | (node_id << NODE_ADDRSPACE_SHIFT)) rather than CKSEG0 as the start address of the indexed flush.
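For example, with NODE_ADDRSPACE_SHIFT == 44 this yields the following per-node start addresses (illustrative values, assuming the usual noncoherent XKPHYS CAC_BASE of 0x9800000000000000 on 64-bit Loongson-3):

	node 0: 0x9800000000000000 | (0UL << 44) = 0x9800000000000000
	node 1: 0x9800000000000000 | (1UL << 44) = 0x9800100000000000

Each blast_scache*_node() variant then index-walks the scache from that base, touching only the given node's L2.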
Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Huacai Chen <chenhc@lemote.com>
---
 arch/mips/include/asm/mach-loongson64/mmzone.h |  1 +
 arch/mips/include/asm/mmzone.h                 |  8 +++++
 arch/mips/include/asm/r4kcache.h               | 25 +++++++++++++++
 arch/mips/mm/c-r4k.c                           | 44 ++++++++++++++++++++++----
 4 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/arch/mips/include/asm/mach-loongson64/mmzone.h b/arch/mips/include/asm/mach-loongson64/mmzone.h
index c9f7e23..59c8b11 100644
--- a/arch/mips/include/asm/mach-loongson64/mmzone.h
+++ b/arch/mips/include/asm/mach-loongson64/mmzone.h
@@ -21,6 +21,7 @@
 #define NODE3_ADDRSPACE_OFFSET	0x300000000000UL
 
 #define pa_to_nid(addr)  (((addr) & 0xf00000000000) >> NODE_ADDRSPACE_SHIFT)
+#define nid_to_addrbase(nid) ((nid) << NODE_ADDRSPACE_SHIFT)
 
 #define LEVELS_PER_SLICE 128
 
diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h
index f085fba..2a0fe1d 100644
--- a/arch/mips/include/asm/mmzone.h
+++ b/arch/mips/include/asm/mmzone.h
@@ -9,6 +9,14 @@
 #include <asm/page.h>
 #include <mmzone.h>
 
+#ifndef pa_to_nid
+#define pa_to_nid(addr) 0
+#endif
+
+#ifndef nid_to_addrbase
+#define nid_to_addrbase(nid) 0
+#endif
+
 #ifdef CONFIG_DISCONTIGMEM
 
 #define pfn_to_nid(pfn)		pa_to_nid((pfn) << PAGE_SHIFT)
diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h
index 7f12d7e..e3f70dc 100644
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -747,4 +747,29 @@ __BUILD_BLAST_CACHE_RANGE(s, scache, Hit_Writeback_Inv_SD, , )
 __BUILD_BLAST_CACHE_RANGE(inv_d, dcache, Hit_Invalidate_D, , )
 __BUILD_BLAST_CACHE_RANGE(inv_s, scache, Hit_Invalidate_SD, , )
 
+/* Currently, this is very specific to Loongson-3 */
+#define __BUILD_BLAST_CACHE_NODE(pfx, desc, indexop, hitop, lsize)	\
+static inline void blast_##pfx##cache##lsize##_node(long node)		\
+{									\
+	unsigned long start = CAC_BASE | nid_to_addrbase(node);		\
+	unsigned long end = start + current_cpu_data.desc.waysize;	\
+	unsigned long ws_inc = 1UL << current_cpu_data.desc.waybit;	\
+	unsigned long ws_end = current_cpu_data.desc.ways <<		\
+			       current_cpu_data.desc.waybit;		\
+	unsigned long ws, addr;						\
+									\
+	__##pfx##flush_prologue						\
+									\
+	for (ws = 0; ws < ws_end; ws += ws_inc)				\
+		for (addr = start; addr < end; addr += lsize * 32)	\
+			cache##lsize##_unroll32(addr|ws, indexop);	\
+									\
+	__##pfx##flush_epilogue						\
+}
+
+__BUILD_BLAST_CACHE_NODE(s, scache, Index_Writeback_Inv_SD, Hit_Writeback_Inv_SD, 16)
+__BUILD_BLAST_CACHE_NODE(s, scache, Index_Writeback_Inv_SD, Hit_Writeback_Inv_SD, 32)
+__BUILD_BLAST_CACHE_NODE(s, scache, Index_Writeback_Inv_SD, Hit_Writeback_Inv_SD, 64)
+__BUILD_BLAST_CACHE_NODE(s, scache, Index_Writeback_Inv_SD, Hit_Writeback_Inv_SD, 128)
+
 #endif /* _ASM_R4KCACHE_H */
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index a9ef057..05a539d 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -459,11 +459,28 @@ static void r4k_blast_scache_setup(void)
 		r4k_blast_scache = blast_scache128;
 }
 
+static void (*r4k_blast_scache_node)(long node);
+
+static void r4k_blast_scache_node_setup(void)
+{
+	unsigned long sc_lsize = cpu_scache_line_size();
+
+	if (current_cpu_type() != CPU_LOONGSON3)
+		r4k_blast_scache_node = (void *)cache_noop;
+	else if (sc_lsize == 16)
+		r4k_blast_scache_node = blast_scache16_node;
+	else if (sc_lsize == 32)
+		r4k_blast_scache_node = blast_scache32_node;
+	else if (sc_lsize == 64)
+		r4k_blast_scache_node = blast_scache64_node;
+	else if (sc_lsize == 128)
+		r4k_blast_scache_node = blast_scache128_node;
+}
+
 static inline void local_r4k___flush_cache_all(void * args)
 {
 	switch (current_cpu_type()) {
 	case CPU_LOONGSON2:
-	case CPU_LOONGSON3:
 	case CPU_R4000SC:
 	case CPU_R4000MC:
 	case CPU_R4400SC:
@@ -480,6 +497,11 @@ static inline void local_r4k___flush_cache_all(void * args)
 		r4k_blast_scache();
 		break;
 
+	case CPU_LOONGSON3:
+		/* Use get_ebase_cpunum() for both NUMA=y/n */
+		r4k_blast_scache_node(get_ebase_cpunum() >> 2);
+		break;
+
 	case CPU_BMIPS5000:
 		r4k_blast_scache();
 		__sync();
@@ -840,10 +862,14 @@ static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size)
 
 	preempt_disable();
 	if (cpu_has_inclusive_pcaches) {
-		if (size >= scache_size)
-			r4k_blast_scache();
-		else
+		if (size >= scache_size) {
+			if (current_cpu_type() != CPU_LOONGSON3)
+				r4k_blast_scache();
+			else
+				r4k_blast_scache_node(pa_to_nid(addr));
+		} else {
 			blast_scache_range(addr, addr + size);
+		}
 		preempt_enable();
 		__sync();
 		return;
@@ -877,9 +903,12 @@ static void r4k_dma_cache_inv(unsigned long addr, unsigned long size)
 
 	preempt_disable();
 	if (cpu_has_inclusive_pcaches) {
-		if (size >= scache_size)
-			r4k_blast_scache();
-		else {
+		if (size >= scache_size) {
+			if (current_cpu_type() != CPU_LOONGSON3)
+				r4k_blast_scache();
+			else
+				r4k_blast_scache_node(pa_to_nid(addr));
+		} else {
 			/*
 			 * There is no clearly documented alignment requirement
 			 * for the cache instruction on MIPS processors and
@@ -1918,6 +1947,7 @@ void r4k_cache_init(void)
 	r4k_blast_scache_page_setup();
 	r4k_blast_scache_page_indexed_setup();
 	r4k_blast_scache_setup();
+	r4k_blast_scache_node_setup();
 #ifdef CONFIG_EVA
 	r4k_blast_dcache_user_page_setup();
 	r4k_blast_icache_user_page_setup();
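To make the two node-selection paths above concrete, here is a stand-alone sketch (user-space C, purely illustrative: it assumes Loongson-3's four cores per node behind the get_ebase_cpunum() >> 2 shift, and mirrors the pa_to_nid() mask from mmzone.h):

	#include <stdio.h>

	#define NODE_ADDRSPACE_SHIFT	44
	#define pa_to_nid(addr)	(((addr) & 0xf00000000000UL) >> NODE_ADDRSPACE_SHIFT)

	int main(void)
	{
		/* local_r4k___flush_cache_all(): flush the running core's node.
		 * EBase.CPUNum is a global core number and Loongson-3 has four
		 * cores per node, so cpunum >> 2 is the node ID. */
		unsigned int cpunum = 6;		/* core 6 sits on node 1 */
		printf("flush_cache_all -> node %u\n", cpunum >> 2);

		/* r4k_dma_cache_{wback_,}inv(): flush the buffer's home node,
		 * derived from bits 47..44 of its address. */
		unsigned long addr = 0x100000002000UL;	/* node-1 memory */
		printf("dma cache flush -> node %lu\n", pa_to_nid(addr));
		return 0;
	}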
Hi Huacai,
Copying DMA mapping maintainers for any input they may have.
On Wed, Sep 05, 2018 at 05:33:02PM +0800, Huacai Chen wrote:
>  static inline void local_r4k___flush_cache_all(void * args)
>  {
>  	switch (current_cpu_type()) {
>  	case CPU_LOONGSON2:
> -	case CPU_LOONGSON3:
>  	case CPU_R4000SC:
>  	case CPU_R4000MC:
>  	case CPU_R4400SC:
> @@ -480,6 +497,11 @@ static inline void local_r4k___flush_cache_all(void * args)
>  		r4k_blast_scache();
>  		break;
> 
> +	case CPU_LOONGSON3:
> +		/* Use get_ebase_cpunum() for both NUMA=y/n */
> +		r4k_blast_scache_node(get_ebase_cpunum() >> 2);
> +		break;
I wonder if we could instead just include the node ID bits in INDEX_BASE? Then we could continue using r4k_blast_scache() here as usual.
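Roughly something like the below is what I have in mind - untested, and it would need checking that every user of INDEX_BASE tolerates a value which is no longer a compile-time constant (the nid_to_addrbase()/cpu_to_node() combination here is just a sketch):

	/* Hypothetical: make the index ops target the local node's scache
	 * by construction, so r4k_blast_scache() stays unchanged. Assumes
	 * the caller runs with preemption disabled. */
	#define INDEX_BASE	(CAC_BASE | \
			nid_to_addrbase(cpu_to_node(smp_processor_id())))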
>  	case CPU_BMIPS5000:
>  		r4k_blast_scache();
>  		__sync();
> @@ -840,10 +862,14 @@ static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size)
>  	preempt_disable();
>  	if (cpu_has_inclusive_pcaches) {
> -		if (size >= scache_size)
> -			r4k_blast_scache();
> -		else
> +		if (size >= scache_size) {
> +			if (current_cpu_type() != CPU_LOONGSON3)
> +				r4k_blast_scache();
> +			else
> +				r4k_blast_scache_node(pa_to_nid(addr));
> +		} else {
>  			blast_scache_range(addr, addr + size);
> +		}
>  		preempt_enable();
>  		__sync();
>  		return;
Hmm, so if I understand correctly this will writeback+invalidate the L2 for one node only? i.e. you've just changed which node that is.
I'm presuming L2 ops performed in one node aren't broadcast to other nodes, otherwise this patch is pointless?
Thus presumably L2 caches in other nodes may contain stale data, right? Or even worse, dirty data which may get written back at any moment?
I'm not sure this is safe - do you need to operate on all L2 caches in the system here?
I also wonder whether it would be cleaner for Loongson3 to provide a custom struct dma_map_ops to implement this, rather than adding the condition to the generic implementation.
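For example, a sketch along these lines (uncompiled; the loongson3_* names are hypothetical, and it assumes the node-aware blast helpers and scache_size were reachable from such a file):

	static void loongson3_dma_sync_for_device(struct device *dev,
			dma_addr_t dma_handle, size_t size,
			enum dma_data_direction dir)
	{
		phys_addr_t pa = dma_to_phys(dev, dma_handle);
		unsigned long addr = (unsigned long)phys_to_virt(pa);

		if (size >= scache_size)
			/* full flush of the buffer's home node only */
			r4k_blast_scache_node(pa_to_nid(pa));
		else
			/* hit-type ops work regardless of node */
			blast_scache_range(addr, addr + size);
	}

	static const struct dma_map_ops loongson3_dma_map_ops = {
		/* .alloc/.free/.map_page/... as in the generic code */
		.sync_single_for_device	= loongson3_dma_sync_for_device,
	};

That would keep the CPU_LOONGSON3 checks out of the common r4k paths entirely.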
Thanks,
    Paul
On Thu, Sep 27, 2018 at 5:47 AM Paul Burton <paul.burton@mips.com> wrote:
> Hi Huacai,
>
> Copying DMA mapping maintainers for any input they may have.
>
> On Wed, Sep 05, 2018 at 05:33:02PM +0800, Huacai Chen wrote:
> >  static inline void local_r4k___flush_cache_all(void * args)
> >  {
> >  	switch (current_cpu_type()) {
> >  	case CPU_LOONGSON2:
> > -	case CPU_LOONGSON3:
> >  	case CPU_R4000SC:
> >  	case CPU_R4000MC:
> >  	case CPU_R4400SC:
> > @@ -480,6 +497,11 @@ static inline void local_r4k___flush_cache_all(void * args)
> >  		r4k_blast_scache();
> >  		break;
> >
> > +	case CPU_LOONGSON3:
> > +		/* Use get_ebase_cpunum() for both NUMA=y/n */
> > +		r4k_blast_scache_node(get_ebase_cpunum() >> 2);
> > +		break;
>
> I wonder if we could instead just include the node ID bits in
> INDEX_BASE? Then we could continue using r4k_blast_scache() here as
> usual.
Yes, but it has no advantages.
> >  	case CPU_BMIPS5000:
> >  		r4k_blast_scache();
> >  		__sync();
> > @@ -840,10 +862,14 @@ static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size)
> >  	preempt_disable();
> >  	if (cpu_has_inclusive_pcaches) {
> > -		if (size >= scache_size)
> > -			r4k_blast_scache();
> > -		else
> > +		if (size >= scache_size) {
> > +			if (current_cpu_type() != CPU_LOONGSON3)
> > +				r4k_blast_scache();
> > +			else
> > +				r4k_blast_scache_node(pa_to_nid(addr));
> > +		} else {
> >  			blast_scache_range(addr, addr + size);
> > +		}
> >  		preempt_enable();
> >  		__sync();
> >  		return;
>
> Hmm, so if I understand correctly this will writeback+invalidate the L2
> for one node only? i.e. you've just changed which node that is.
>
> I'm presuming L2 ops performed in one node aren't broadcast to other
> nodes, otherwise this patch is pointless?
>
> Thus presumably L2 caches in other nodes may contain stale data, right?
> Or even worse, dirty data which may get written back at any moment?
>
> I'm not sure this is safe - do you need to operate on all L2 caches in
> the system here?
>
> I also wonder whether it would be cleaner for Loongson3 to provide a
> custom struct dma_map_ops to implement this, rather than adding the
> condition to the generic implementation.
In Loongson-3, the L2 cache is shared by all nodes, which means a given memory address has only one copy in L2: node-0's memory is cached only in node-0's L2, and node-1's memory is cached only in node-1's L2. If node-0 wants to access node-1's memory, it may hit in node-1's L2.
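So for the DMA paths, the node selected by pa_to_nid() is by construction the only L2 that can hold the buffer's lines. A worked example with the mask from mmzone.h (bit arithmetic only):

	/* pa_to_nid(addr) = (addr & 0xf00000000000) >> 44
	 *
	 * addr = 0x000000002000 -> node 0: cacheable only in node-0's L2
	 * addr = 0x100000002000 -> node 1: cacheable only in node-1's L2
	 *
	 * r4k_blast_scache_node(pa_to_nid(addr)) therefore indexes exactly
	 * the one L2 that may hold the DMA buffer; other nodes never cache
	 * it, so no stale or dirty copy is left behind.
	 */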
Huacai
> Thanks,
>     Paul