Each Chip-Select (CS) of a Unified Memory Controller (UMC) on AMD EPYC SOCs has an Address Mask and a Secondary Address Mask register associated with it. The amd64_edac module logs DIMM sizes on a per-UMC per-CS granularity during init using these two registers.
Currently, the module primarily considers only the Address Mask register for computing DIMM sizes. The Secondary Address Mask register is only considered for odd CS. Additionally, if it has been considered, the Address Mask register is ignored altogether for that CS. For power-of-two DIMMs, this is not an issue since only the Address Mask register is used.
For non-power-of-two DIMMs, however, the Secondary Address Mask register is used in conjunction with the Address Mask register. However, since the module only considers either of the two registers for a CS, the size computed by the module is incorrect. The Secondary Address Mask register is not considered for even CS, and the Address Mask register is not considered for odd CS.
Furthermore, also rename some variables for greater clarity.
Fixes: 81f5090db843 ("EDAC/amd64: Support asymmetric dual-rank DIMMs") Signed-off-by: Avadhut Naik avadhut.naik@amd.com Cc: stable@vger.kernel.org --- drivers/edac/amd64_edac.c | 56 ++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 21 deletions(-)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 90f0eb7cc5b9..16117fda727f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1209,7 +1209,9 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) if (csrow_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_PRIMARY;
- /* Asymmetric dual-rank DIMM support. */ + if (csrow_sec_enabled(2 * dimm, ctrl, pvt)) + cs_mode |= CS_EVEN_SECONDARY; + if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_SECONDARY;
@@ -1230,12 +1232,10 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; }
-static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, - int csrow_nr, int dimm) +static int calculate_cs_size(u32 mask, unsigned int cs_mode) { - u32 msb, weight, num_zero_bits; - u32 addr_mask_deinterleaved; - int size = 0; + int msb, weight, num_zero_bits; + u32 deinterleaved_mask = 0;
/* * The number of zero bits in the mask is equal to the number of bits @@ -1248,19 +1248,32 @@ static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, * without swapping with the most significant bit. This can be handled * by keeping the MSB where it is and ignoring the single zero bit. */ - msb = fls(addr_mask_orig) - 1; - weight = hweight_long(addr_mask_orig); + msb = fls(mask) - 1; + weight = hweight_long(mask); num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE);
/* Take the number of zero bits off from the top of the mask. */ - addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); + deinterleaved_mask = GENMASK(msb - num_zero_bits, 1); + edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", deinterleaved_mask); + + return (deinterleaved_mask >> 2) + 1; +} + +static int __addr_mask_to_cs_size(u32 addr_mask, u32 addr_mask_sec, + unsigned int cs_mode, int csrow_nr, int dimm) +{ + int size = 0;
edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); - edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); - edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); + edac_dbg(1, " Primary AddrMask: 0x%x\n", addr_mask);
/* Register [31:1] = Address [39:9]. Size is in kBs here. */ - size = (addr_mask_deinterleaved >> 2) + 1; + size = calculate_cs_size(addr_mask, cs_mode); + + if (addr_mask_sec) { + edac_dbg(1, " Secondary AddrMask: 0x%x\n", addr_mask); + size += calculate_cs_size(addr_mask_sec, cs_mode); + }
/* Return size in MBs. */ return size >> 10; @@ -1270,7 +1283,7 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { int cs_mask_nr = csrow_nr; - u32 addr_mask_orig; + u32 addr_mask = 0, addr_mask_sec = 0; int dimm, size = 0;
/* No Chip Selects are enabled. */ @@ -1308,13 +1321,13 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, if (!pvt->flags.zn_regs_v2) cs_mask_nr >>= 1;
- /* Asymmetric dual-rank DIMM support. */ - if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) - addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr]; - else - addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr]; + if (cs_mode & CS_EVEN_PRIMARY || cs_mode & CS_ODD_PRIMARY) + addr_mask = pvt->csels[umc].csmasks[cs_mask_nr]; + + if (cs_mode & CS_EVEN_SECONDARY || cs_mode & CS_ODD_SECONDARY) + addr_mask_sec = pvt->csels[umc].csmasks_sec[cs_mask_nr];
- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, dimm); + return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, dimm); }
static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) @@ -3512,9 +3525,10 @@ static void gpu_get_err_info(struct mce *m, struct err_info *err) static int gpu_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { - u32 addr_mask_orig = pvt->csels[umc].csmasks[csrow_nr]; + u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr]; + u32 addr_mask_sec = pvt->csels[umc].csmasks_sec[csrow_nr];
- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, csrow_nr >> 1); + return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, csrow_nr >> 1); }
static void gpu_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
base-commit: f1861fb575b028e35e6233295441d535f2e3f240
On Thu, Mar 27, 2025 at 09:03:50PM +0000, Avadhut Naik wrote:
Each Chip-Select (CS) of a Unified Memory Controller (UMC) on AMD EPYC SOCs has an Address Mask and a Secondary Address Mask register associated with it. The amd64_edac module logs DIMM sizes on a per-UMC per-CS granularity during init using these two registers.
Currently, the module primarily considers only the Address Mask register for computing DIMM sizes. The Secondary Address Mask register is only considered for odd CS. Additionally, if it has been considered, the Address Mask register is ignored altogether for that CS. For power-of-two DIMMs, this is not an issue since only the Address Mask register is used.
For non-power-of-two DIMMs, however, the Secondary Address Mask register is used in conjunction with the Address Mask register. However, since the module only considers either of the two registers for a CS, the size computed by the module is incorrect. The Secondary Address Mask register is not considered for even CS, and the Address Mask register is not considered for odd CS.
Missing an imperative statement for the major change.
"Include Secondary Address Mask register in calculation..." or similar.
Furthermore, also rename some variables for greater clarity.
Fixes: 81f5090db843 ("EDAC/amd64: Support asymmetric dual-rank DIMMs") Signed-off-by: Avadhut Naik avadhut.naik@amd.com Cc: stable@vger.kernel.org
JFYI, the TIP maintainer's guide recommends *not* actually sending the patch to the stable list. Though I recall some stable maintainers are fine with, or prefer, this.
drivers/edac/amd64_edac.c | 56 ++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 21 deletions(-)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 90f0eb7cc5b9..16117fda727f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1209,7 +1209,9 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) if (csrow_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_PRIMARY;
- /* Asymmetric dual-rank DIMM support. */
- if (csrow_sec_enabled(2 * dimm, ctrl, pvt))
cs_mode |= CS_EVEN_SECONDARY;
- if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_SECONDARY;
@@ -1230,12 +1232,10 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; } -static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode,
int csrow_nr, int dimm)
+static int calculate_cs_size(u32 mask, unsigned int cs_mode) {
- u32 msb, weight, num_zero_bits;
- u32 addr_mask_deinterleaved;
- int size = 0;
- int msb, weight, num_zero_bits;
- u32 deinterleaved_mask = 0;
Don't need to initialize if it is set before first use below.
It doesn't hurt, but we might get patches to change this. I forget the exact reason; maybe saving an instruction here and there adds up throughout the kernel.
/* * The number of zero bits in the mask is equal to the number of bits @@ -1248,19 +1248,32 @@ static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, * without swapping with the most significant bit. This can be handled * by keeping the MSB where it is and ignoring the single zero bit. */
- msb = fls(addr_mask_orig) - 1;
- weight = hweight_long(addr_mask_orig);
- msb = fls(mask) - 1;
- weight = hweight_long(mask); num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE);
/* Take the number of zero bits off from the top of the mask. */
- addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1);
- deinterleaved_mask = GENMASK(msb - num_zero_bits, 1);
This change makes sense to me. But it would be good to mention it in the commit message. This is more than just renaming variables for clarity.
- edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", deinterleaved_mask);
- return (deinterleaved_mask >> 2) + 1;
Also, 'introducing a new helper function' should be highlighted in the commit message. It doesn't need to be a long description.
+}
+static int __addr_mask_to_cs_size(u32 addr_mask, u32 addr_mask_sec,
unsigned int cs_mode, int csrow_nr, int dimm)
+{
- int size = 0;
You don't need to initialize this since it is immediately set below.
Or you can just call the function here.
edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm);
- edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig);
- edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved);
- edac_dbg(1, " Primary AddrMask: 0x%x\n", addr_mask);
/* Register [31:1] = Address [39:9]. Size is in kBs here. */
- size = (addr_mask_deinterleaved >> 2) + 1;
- size = calculate_cs_size(addr_mask, cs_mode);
- if (addr_mask_sec) {
I think we can skip this check.
For debug messages, it doesn't hurt to be more explicit. So printing a 'mask: 0x0' message is more helpful/reassuring than 'no message'.
edac_dbg(1, " Secondary AddrMask: 0x%x\n", addr_mask);
addr_mask -> addr_mask_sec
size += calculate_cs_size(addr_mask_sec, cs_mode);
Maybe add a "!mask" check to return early if you want to save some cycles in this helper function.
- }
/* Return size in MBs. */ return size >> 10; @@ -1270,7 +1283,7 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { int cs_mask_nr = csrow_nr;
- u32 addr_mask_orig;
- u32 addr_mask = 0, addr_mask_sec = 0; int dimm, size = 0;
/* No Chip Selects are enabled. */ @@ -1308,13 +1321,13 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, if (!pvt->flags.zn_regs_v2) cs_mask_nr >>= 1;
- /* Asymmetric dual-rank DIMM support. */
- if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY))
addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr];
- else
addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr];
- if (cs_mode & CS_EVEN_PRIMARY || cs_mode & CS_ODD_PRIMARY)
Another common way to do this kind of check:
if (cs_mode & (CS_EVEN_PRIMARY | CS_ODD_PRIMARY))
addr_mask = pvt->csels[umc].csmasks[cs_mask_nr];
- if (cs_mode & CS_EVEN_SECONDARY || cs_mode & CS_ODD_SECONDARY)
addr_mask_sec = pvt->csels[umc].csmasks_sec[cs_mask_nr];
- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, dimm);
- return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, dimm);
} static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) @@ -3512,9 +3525,10 @@ static void gpu_get_err_info(struct mce *m, struct err_info *err) static int gpu_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) {
- u32 addr_mask_orig = pvt->csels[umc].csmasks[csrow_nr];
- u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr];
- u32 addr_mask_sec = pvt->csels[umc].csmasks_sec[csrow_nr];
Please align on the '='.
- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, csrow_nr >> 1);
- return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, csrow_nr >> 1);
} static void gpu_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
base-commit: f1861fb575b028e35e6233295441d535f2e3f240
Thanks, Yazen
Hi,
On 4/4/2025 19:58, Yazen Ghannam wrote:
On Thu, Mar 27, 2025 at 09:03:50PM +0000, Avadhut Naik wrote:
Each Chip-Select (CS) of a Unified Memory Controller (UMC) on AMD EPYC SOCs has an Address Mask and a Secondary Address Mask register associated with it. The amd64_edac module logs DIMM sizes on a per-UMC per-CS granularity during init using these two registers.
Currently, the module primarily considers only the Address Mask register for computing DIMM sizes. The Secondary Address Mask register is only considered for odd CS. Additionally, if it has been considered, the Address Mask register is ignored altogether for that CS. For power-of-two DIMMs, this is not an issue since only the Address Mask register is used.
For non-power-of-two DIMMs, however, the Secondary Address Mask register is used in conjunction with the Address Mask register. However, since the module only considers either of the two registers for a CS, the size computed by the module is incorrect. The Secondary Address Mask register is not considered for even CS, and the Address Mask register is not considered for odd CS.
Missing an imperative statement for the major change.
"Include Secondary Address Mask register in calculation..." or similar.
Will add a statement.
Furthermore, also rename some variables for greater clarity.
Fixes: 81f5090db843 ("EDAC/amd64: Support asymmetric dual-rank DIMMs") Signed-off-by: Avadhut Naik avadhut.naik@amd.com Cc: stable@vger.kernel.org
JFYI, the TIP maintainer's guide recommends *not* actually sending the patch to the stable list. Though I recall some stable maintainers are fine with, or prefer, this.
Thanks for the info!
drivers/edac/amd64_edac.c | 56 ++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 21 deletions(-)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 90f0eb7cc5b9..16117fda727f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1209,7 +1209,9 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) if (csrow_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_PRIMARY;
- /* Asymmetric dual-rank DIMM support. */
- if (csrow_sec_enabled(2 * dimm, ctrl, pvt))
cs_mode |= CS_EVEN_SECONDARY;
- if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_SECONDARY;
@@ -1230,12 +1232,10 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; } -static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode,
int csrow_nr, int dimm)
+static int calculate_cs_size(u32 mask, unsigned int cs_mode) {
- u32 msb, weight, num_zero_bits;
- u32 addr_mask_deinterleaved;
- int size = 0;
- int msb, weight, num_zero_bits;
- u32 deinterleaved_mask = 0;
Don't need to initialize if it is set before first use below.
It doesn't hurt, but we might get patches to change this. I forget the exact reason; maybe saving an instruction here and there adds up throughout the kernel.
Will remove this.
/* * The number of zero bits in the mask is equal to the number of bits @@ -1248,19 +1248,32 @@ static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode, * without swapping with the most significant bit. This can be handled * by keeping the MSB where it is and ignoring the single zero bit. */
- msb = fls(addr_mask_orig) - 1;
- weight = hweight_long(addr_mask_orig);
- msb = fls(mask) - 1;
- weight = hweight_long(mask); num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE);
/* Take the number of zero bits off from the top of the mask. */
- addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1);
- deinterleaved_mask = GENMASK(msb - num_zero_bits, 1);
This change makes sense to me. But it would be good to mention it in the commit message. This is more than just renaming variables for clarity.
Noted. Will mention this in the commit message.
- edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", deinterleaved_mask);
- return (deinterleaved_mask >> 2) + 1;
Also, 'introducing a new helper function' should be highlighted in the commit message. It doesn't need to be a long description.
Okay. Will mention this in the commit message.
+}
+static int __addr_mask_to_cs_size(u32 addr_mask, u32 addr_mask_sec,
unsigned int cs_mode, int csrow_nr, int dimm)
+{
- int size = 0;
You don't need to initialize this since it is immediately set below.
Or you can just call the function here.
Will remove the initialization. Calling the function here might make the debug logs confusing.
edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm);
- edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig);
- edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved);
- edac_dbg(1, " Primary AddrMask: 0x%x\n", addr_mask);
/* Register [31:1] = Address [39:9]. Size is in kBs here. */
- size = (addr_mask_deinterleaved >> 2) + 1;
- size = calculate_cs_size(addr_mask, cs_mode);
- if (addr_mask_sec) {
I think we can skip this check.
Commented below on this.
For debug messages, it doesn't hurt to be more explicit. So printing a 'mask: 0x0' message is more helpful/reassuring than 'no message'.
edac_dbg(1, " Secondary AddrMask: 0x%x\n", addr_mask);
addr_mask -> addr_mask_sec
size += calculate_cs_size(addr_mask_sec, cs_mode);
Maybe add a "!mask" check to return early if you want to save some cycles in this helper function.
In a way, this is the reason why I had added the above condition check. To avoid unnecessary function calls.
AFAIK, power-of-2 DIMMs are predominantly used, so the Secondary Address Mask register will seldom be used.
Would you agree?
- }
/* Return size in MBs. */ return size >> 10; @@ -1270,7 +1283,7 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { int cs_mask_nr = csrow_nr;
- u32 addr_mask_orig;
- u32 addr_mask = 0, addr_mask_sec = 0; int dimm, size = 0;
/* No Chip Selects are enabled. */ @@ -1308,13 +1321,13 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, if (!pvt->flags.zn_regs_v2) cs_mask_nr >>= 1;
- /* Asymmetric dual-rank DIMM support. */
- if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY))
addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr];
- else
addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr];
- if (cs_mode & CS_EVEN_PRIMARY || cs_mode & CS_ODD_PRIMARY)
Another common way to do this kind of check:
if (cs_mode & (CS_EVEN_PRIMARY | CS_ODD_PRIMARY))
Will use this!
addr_mask = pvt->csels[umc].csmasks[cs_mask_nr];
- if (cs_mode & CS_EVEN_SECONDARY || cs_mode & CS_ODD_SECONDARY)
addr_mask_sec = pvt->csels[umc].csmasks_sec[cs_mask_nr];
- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, dimm);
- return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, dimm);
} static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) @@ -3512,9 +3525,10 @@ static void gpu_get_err_info(struct mce *m, struct err_info *err) static int gpu_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) {
- u32 addr_mask_orig = pvt->csels[umc].csmasks[csrow_nr];
- u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr];
- u32 addr_mask_sec = pvt->csels[umc].csmasks_sec[csrow_nr];
Please align on the '='.
Will do.
- return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, csrow_nr >> 1);
- return __addr_mask_to_cs_size(addr_mask, addr_mask_sec, cs_mode, csrow_nr, csrow_nr >> 1);
} static void gpu_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
base-commit: f1861fb575b028e35e6233295441d535f2e3f240
Thanks, Yazen
linux-stable-mirror@lists.linaro.org