Skip to content

Commit 21becdf

Browse files
davejiang authored and gregkh committed
cxl: Add post-reset warning if reset results in loss of previously committed HDM decoders
[ Upstream commit 934edcd ]

Secondary Bus Reset (SBR) is equivalent to a device being hot removed and inserted again. Doing an SBR on a CXL type 3 device is problematic if the exported device memory is part of system memory that cannot be offlined. The event is equivalent to violently ripping out that range of memory from the kernel. While the hardware requires the "Unmask SBR" bit to be set in the Port Control Extensions register and the kernel currently does not unmask it, a user can unmask this bit via setpci or a similar tool.

The driver does not have a way to detect whether a reset coming from the PCI subsystem is a Function Level Reset (FLR) or an SBR. The only way to detect an SBR is to note whether a decoder is marked as enabled in software while the decoder control register indicates it is not committed.

Add a helper function to find a discrepancy between the decoder software state and the hardware register state.

Suggested-by: Dan Williams <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Dave Jiang <[email protected]>
Signed-off-by: Bjorn Helgaas <[email protected]>
Reviewed-by: Jonathan Cameron <[email protected]>
Reviewed-by: Dan Williams <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
1 parent 2b42e95 commit 21becdf

File tree

3 files changed

+53
-0
lines changed

3 files changed

+53
-0
lines changed

drivers/cxl/core/pci.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,3 +1045,32 @@ long cxl_pci_get_latency(struct pci_dev *pdev)
10451045

10461046
return cxl_flit_size(pdev) * MEGA / bw;
10471047
}
1048+
1049+
static int __cxl_endpoint_decoder_reset_detected(struct device *dev, void *data)
1050+
{
1051+
struct cxl_port *port = data;
1052+
struct cxl_decoder *cxld;
1053+
struct cxl_hdm *cxlhdm;
1054+
void __iomem *hdm;
1055+
u32 ctrl;
1056+
1057+
if (!is_endpoint_decoder(dev))
1058+
return 0;
1059+
1060+
cxld = to_cxl_decoder(dev);
1061+
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
1062+
return 0;
1063+
1064+
cxlhdm = dev_get_drvdata(&port->dev);
1065+
hdm = cxlhdm->regs.hdm_decoder;
1066+
ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id));
1067+
1068+
return !FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl);
1069+
}
1070+
1071+
bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port)
1072+
{
1073+
return device_for_each_child(&port->dev, port,
1074+
__cxl_endpoint_decoder_reset_detected);
1075+
}
1076+
EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, CXL);

drivers/cxl/cxl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,8 @@ void cxl_coordinates_combine(struct access_coordinate *out,
891891
struct access_coordinate *c1,
892892
struct access_coordinate *c2);
893893

894+
/*
 * Check the endpoint decoders under @port for one that software has flagged
 * as enabled while its hardware control register no longer shows it as
 * committed (used after a PCI reset to spot lost HDM decoder state).
 */
bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
895+
894896
/*
895897
* Unit test builds overrides this to __weak, find the 'strong' version
896898
* of these symbols in tools/testing/cxl/.

drivers/cxl/pci.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,11 +957,33 @@ static void cxl_error_resume(struct pci_dev *pdev)
957957
dev->driver ? "successful" : "failed");
958958
}
959959

960+
static void cxl_reset_done(struct pci_dev *pdev)
961+
{
962+
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
963+
struct cxl_memdev *cxlmd = cxlds->cxlmd;
964+
struct device *dev = &pdev->dev;
965+
966+
/*
967+
* FLR does not expect to touch the HDM decoders and related
968+
* registers. SBR, however, will wipe all device configurations.
969+
* Issue a warning if there was an active decoder before the reset
970+
* that no longer exists.
971+
*/
972+
guard(device)(&cxlmd->dev);
973+
if (cxlmd->endpoint &&
974+
cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
975+
dev_crit(dev, "SBR happened without memory regions removal.\n");
976+
dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
977+
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
978+
}
979+
}
980+
960981
static const struct pci_error_handlers cxl_error_handlers = {
961982
.error_detected = cxl_error_detected,
962983
.slot_reset = cxl_slot_reset,
963984
.resume = cxl_error_resume,
964985
.cor_error_detected = cxl_cor_error_detected,
986+
.reset_done = cxl_reset_done,
965987
};
966988

967989
static struct pci_driver cxl_pci_driver = {

0 commit comments

Comments
 (0)