Skip to content

Commit b451639

Browse files
PCIe AER printk ratelimiting backport (#520)
Signed-off-by: James Sewart <jamessewart@arista.com>
1 parent c3a089c commit b451639

13 files changed

+1284
-0
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
From 642a9f17e1bce39ecc639ed1e6f959edcd039aef Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:15 -0500
4+
Subject: [PATCH 01/12] PCI/AER: Simplify pci_print_aer()
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
Simplify pci_print_aer() by initializing the struct aer_err_info "info"
10+
with a designated initializer list (it was previously initialized with
11+
memset()) and using pci_name() instead of dev_name(&dev->dev).
12+
13+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
14+
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
15+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
16+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
17+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
18+
Link: https://patch.msgid.link/20250522232339.1525671-10-helgaas@kernel.org
19+
(cherry picked from commit ad9839137cf9fb0f0c2d531bd04bc4382e6f2de9)
20+
---
21+
drivers/pci/pcie/aer.c | 18 +++++++++---------
22+
1 file changed, 9 insertions(+), 9 deletions(-)
23+
24+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
25+
index 13b8586924ea..7d7fc4a9fec2 100644
26+
--- a/drivers/pci/pcie/aer.c
27+
+++ b/drivers/pci/pcie/aer.c
28+
@@ -730,7 +730,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
29+
if (info->id && info->error_dev_num > 1 && info->id == id)
30+
pci_err(dev, " Error of this Agent is reported first\n");
31+
32+
- trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
33+
+ trace_aer_event(pci_name(dev), (info->status & ~info->mask),
34+
info->severity, info->tlp_header_valid, &info->tlp);
35+
}
36+
37+
@@ -766,7 +766,10 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
38+
{
39+
int layer, agent, tlp_header_valid = 0;
40+
u32 status, mask;
41+
- struct aer_err_info info;
42+
+ struct aer_err_info info = {
43+
+ .severity = aer_severity,
44+
+ .first_error = PCI_ERR_CAP_FEP(aer->cap_control),
45+
+ };
46+
47+
if (aer_severity == AER_CORRECTABLE) {
48+
status = aer->cor_status;
49+
@@ -777,14 +780,11 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
50+
tlp_header_valid = status & AER_LOG_TLP_MASKS;
51+
}
52+
53+
- layer = AER_GET_LAYER_ERROR(aer_severity, status);
54+
- agent = AER_GET_AGENT(aer_severity, status);
55+
-
56+
- memset(&info, 0, sizeof(info));
57+
- info.severity = aer_severity;
58+
info.status = status;
59+
info.mask = mask;
60+
- info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
61+
+
62+
+ layer = AER_GET_LAYER_ERROR(aer_severity, status);
63+
+ agent = AER_GET_AGENT(aer_severity, status);
64+
65+
pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
66+
__aer_print_error(dev, &info);
67+
@@ -798,7 +798,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
68+
if (tlp_header_valid)
69+
__print_tlp_header(dev, &aer->header_log);
70+
71+
- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
72+
+ trace_aer_event(pci_name(dev), (status & ~mask),
73+
aer_severity, tlp_header_valid, &aer->header_log);
74+
}
75+
EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
76+
--
77+
2.47.0
78+
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
From dfcd0e7d5008dffb55517a8207d455cab69dc70c Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:16 -0500
4+
Subject: [PATCH 02/12] PCI/AER: Update statistics before ratelimiting
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
There are two AER logging entry points:
10+
11+
- aer_print_error() is used by DPC (dpc_process_error()) and native AER
12+
handling (aer_process_err_devices()).
13+
14+
- pci_print_aer() is used by GHES (aer_recover_work_func()) and CXL
15+
(cxl_handle_rdport_errors()).
16+
17+
Both use __aer_print_error() to print the AER error bits. Previously
18+
__aer_print_error() also incremented the AER statistics via
19+
pci_dev_aer_stats_incr().
20+
21+
Call pci_dev_aer_stats_incr() early in the entry points instead of in
22+
__aer_print_error() so we update the statistics even if the actual printing
23+
of error bits is rate limited by a future change.
24+
25+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
26+
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
27+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
28+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
29+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
30+
Link: https://patch.msgid.link/20250522232339.1525671-11-helgaas@kernel.org
31+
(cherry picked from commit 88a7765e62b9e4c79c7ca2c7b749ae04f54a5668)
32+
---
33+
drivers/pci/pcie/aer.c | 5 ++++-
34+
1 file changed, 4 insertions(+), 1 deletion(-)
35+
36+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
37+
index 7d7fc4a9fec2..48a33151d145 100644
38+
--- a/drivers/pci/pcie/aer.c
39+
+++ b/drivers/pci/pcie/aer.c
40+
@@ -694,7 +694,6 @@ static void __aer_print_error(struct pci_dev *dev,
41+
pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
42+
info->first_error == i ? " (First)" : "");
43+
}
44+
- pci_dev_aer_stats_incr(dev, info);
45+
}
46+
47+
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
48+
@@ -703,6 +702,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
49+
int id = pci_dev_id(dev);
50+
const char *level;
51+
52+
+ pci_dev_aer_stats_incr(dev, info);
53+
+
54+
if (!info->status) {
55+
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
56+
aer_error_severity_string[info->severity]);
57+
@@ -783,6 +784,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
58+
info.status = status;
59+
info.mask = mask;
60+
61+
+ pci_dev_aer_stats_incr(dev, &info);
62+
+
63+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
64+
agent = AER_GET_AGENT(aer_severity, status);
65+
66+
--
67+
2.47.0
68+
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
From b6cbde42bc2406d734511a7b28499f9fda4fd91c Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:17 -0500
4+
Subject: [PATCH 03/12] PCI/AER: Trace error event before ratelimiting
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
As with the AER statistics, we always want to emit trace events, even if
10+
the actual dmesg logging is rate limited.
11+
12+
Call trace_aer_event() immediately after pci_dev_aer_stats_incr() so both
13+
happen before ratelimiting.
14+
15+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
16+
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
17+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
18+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
19+
Link: https://patch.msgid.link/20250522232339.1525671-12-helgaas@kernel.org
20+
(cherry picked from commit 6bb4befbd65fa7f99688fb707e376637e5acfe36)
21+
---
22+
drivers/pci/pcie/aer.c | 10 ++++------
23+
1 file changed, 4 insertions(+), 6 deletions(-)
24+
25+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
26+
index 48a33151d145..6fc993d05647 100644
27+
--- a/drivers/pci/pcie/aer.c
28+
+++ b/drivers/pci/pcie/aer.c
29+
@@ -703,6 +703,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
30+
const char *level;
31+
32+
pci_dev_aer_stats_incr(dev, info);
33+
+ trace_aer_event(pci_name(dev), (info->status & ~info->mask),
34+
+ info->severity, info->tlp_header_valid, &info->tlp);
35+
36+
if (!info->status) {
37+
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
38+
@@ -730,9 +732,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
39+
out:
40+
if (info->id && info->error_dev_num > 1 && info->id == id)
41+
pci_err(dev, " Error of this Agent is reported first\n");
42+
-
43+
- trace_aer_event(pci_name(dev), (info->status & ~info->mask),
44+
- info->severity, info->tlp_header_valid, &info->tlp);
45+
}
46+
47+
static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
48+
@@ -785,6 +784,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
49+
info.mask = mask;
50+
51+
pci_dev_aer_stats_incr(dev, &info);
52+
+ trace_aer_event(pci_name(dev), (status & ~mask),
53+
+ aer_severity, tlp_header_valid, &aer->header_log);
54+
55+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
56+
agent = AER_GET_AGENT(aer_severity, status);
57+
@@ -800,9 +801,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
58+
59+
if (tlp_header_valid)
60+
__print_tlp_header(dev, &aer->header_log);
61+
-
62+
- trace_aer_event(pci_name(dev), (status & ~mask),
63+
- aer_severity, tlp_header_valid, &aer->header_log);
64+
}
65+
EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
66+
67+
--
68+
2.47.0
69+
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
From 824b8af8cc8d81ef95d020dcfcd4338249f1f7de Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:23 -0500
4+
Subject: [PATCH 04/12] PCI/AER: Simplify add_error_device()
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
Return -ENOSPC error early so the usual path through add_error_device() is
10+
the straight-line code.
11+
12+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
13+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
14+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
15+
Link: https://patch.msgid.link/20250522232339.1525671-18-helgaas@kernel.org
16+
(cherry picked from commit d72bae423004aa7b4d94c34a7fd0b48b64305a08)
17+
---
18+
drivers/pci/pcie/aer.c | 15 +++++++++------
19+
1 file changed, 9 insertions(+), 6 deletions(-)
20+
21+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
22+
index 6fc993d05647..a10a005ca6e9 100644
23+
--- a/drivers/pci/pcie/aer.c
24+
+++ b/drivers/pci/pcie/aer.c
25+
@@ -811,12 +811,15 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
26+
*/
27+
static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
28+
{
29+
- if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
30+
- e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
31+
- e_info->error_dev_num++;
32+
- return 0;
33+
- }
34+
- return -ENOSPC;
35+
+ int i = e_info->error_dev_num;
36+
+
37+
+ if (i >= AER_MAX_MULTI_ERR_DEVICES)
38+
+ return -ENOSPC;
39+
+
40+
+ e_info->dev[i] = pci_dev_get(dev);
41+
+ e_info->error_dev_num++;
42+
+
43+
+ return 0;
44+
}
45+
46+
/**
47+
--
48+
2.47.0
49+
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
From 4355136d6587aa7ce7729aa8f35f898993c5f8a4 Mon Sep 17 00:00:00 2001
2+
From: Karolina Stolarek <karolina.stolarek@oracle.com>
3+
Date: Thu, 22 May 2025 18:21:18 -0500
4+
Subject: [PATCH 05/12] PCI/AER: Check log level once and remember it
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
When reporting an AER error, we check its type multiple times to determine
10+
the log level for each message. Do this check only in the top-level
11+
functions (aer_isr_one_error(), pci_print_aer()) and save the level in
12+
struct aer_err_info.
13+
14+
[bhelgaas: save log level in struct aer_err_info instead of passing it
15+
as a parameter]
16+
17+
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
18+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
19+
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
20+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
21+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
22+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
23+
Link: https://patch.msgid.link/20250522232339.1525671-13-helgaas@kernel.org
24+
(cherry picked from commit c8f6791e33a7757025285db26f3b382cdcb7f7cd)
25+
---
26+
drivers/pci/pci.h | 1 +
27+
drivers/pci/pcie/aer.c | 18 +++++++++---------
28+
drivers/pci/pcie/dpc.c | 1 +
29+
3 files changed, 11 insertions(+), 9 deletions(-)
30+
31+
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
32+
index 65df6d2ac003..ce78241b6302 100644
33+
--- a/drivers/pci/pci.h
34+
+++ b/drivers/pci/pci.h
35+
@@ -505,6 +505,7 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev)
36+
struct aer_err_info {
37+
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
38+
int error_dev_num;
39+
+ const char *level; /* printk level */
40+
41+
unsigned int id:16;
42+
43+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
44+
index a10a005ca6e9..82acb7580fd4 100644
45+
--- a/drivers/pci/pcie/aer.c
46+
+++ b/drivers/pci/pcie/aer.c
47+
@@ -675,16 +675,14 @@ static void __aer_print_error(struct pci_dev *dev,
48+
{
49+
const char **strings;
50+
unsigned long status = info->status & ~info->mask;
51+
- const char *level, *errmsg;
52+
+ const char *level = info->level;
53+
+ const char *errmsg;
54+
int i;
55+
56+
- if (info->severity == AER_CORRECTABLE) {
57+
+ if (info->severity == AER_CORRECTABLE)
58+
strings = aer_correctable_error_string;
59+
- level = KERN_WARNING;
60+
- } else {
61+
+ else
62+
strings = aer_uncorrectable_error_string;
63+
- level = KERN_ERR;
64+
- }
65+
66+
for_each_set_bit(i, &status, 32) {
67+
errmsg = strings[i];
68+
@@ -700,7 +698,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
69+
{
70+
int layer, agent;
71+
int id = pci_dev_id(dev);
72+
- const char *level;
73+
+ const char *level = info->level;
74+
75+
pci_dev_aer_stats_incr(dev, info);
76+
trace_aer_event(pci_name(dev), (info->status & ~info->mask),
77+
@@ -715,8 +713,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
78+
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
79+
agent = AER_GET_AGENT(info->severity, info->status);
80+
81+
- level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
82+
-
83+
pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
84+
aer_error_severity_string[info->severity],
85+
aer_error_layer[layer], aer_agent_string[agent]);
86+
@@ -774,9 +770,11 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
87+
if (aer_severity == AER_CORRECTABLE) {
88+
status = aer->cor_status;
89+
mask = aer->cor_mask;
90+
+ info.level = KERN_WARNING;
91+
} else {
92+
status = aer->uncor_status;
93+
mask = aer->uncor_mask;
94+
+ info.level = KERN_ERR;
95+
tlp_header_valid = status & AER_LOG_TLP_MASKS;
96+
}
97+
98+
@@ -1291,6 +1289,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
99+
if (e_src->status & PCI_ERR_ROOT_COR_RCV) {
100+
e_info.id = ERR_COR_ID(e_src->id);
101+
e_info.severity = AER_CORRECTABLE;
102+
+ e_info.level = KERN_WARNING;
103+
104+
if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV)
105+
e_info.multi_error_valid = 1;
106+
@@ -1304,6 +1303,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
107+
108+
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
109+
e_info.id = ERR_UNCOR_ID(e_src->id);
110+
+ e_info.level = KERN_ERR;
111+
112+
if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
113+
e_info.severity = AER_FATAL;
114+
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
115+
index cdc54315d879..d830696dccec 100644
116+
--- a/drivers/pci/pcie/dpc.c
117+
+++ b/drivers/pci/pcie/dpc.c
118+
@@ -254,6 +254,7 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
119+
else
120+
info->severity = AER_NONFATAL;
121+
122+
+ info->level = KERN_ERR;
123+
return 1;
124+
}
125+
126+
--
127+
2.47.0
128+

0 commit comments

Comments
 (0)