Skip to content

Commit 1629086

Browse files
committed
Fix user-pinned DMA for Thunderbolt-connected devices
The kernel marks Thunderbolt-connected devices as "untrusted", forcing DMA through SWIOTLB bounce buffering. This breaks user-pinned DMA: the SWIOTLB path allocates IOVAs per scatter-gather entry rather than as a contiguous range, so PIN_PAGES fails the contiguity check. Even if it didn't fail there, bounce buffering is fundamentally incompatible with user-pinned memory since the device would DMA to bounce buffers rather than the user's pages. Clear the untrusted flag during probe. With IOMMU active, the device is already restricted to explicitly-mapped addresses and cannot access arbitrary system memory. Also add fix-tt-hotplug-bars, a helper script to work around BAR allocation failures on Thunderbolt hotplug. Requires "pci=realloc" on the kernel command line.
1 parent 8cee7d0 commit 1629086

File tree

2 files changed

+126
-0
lines changed

2 files changed

+126
-0
lines changed

enumerate.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,21 @@ static int tenstorrent_pci_probe(struct pci_dev *dev, const struct pci_device_id
278278
dma_set_max_seg_size(&dev->dev, UINT_MAX);
279279
dma_set_seg_boundary(&dev->dev, ULONG_MAX);
280280

281+
// Thunderbolt-connected devices are marked "untrusted" by the kernel,
282+
// which forces SWIOTLB bounce buffering even with IOMMU enabled. The
283+
// IOMMU itself provides DMA isolation; clear the flag only when IOMMU
284+
// is active and the untrust is due to an external-facing port.
285+
// pci_dev.external_facing was added in v5.9.
286+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
287+
if (dev->untrusted && is_iommu_translated(&dev->dev)) {
288+
struct pci_dev *root = pcie_find_root_port(dev);
289+
if (root && root->external_facing) {
290+
dev_info(&dev->dev, "Clearing untrusted flag (external-facing port, IOMMU active)\n");
291+
dev->untrusted = 0;
292+
}
293+
}
294+
#endif
295+
281296
pci_set_master(dev);
282297
pci_enable_pcie_error_reporting(dev);
283298

tools/fix-tt-hotplug-bars

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/bin/bash
2+
# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
3+
# SPDX-License-Identifier: GPL-2.0-only
4+
#
5+
# Fix BAR allocation for Thunderbolt-connected Tenstorrent devices.
6+
#
7+
# When a Tenstorrent device is connected via Thunderbolt, the kernel may fail
8+
# to allocate sufficient memory for the device's BARs. This script fixes that
9+
# by removing the Thunderbolt bridge and rescanning, which allows the kernel's
10+
# pci=realloc logic to properly size the bridge windows.
11+
#
12+
# PREREQUISITE: Boot with "pci=realloc" on the kernel command line.
13+
# 1. Edit /etc/default/grub
14+
# 2. Add "pci=realloc" to GRUB_CMDLINE_LINUX
15+
# 3. Run: sudo update-grub && sudo reboot
16+
#
17+
# Usage: fix-tt-hotplug-bars [--dry-run]
18+
19+
# Strict mode: exit on an unhandled command failure (-e), treat use of an
# unset variable as an error (-u), and fail a pipeline if any stage fails
# (pipefail).
set -euo pipefail
20+
21+
# PCI vendor ID string; main compares it against each device's sysfs
# "vendor" file to select Tenstorrent devices.
TENSTORRENT_VENDOR="0x1e52"
22+
# Set to 1 by main when the first argument is --dry-run; when 1, the bridge
# remove/rescan step is only logged instead of being performed.
DRY_RUN=0
23+
24+
# Report a fatal error: write the message, prefixed with the script name,
# to stderr and terminate the script with exit status 1.
# Arguments: $* - message text
die() {
    printf '%s\n' "fix-tt-hotplug-bars: $*" >&2
    exit 1
}
25+
# Write a progress or warning message, prefixed with the script name, to
# stderr so that stdout stays free for function results.
# Arguments: $* - message text
log() {
    printf '%s\n' "fix-tt-hotplug-bars: $*" >&2
}
26+
27+
# Check if device has unassigned BARs.
#
# Arguments: $1 - full PCI address as named under /sys/bus/pci/devices
#                 (domain:bus:dev.fn, e.g. 0000:03:00.0)
# Returns:   0 if the verbose lspci listing contains "<unassigned>",
#            non-zero otherwise (including when lspci itself fails)
has_unassigned_bars() {
    local details

    # Pass the address with its domain: lspci treats an omitted domain as
    # "any", which could match an unrelated device on multi-domain systems.
    # Capturing the output (rather than piping into grep -q) also keeps an
    # early grep exit from surfacing as a pipefail failure.
    details=$(lspci -s "$1" -v 2>/dev/null) || return 1

    [[ "$details" == *'<unassigned>'* ]]
}
31+
32+
# Walk up the PCI tree to find the Thunderbolt Host bridge.
# "Host" = in your computer, "Hub" = in external docks.
# We want the first Host bridge - the downstream port connecting to the dock.
#
# Arguments: $1 - full PCI address of the starting device, as named under
#                 /sys/bus/pci/devices (e.g. 0000:03:00.0)
# Outputs:   PCI address of the first matching ancestor on stdout
# Returns:   0 if a matching bridge was found, 1 otherwise
find_thunderbolt_bridge() {
    local sysfs="/sys/bus/pci/devices"
    local current="$1"
    local real_path parent description

    while [[ -d "$sysfs/$current" ]]; do
        # The sysfs entry is a symlink; the directory containing its target
        # is the parent device in the PCI hierarchy.
        real_path=$(readlink -f "$sysfs/$current")
        parent=$(basename "$(dirname "$real_path")")

        # Stop at root
        if [[ "$parent" == pci* || "$parent" == "devices" ]]; then
            break
        fi

        # Look for Thunderbolt "Host" bridge (not "Hub" which is in docks).
        # Only parents with vendor ID 0x8086 are considered.
        if [[ -f "$sysfs/$parent/vendor" ]] \
            && [[ "$(cat "$sysfs/$parent/vendor")" == "0x8086" ]]; then
            # Query with the full domain-qualified address: lspci treats an
            # omitted domain as "any", which could match another device.
            # Capture first so grep -q cannot trip pipefail by exiting early.
            description=$(lspci -s "$parent" 2>/dev/null) || description=""
            if grep -qiE "thunderbolt.*host|usb4.*host" <<<"$description"; then
                echo "$parent"
                return 0
            fi
        fi

        current="$parent"
    done

    return 1
}
63+
64+
# Entry point: find Tenstorrent devices with unassigned BARs and, for each
# one that sits behind a Thunderbolt Host bridge, remove that bridge and
# rescan the PCI bus so the bridge windows can be re-sized.
#
# Globals:   TENSTORRENT_VENDOR (read), DRY_RUN (written)
# Arguments: $1 - optional "--dry-run": log the planned remove/rescan only
# Outputs:   progress and warnings on stderr (via log)
# Returns:   0 when every handled device ended up with assigned BARs or
#            there was nothing to fix; 1 if a device still has unassigned
#            BARs after the rescan. Exits via die when not run as root.
main() {
    if [[ "${1:-}" == "--dry-run" ]]; then
        DRY_RUN=1
    fi
    [[ $EUID -eq 0 ]] || die "must be run as root"

    local fixed=0 failed=0
    local dev_path dev bridge

    # Initial bus rescan before looking for devices.
    # NOTE: this is issued even with --dry-run.
    echo 1 > /sys/bus/pci/rescan

    for dev_path in /sys/bus/pci/devices/*; do
        [[ -f "$dev_path/vendor" ]] || continue
        [[ "$(cat "$dev_path/vendor")" == "$TENSTORRENT_VENDOR" ]] || continue

        dev=${dev_path##*/}

        has_unassigned_bars "$dev" || continue

        log "device $dev has unassigned BARs"

        if ! bridge=$(find_thunderbolt_bridge "$dev"); then
            log "warning: $dev not behind a Thunderbolt bridge, skipping"
            continue
        fi

        log "removing Thunderbolt bridge $bridge and rescanning..."

        if [[ $DRY_RUN -eq 1 ]]; then
            log "[dry-run] would remove $bridge and rescan"
        else
            echo 1 > "/sys/bus/pci/devices/$bridge/remove"
            sleep 2
            echo 1 > /sys/bus/pci/rescan
            sleep 3

            if has_unassigned_bars "$dev"; then
                log "warning: $dev still has unassigned BARs after rescan"
                failed=$((failed + 1))
                continue
            fi
        fi

        fixed=$((fixed + 1))
    done

    # Use if/else rather than trailing "[[ ... ]] && log": a false test as
    # the last command would make main (and the script) exit non-zero even
    # when there was simply nothing to fix.
    if (( fixed > 0 )); then
        if [[ $DRY_RUN -eq 1 ]]; then
            log "[dry-run] would fix $fixed device(s)"
        else
            log "fixed $fixed device(s)"
        fi
    elif (( failed == 0 )); then
        log "no Tenstorrent devices with BAR issues found"
    fi

    # Report failure only when a device is still broken after the rescan.
    if (( failed > 0 )); then
        return 1
    fi
    return 0
}
110+
111+
# Run the entry point, forwarding all command-line arguments unchanged.
main "$@"

0 commit comments

Comments
 (0)