Skip to content

Commit 3c8315a

Browse files
authored
Merge pull request #1595 from merkelmarrow/1-build-dir-nfs-robustness-pr
Harden build dir deletion against transient NFS failures [CI 1/9]
2 parents e7026f5 + aebc9f5 commit 3c8315a

5 files changed

Lines changed: 154 additions & 19 deletions

File tree

src/finn/transformation/fpgadataflow/cleanup.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929

3030
import os
3131
import qonnx.custom_op.registry as registry
32-
import shutil
3332
from qonnx.transformation.base import Transformation
3433

34+
from finn.util.basic import robust_rmtree
3535
from finn.util.fpgadataflow import is_fpgadataflow_node
3636

3737

@@ -45,12 +45,12 @@ def apply(self, model):
4545
# delete PYNQ project, if any
4646
vivado_pynq_proj_dir = model.get_metadata_prop("vivado_pynq_proj")
4747
if vivado_pynq_proj_dir is not None and os.path.isdir(vivado_pynq_proj_dir):
48-
shutil.rmtree(vivado_pynq_proj_dir)
48+
robust_rmtree(vivado_pynq_proj_dir)
4949
model.set_metadata_prop("vivado_pynq_proj", "")
5050
# delete IP stitching project, if any
5151
ipstitch_path = model.get_metadata_prop("vivado_stitch_proj")
5252
if ipstitch_path is not None and os.path.isdir(ipstitch_path):
53-
shutil.rmtree(ipstitch_path)
53+
robust_rmtree(ipstitch_path)
5454
model.set_metadata_prop("vivado_stitch_proj", "")
5555
for node in model.graph.node:
5656
op_type = node.op_type
@@ -61,22 +61,22 @@ def apply(self, model):
6161
# delete code_gen_dir from cppsim
6262
code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim")
6363
if os.path.isdir(code_gen_dir):
64-
shutil.rmtree(code_gen_dir)
64+
robust_rmtree(code_gen_dir)
6565
inst.set_nodeattr("code_gen_dir_cppsim", "")
6666
inst.set_nodeattr("executable_path", "")
6767
# delete code_gen_dir from ipgen and project folder
6868
code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
6969
ipgen_path = inst.get_nodeattr("ipgen_path")
7070
if os.path.isdir(code_gen_dir):
71-
shutil.rmtree(code_gen_dir)
71+
robust_rmtree(code_gen_dir)
7272
if os.path.isdir(ipgen_path):
73-
shutil.rmtree(ipgen_path)
73+
robust_rmtree(ipgen_path)
7474
inst.set_nodeattr("code_gen_dir_ipgen", "")
7575
inst.set_nodeattr("ipgen_path", "")
7676
# delete Java HotSpot Performance data log
7777
for d_name in os.listdir("/tmp/"):
7878
if "hsperfdata" in d_name:
79-
shutil.rmtree("/tmp/" + str(d_name))
79+
robust_rmtree("/tmp/" + str(d_name))
8080

8181
except KeyError:
8282
# exception if op_type is not supported

src/finn/util/basic.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,14 @@
2626
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2727
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828

29+
import errno
2930
import os
3031
import re
32+
import shutil
3133
import subprocess
3234
import sys
3335
import tempfile
36+
import time
3437
from qonnx.core.modelwrapper import ModelWrapper
3538
from qonnx.custom_op.registry import getCustomOp
3639
from qonnx.util.basic import gen_finn_dt_tensor, roundup_to_integer_multiple
@@ -159,16 +162,38 @@ def make_build_dir(prefix=""):
159162
Use this function instead of tempfile.mkdtemp to ensure any generated files
160163
will survive on the host after the FINN Docker container exits."""
161164
try:
162-
tmpdir = tempfile.mkdtemp(prefix=prefix)
163-
newdir = tmpdir.replace("/tmp", os.environ["FINN_BUILD_DIR"])
164-
os.makedirs(newdir)
165-
return newdir
165+
build_dir = os.environ["FINN_BUILD_DIR"]
166166
except KeyError:
167167
raise Exception(
168168
"""Environment variable FINN_BUILD_DIR must be set
169-
correctly. Please ensure you have launched the Docker contaier correctly.
169+
correctly. Please ensure you have launched the Docker container correctly.
170170
"""
171171
)
172+
os.makedirs(build_dir, exist_ok=True)
173+
new_dir = tempfile.mkdtemp(prefix=prefix, dir=build_dir)
174+
os.chmod(new_dir, 0o755)
175+
return new_dir
176+
177+
178+
def robust_rmtree(path, retries=6, initial_delay=0.1, backoff=2.0):
    """Remove a directory tree with retries for transient NFS cleanup races.

    ``shutil.rmtree`` is attempted up to ``retries`` times. A failure whose
    errno is ``ENOTEMPTY`` or ``EBUSY`` is treated as transient and retried
    after a sleep; any other ``OSError`` propagates immediately.

    Arguments:
    * path: directory to delete. A falsy value (``None``, ``""``) or a path
      that does not exist is a no-op.
    * retries: maximum number of ``shutil.rmtree`` attempts. Values below 1
      are clamped to 1 so that an existing tree is always attempted once.
    * initial_delay: seconds to sleep after the first transient failure.
    * backoff: factor the delay is multiplied by after every sleep.

    Raises ``OSError`` if a non-transient error occurs, or if the last
    attempt still fails with a transient error.
    """
    if not path or not os.path.exists(path):
        return
    # Always try at least once: with retries <= 0 the loop below would not
    # run at all and the tree would silently be left in place.
    attempts = max(1, retries)
    delay = initial_delay
    for attempt in range(attempts):
        try:
            shutil.rmtree(path)
            return
        except FileNotFoundError:
            # Nothing (left) to delete at the reported path; treat as done.
            return
        except OSError as exc:
            transient = exc.errno in (errno.ENOTEMPTY, errno.EBUSY)
            if not transient or attempt == attempts - 1:
                raise
            # Exponential backoff before the next attempt.
            time.sleep(delay)
            delay *= backoff
172197

173198

174199
class CppBuilder:

tests/fpgadataflow/test_fifosizing.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
import pytest
3131

3232
import json
33-
import shutil
3433
import torch
3534
from brevitas.export import export_qonnx
3635
from onnx import TensorProto, helper
@@ -46,7 +45,7 @@
4645
import finn.builder.build_dataflow_config as build_cfg
4746
from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
4847
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
49-
from finn.util.basic import make_build_dir
48+
from finn.util.basic import make_build_dir, robust_rmtree
5049
from finn.util.test import get_trained_network_and_ishape
5150

5251

@@ -111,8 +110,8 @@ def test_fifosizing_linear(method, topology):
111110
node1_inst = getCustomOp(node1)
112111
assert node0_inst.get_nodeattr("depth") == node1_inst.get_nodeattr("depth")
113112

114-
shutil.rmtree(tmp_output_dir)
115-
shutil.rmtree(tmp_output_dir_cmp)
113+
robust_rmtree(tmp_output_dir)
114+
robust_rmtree(tmp_output_dir_cmp)
116115

117116

118117
@pytest.mark.slow

tests/fpgadataflow/test_split_large_fifos.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
import pytest
3131

3232
import json
33-
import shutil
3433
import torch
3534
from brevitas.export import export_qonnx
3635
from qonnx.core.modelwrapper import ModelWrapper
@@ -39,7 +38,7 @@
3938
import finn.builder.build_dataflow as build
4039
import finn.builder.build_dataflow_config as build_cfg
4140
from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs
42-
from finn.util.basic import make_build_dir
41+
from finn.util.basic import make_build_dir, robust_rmtree
4342
from finn.util.test import get_trained_network_and_ishape
4443

4544

@@ -103,7 +102,7 @@ def test_split_large_fifos(depth):
103102
assert fifo_depth == golden_cfg[i % len(golden_cfg)][0]
104103
assert fifo_depth > 1
105104

106-
shutil.rmtree(tmp_output_dir)
105+
robust_rmtree(tmp_output_dir)
107106

108107

109108
def test_split_large_fifo_configs():

tests/util/test_robust_rmtree.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
############################################################################
2+
# Copyright (C) 2026, Advanced Micro Devices, Inc.
3+
# All rights reserved.
4+
#
5+
# SPDX-License-Identifier: BSD-3-Clause
6+
#
7+
############################################################################
8+
9+
import pytest
10+
11+
import errno
12+
import os
13+
import shutil
14+
import time
15+
from pathlib import Path
16+
17+
from finn.util.basic import make_build_dir, robust_rmtree
18+
19+
20+
@pytest.mark.util
def test_robust_rmtree_succeeds_first_attempt():
    """A populated tree is gone after a single, unpatched robust_rmtree call."""
    root = make_build_dir("test_rmtree_")
    nested = Path(root) / "sub"
    os.makedirs(nested)
    (nested / "f").write_text("x")

    robust_rmtree(root)

    assert os.path.exists(root) is False
28+
29+
30+
@pytest.mark.util
def test_robust_rmtree_missing_path_is_noop():
    """Missing, empty and None paths must not raise; the real dir is removed last."""
    root = make_build_dir("test_rmtree_")
    # Order matters: the existing build dir is deleted only after the no-op cases.
    for candidate in (os.path.join(root, "does_not_exist"), "", None, root):
        robust_rmtree(candidate)
37+
38+
39+
@pytest.mark.util
@pytest.mark.parametrize("transient_errno", [errno.ENOTEMPTY, errno.EBUSY])
def test_robust_rmtree_retries_on_transient_then_succeeds(monkeypatch, transient_errno):
    """Two transient failures are retried; the third rmtree call really deletes."""
    root = make_build_dir("test_rmtree_")
    Path(root, "f").write_text("x")

    attempts = []
    original_rmtree = shutil.rmtree

    def fail_twice_then_delete(path, *args, **kwargs):
        # Record every call; only from the third one on delegate to the real rmtree.
        attempts.append(path)
        if len(attempts) >= 3:
            return original_rmtree(path, *args, **kwargs)
        raise OSError(transient_errno, "fake", str(path))

    monkeypatch.setattr(shutil, "rmtree", fail_twice_then_delete)
    # Skip the backoff sleeps so the test stays fast.
    monkeypatch.setattr(time, "sleep", lambda _s: None)

    robust_rmtree(root, retries=5)

    assert len(attempts) == 3
    assert not os.path.exists(root)
61+
62+
63+
@pytest.mark.util
def test_robust_rmtree_propagates_non_transient_oserror(monkeypatch):
    """An EACCES failure is re-raised after exactly one rmtree call."""
    root = make_build_dir("test_rmtree_")
    seen_paths = []

    def deny_access(path, *args, **kwargs):
        seen_paths.append(path)
        raise OSError(errno.EACCES, "fake", str(path))

    monkeypatch.setattr(shutil, "rmtree", deny_access)
    monkeypatch.setattr(time, "sleep", lambda _s: None)

    with pytest.raises(OSError) as excinfo:
        robust_rmtree(root, retries=5)

    assert excinfo.value.errno == errno.EACCES
    # No retry: an errno outside (ENOTEMPTY, EBUSY) propagates on the first attempt.
    assert len(seen_paths) == 1
82+
83+
84+
@pytest.mark.util
def test_robust_rmtree_raises_after_retries_exhausted(monkeypatch):
    """A persistent ENOTEMPTY is raised once all ``retries`` attempts are used."""
    root = make_build_dir("test_rmtree_")
    seen_paths = []

    def never_empty(path, *args, **kwargs):
        seen_paths.append(path)
        raise OSError(errno.ENOTEMPTY, "fake", str(path))

    monkeypatch.setattr(shutil, "rmtree", never_empty)
    monkeypatch.setattr(time, "sleep", lambda _s: None)

    with pytest.raises(OSError) as excinfo:
        robust_rmtree(root, retries=4)

    assert excinfo.value.errno == errno.ENOTEMPTY
    # One rmtree call per allowed attempt.
    assert len(seen_paths) == 4
102+
103+
104+
@pytest.mark.util
def test_robust_rmtree_tolerates_filenotfounderror(monkeypatch):
    """A FileNotFoundError raised by rmtree must not escape robust_rmtree."""
    root = make_build_dir("test_rmtree_")

    def vanish(path, *args, **kwargs):
        raise FileNotFoundError(str(path))

    monkeypatch.setattr(shutil, "rmtree", vanish)

    # Passing means simply returning without an exception.
    robust_rmtree(root)

0 commit comments

Comments
 (0)