1 parent 5c076fb commit bffeefe
src/transformers/integrations/flex_attention.py
@@ -66,7 +66,7 @@ def __init__(self, training):
         # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs"
         # see https://github.com/pytorch/pytorch/issues/146260 for training
         self.training = training
-        if _torch_version == "2.6.0" and training:
+        if _torch_version.split("+")[0] == "2.6.0" and training:
             self._compiled_flex_attention = torch.compile(
                 flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
             )
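
The patched check matters because PyTorch builds frequently carry a local version suffix (for example "2.6.0+cu124" for a CUDA wheel or "2.6.0+cpu" for a CPU-only build), which a strict string comparison against "2.6.0" would silently miss. A minimal sketch of the normalization, reading the version directly from torch.__version__ for illustration (in transformers, _torch_version is resolved by the library's import utilities):

import torch

# PyTorch wheels often report a local version, e.g. "2.6.0+cu124",
# so strict equality against "2.6.0" misses suffixed builds.
_torch_version = torch.__version__

# Dropping the local suffix after "+" yields the base release number,
# so the workaround also triggers for suffixed 2.6.0 builds.
base_version = _torch_version.split("+")[0]

if base_version == "2.6.0":
    print("torch 2.6.0 detected; compile with mode='max-autotune-no-cudagraphs'")

An alternative would be packaging.version.parse(_torch_version).base_version, which understands local version identifiers directly; the string split achieves the same result here without an extra dependency.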