Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8668147
migrate to transformers v5
omkar-334 Feb 27, 2026
cae33e1
Merge branch 'develop' into transv5
omkar-334 Feb 27, 2026
81e2bc6
Apply suggestions from code review
Borda Feb 28, 2026
7a49c7c
Merge branch 'develop' into transv5
omkar-334 Mar 3, 2026
69d7f13
Merge branch 'develop' into transv5
Borda Mar 3, 2026
c33a949
fix: add explicit typing for output feature/index alignment helper
Borda Mar 3, 2026
2b6de9d
Merge branch 'develop' into transv5
omkar-334 Mar 3, 2026
699b373
fix: apply PR #760 review suggestions for transformers v5 migration
Borda Mar 3, 2026
4e0144c
Merge branch 'transv5' of https://github.com/omkar-334/rf-detr into t…
Borda Mar 3, 2026
7fed088
fix: correct TestSdpaFallbackWithOutputAttentions to match actual beh…
Borda Mar 3, 2026
f623e3e
fix: add missing Apache 2.0 copyright notices for copied transformers…
Borda Mar 3, 2026
0c49b15
docs: add maintenance note to copied transformers functions
Borda Mar 3, 2026
9789c91
Merge remote-tracking branch 'origin/develop' into resolve/pr-760-tmp
Borda Mar 16, 2026
3001214
style: reformat multiline comments and warning logs for readability
Borda Mar 16, 2026
4f0cbaf
Update AGENTS.md
Borda Mar 16, 2026
e25716d
chore: bump `transformers` dependency to `>=5.0.0` and update DINOv2 …
Borda Mar 16, 2026
94816dd
Merge branch 'transv5' of https://github.com/omkar-334/rf-detr into t…
Borda Mar 16, 2026
4879a89
refactor: prefix private utility functions with `_` for consistency a…
Borda Mar 16, 2026
aa7b713
refactor: update references to utility functions with `_` prefix and …
Borda Mar 16, 2026
b32464b
fix(pre-commit): 🎨 auto format pre-commit hooks
pre-commit-ci[bot] Mar 16, 2026
54b4e58
refactor: update `transformers` import path and simplify `_init_trans…
Borda Mar 16, 2026
b535c5c
fix(pre-commit): 🎨 auto format pre-commit hooks
pre-commit-ci[bot] Mar 16, 2026
f7681cf
feat: add `set_attn_implementation` method and extend support for att…
Borda Mar 16, 2026
e972a81
Merge branch 'transv5' of https://github.com/omkar-334/rf-detr into t…
Borda Mar 16, 2026
4ab0f0e
refactor: parameterize and consolidate test cases for head pruning, a…
Borda Mar 16, 2026
5b6fe40
fix(pre-commit): 🎨 auto format pre-commit hooks
pre-commit-ci[bot] Mar 16, 2026
d493ff3
docs: add example usage for `set_attn_implementation` in docstring
Borda Mar 16, 2026
2ce630a
Merge branch 'transv5' of https://github.com/omkar-334/rf-detr into t…
Borda Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies = [
"torchvision>=0.14.0",
"scipy",
"tqdm",
"transformers>4.0.0, <5.0.0",
"transformers>=5.0.0",
"peft",
"rf100vl",
"pydantic",
Expand Down
103 changes: 53 additions & 50 deletions src/rfdetr/models/backbone/dinov2_with_windowed_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.backbone_utils import BackboneConfigMixin, BackboneMixin
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import (
BackboneOutput,
Expand All @@ -23,22 +24,61 @@
ImageClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from transformers.pytorch_utils import prune_linear_layer
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
torch_int,
)
from transformers.utils.backbone_utils import (
BackboneConfigMixin,
BackboneMixin,
get_aligned_output_features_output_indices,
)

logger = logging.get_logger(__name__)


def find_pruneable_heads_and_indices(
heads: set, n_heads: int, head_size: int, already_pruned_heads: set
) -> tuple[set, torch.LongTensor]:
mask = torch.ones(n_heads, head_size)
heads = set(heads) - already_pruned_heads
for head in heads:
head -= sum(1 if h < head else 0 for h in already_pruned_heads)
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
return heads, index


def _align_output_features_output_indices(
out_features: Optional[List[str]],
out_indices: Optional[Union[List[int], Tuple[int]]],
stage_names: List[str],
):
if out_indices is None and out_features is None:
out_indices = [len(stage_names) - 1]
out_features = [stage_names[-1]]
elif out_indices is None and out_features is not None:
out_indices = [stage_names.index(layer) for layer in out_features]
elif out_features is None and out_indices is not None:
out_features = [stage_names[idx] for idx in out_indices]
return out_features, out_indices


def get_aligned_output_features_output_indices(
    out_features: Optional[List[str]],
    out_indices: Optional[Union[List[int], Tuple[int]]],
    stage_names: List[str],
) -> Tuple[List[str], List[int]]:
    """Normalize and align the requested backbone output features and indices.

    Coerces ``out_indices`` to a plain list (tuples are accepted) and delegates
    to :func:`_align_output_features_output_indices` to derive whichever of the
    two is missing.

    Args:
        out_features: Stage names to output, or ``None``.
        out_indices: Stage indices to output (list or tuple), or ``None``.
        stage_names: Ordered names of all backbone stages.

    Returns:
        The aligned ``(out_features, out_indices)`` pair as lists.
    """
    normalized_indices = None if out_indices is None else list(out_indices)
    return _align_output_features_output_indices(
        out_features=out_features, out_indices=normalized_indices, stage_names=stage_names
    )


# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base"

# General docstring
_CONFIG_FOR_DOC = "WindowedDinov2WithRegistersConfig"

Expand Down Expand Up @@ -372,7 +412,7 @@ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
return x.permute(0, 2, 1, 3)

def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
self, hidden_states, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)

Expand All @@ -392,10 +432,6 @@ def forward(
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = torch.matmul(attention_probs, value_layer)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
Expand All @@ -413,17 +449,15 @@ def __init__(self, config: WindowedDinov2WithRegistersConfig) -> None:
self.attention_probs_dropout_prob = config.attention_probs_dropout_prob

def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
self, hidden_states, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Dinov2WithRegistersModel is using Dinov2WithRegistersSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions
)
return super().forward(hidden_states=hidden_states, output_attentions=output_attentions)

mixed_query_layer = self.query(hidden_states)

Expand All @@ -435,7 +469,7 @@ def forward(
query_layer,
key_layer,
value_layer,
head_mask,
None,
self.attention_probs_dropout_prob if self.training else 0.0,
is_causal=False,
scale=None,
Expand Down Expand Up @@ -494,10 +528,9 @@ def prune_heads(self, heads: Set[int]) -> None:
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
self_outputs = self.attention(hidden_states, output_attentions)

attention_output = self.output(self_outputs[0], hidden_states)

Expand Down Expand Up @@ -622,11 +655,9 @@ def __init__(self, config: WindowedDinov2WithRegistersConfig) -> None:
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
run_full_attention: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
assert head_mask is None, "head_mask is not supported for windowed attention"
assert not output_attentions, "output_attentions is not supported for windowed attention"
shortcut = hidden_states
if run_full_attention:
Expand All @@ -637,7 +668,6 @@ def forward(

self_attention_outputs = self.attention(
self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
Expand Down Expand Up @@ -678,7 +708,6 @@ def __init__(self, config: WindowedDinov2WithRegistersConfig) -> None:
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
Expand All @@ -696,18 +725,15 @@ def forward(

run_full_attention = i not in self.config.window_block_indexes

layer_head_mask = head_mask[i] if head_mask is not None else None

if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
run_full_attention,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, run_full_attention)
layer_outputs = layer_module(hidden_states, output_attentions, run_full_attention)

hidden_states = layer_outputs[0]

Expand Down Expand Up @@ -787,12 +813,6 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
pre-training.

head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.

output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
Expand Down Expand Up @@ -838,7 +858,6 @@ def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
Expand Down Expand Up @@ -876,18 +895,10 @@ def forward(
if pixel_values is None:
raise ValueError("You have to specify pixel_values")

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
Expand All @@ -914,12 +925,6 @@ def forward(
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`BitImageProcessor.preprocess`] for details.

head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.

output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
Expand Down Expand Up @@ -961,7 +966,6 @@ def __init__(self, config: WindowedDinov2WithRegistersConfig) -> None:
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
Expand Down Expand Up @@ -1001,7 +1005,6 @@ def forward(

outputs = self.dinov2_with_registers(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
Expand Down Expand Up @@ -1062,7 +1065,7 @@ def forward(
class WindowedDinov2WithRegistersBackbone(WindowedDinov2WithRegistersPreTrainedModel, BackboneMixin):
def __init__(self, config: WindowedDinov2WithRegistersConfig):
super().__init__(config)
super()._init_backbone(config)
self._init_transformers_backbone()
self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
self.embeddings = WindowedDinov2WithRegistersEmbeddings(config)
self.encoder = WindowedDinov2WithRegistersEncoder(config)
Expand Down