# SAM-HQ

## Overview

SAM-HQ (High-Quality Segment Anything Model) was proposed in [Segment Anything in High Quality](https://arxiv.org/pdf/2306.01567.pdf) by Lei Ke, Mingqiao Ye, Martin Danelljan, Yifan Liu, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu.

The model is an enhancement to the original SAM model that produces significantly higher quality segmentation masks while maintaining SAM's original promptable design, efficiency, and zero-shot generalizability.

SAM-HQ introduces several key improvements over the original SAM model:

1. High-Quality Output Token: A learnable token injected into SAM's mask decoder for higher quality mask prediction
2. Global-local Feature Fusion: Combines features from different stages of the model for improved mask details
3. Training Data: Uses a carefully curated dataset of 44K high-quality masks instead of SA-1B
4. Efficiency: Adds only 0.5% additional parameters while significantly improving mask quality (see the parameter-count sketch below)
5. Zero-shot Capability: Maintains SAM's strong zero-shot performance while improving accuracy

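The efficiency claim in point 4 can be sanity-checked directly from the released checkpoints. The snippet below is a minimal sketch, assuming `facebook/sam-vit-base` is the SAM counterpart of the ViT-B SAM-HQ checkpoint used on this page; it simply counts parameters and does not reproduce the measurement protocol from the paper.

```python
from transformers import SamModel, SamHQModel

# Load the ViT-B variants of both models (the SAM baseline checkpoint is an assumption, see above)
sam = SamModel.from_pretrained("facebook/sam-vit-base")
sam_hq = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b")

def count_params(model):
    return sum(p.numel() for p in model.parameters())

sam_params, sam_hq_params = count_params(sam), count_params(sam_hq)
print(f"SAM:    {sam_params:,} parameters")
print(f"SAM-HQ: {sam_hq_params:,} parameters")
print(f"Additional parameters: {100 * (sam_hq_params - sam_params) / sam_params:.2f}%")
```
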
The abstract from the paper is the following:

*The recent Segment Anything Model (SAM) represents a big leap in scaling up segmentation models, allowing for powerful zero-shot capabilities and flexible prompting. Despite being trained with 1.1 billion masks, SAM's mask prediction quality falls short in many cases, particularly when dealing with objects that have intricate structures. We propose HQ-SAM, equipping SAM with the ability to accurately segment any object, while maintaining SAM's original promptable design, efficiency, and zero-shot generalizability. Our careful design reuses and preserves the pre-trained model weights of SAM, while only introducing minimal additional parameters and computation. We design a learnable High-Quality Output Token, which is injected into SAM's mask decoder and is responsible for predicting the high-quality mask. Instead of only applying it on mask-decoder features, we first fuse them with early and final ViT features for improved mask details. To train our introduced learnable parameters, we compose a dataset of 44K fine-grained masks from several sources. HQ-SAM is only trained on the introduced dataset of 44k masks, which takes only 4 hours on 8 GPUs.*

Tips:

- SAM-HQ produces higher quality masks than the original SAM model, particularly for objects with intricate structures and fine details
- The model predicts binary masks with more accurate boundaries and better handling of thin structures
- Like SAM, the model performs better when prompted with input 2D points and/or input bounding boxes
- You can prompt multiple points for the same image and predict a single high-quality mask (see the multi-point sketch after the first example below)
- The model maintains SAM's zero-shot generalization capabilities
- SAM-HQ only adds ~0.5% additional parameters compared to SAM
- Fine-tuning the model is not supported yet

This model was contributed by [sushmanth](https://huggingface.co/sushmanth).
The original code can be found [here](https://github.com/SysCV/SAM-HQ).

Below is an example of how to run mask generation given an image and a 2D point:

```python
import torch
from PIL import Image
import requests
from transformers import SamHQModel, SamHQProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")

img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
input_points = [[[450, 600]]]  # 2D location of a window in the image

inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# Post-process the predicted masks back to the original image resolution
masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```
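
As noted in the tips, several points belonging to the same object can be passed together to guide a single mask prediction. The sketch below reuses `model`, `processor`, `raw_image` and `device` from the example above; the second coordinate is an arbitrary illustrative point, not a value taken from the original documentation.

```python
# Two foreground points prompting the same object: all points in one inner list
# contribute to a single mask prediction
input_points = [[[450, 600], [500, 650]]]  # second point chosen arbitrarily for illustration

inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```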

You can also process your own masks alongside the input images in the processor, so that they can be passed to the model:

```python
import torch
from PIL import Image
import requests
from transformers import SamHQModel, SamHQProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")

img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1")  # binary mask
input_points = [[[450, 600]]]  # 2D location of a window in the image

inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```
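
In both snippets above, `post_process_masks` returns one tensor per input image and `iou_scores` holds the model's own quality estimate for each candidate mask. Assuming the outputs follow the same layout as the original SAM model (batch, point batch, candidate masks), the highest-scoring mask can be selected as sketched below.

```python
# masks is a list with one tensor per image: (point_batch_size, num_candidates, height, width)
# scores has shape (batch_size, point_batch_size, num_candidates)
best_idx = scores[0, 0].argmax().item()
best_mask = masks[0][0, best_idx]  # boolean mask at the original image resolution
print(f"Best candidate: {best_idx} (IoU score {scores[0, 0, best_idx].item():.3f})")
```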

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM-HQ:

- Demo notebook for using the model (coming soon)
- Paper implementation and code: [SAM-HQ GitHub Repository](https://github.com/SysCV/SAM-HQ)

## SamHQConfig

[[autodoc]] SamHQConfig

## SamHQVisionConfig

[[autodoc]] SamHQVisionConfig

## SamHQMaskDecoderConfig

[[autodoc]] SamHQMaskDecoderConfig

## SamHQPromptEncoderConfig

[[autodoc]] SamHQPromptEncoderConfig

## SamHQProcessor

[[autodoc]] SamHQProcessor

## SamHQVisionModel

[[autodoc]] SamHQVisionModel

## SamHQModel

[[autodoc]] SamHQModel
    - forward