Skip to content

Commit d4cc066

Browse files
authored
Add video mask demo. (#149)
1 parent 80a6096 commit d4cc066

File tree

1 file changed

+81
-0
lines changed

1 file changed

+81
-0
lines changed

mlcd_vl/downstream/README.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
## Example:
88

99
![output](https://github.com/user-attachments/assets/85c023a1-3e0c-4ea5-a764-1eb9ee0fbddf)
10+
<video src="https://github.com/user-attachments/assets/380dee0d-47c4-4e01-8ff0-e69e62cccd7c" alt="output" width="1024"></video>
1011

1112

1213
## RefCOCO Segmentation Evaluation Results:
@@ -56,6 +57,86 @@ pred_mask = model.seg(seg_img, seg_prompt, tokenizer, force_seg=False)
5657

5758
```
5859

60+
If you want to run this code on a video, please refer to the sample below:
61+
```python
62+
from transformers import AutoModel, AutoTokenizer
63+
from PIL import Image
64+
import torch
65+
from torchvision import transforms
66+
import subprocess
67+
import os
68+
69+
# Input video and working directories.
video_path = "updownfunk.mp4"
input_dir = "frames"        # frames extracted from the video
output_dir = "mask_frames"  # frames with the mask overlay drawn on them
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# One frame rate for both extraction and re-encoding, so the output video
# contains every processed frame and keeps the timing of the extraction step.
fps = 30

# Requires ffmpeg on PATH. mp4 -> jpg
cmd = [
    "ffmpeg",
    "-y",                  # overwrite frames left over from a previous run
    "-i", video_path,
    "-vf", f"fps={fps}",
    "-qscale:v", "1",
    os.path.join(input_dir, "frame_%04d.jpg"),
]
# check=True: stop here if extraction fails instead of continuing without frames.
subprocess.run(cmd, check=True)

# Model path: a Hugging Face Hub repo id (no leading/trailing slash),
# or the path of a local checkpoint directory.
model_path = "DeepGlint-AI/MLCD-Seg"
mlcd_seg = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Read the extracted frames; zero-padded names sort in frame order.
image_files = sorted(
    f for f in os.listdir(input_dir) if f.endswith(('.jpg', '.png', '.jpeg'))
)
if not image_files:
    raise RuntimeError(f"No frames found in {input_dir!r}")

# Loop invariants: the prompt, the tensor/PIL converters and the overlay settings.
seg_prompt = "This <video> depicts a group of people dancing.\nCould you provide a segmentation mask for the man in pink suit?"
to_tensor = transforms.ToTensor()
to_pil = transforms.ToPILImage()
alpha = 0.2  # overlay strength: 20% overlay colour, 80% original image
green = torch.tensor([0.0, 1.0, 0.0]).view(3, 1, 1)  # green mask colour

for idx, filename in enumerate(image_files, start=1):
    src_path = os.path.join(input_dir, filename)
    seg_img = Image.open(src_path).convert('RGB')

    pred_mask = mlcd_seg.predict_forward(seg_img, seg_prompt, tokenizer, force_seg=True)

    # Mask visualization
    # assumes predict_forward returns a [1, H, W] mask matching the frame size — TODO confirm
    pred_mask = pred_mask.squeeze(0).cpu() > 0.5
    img_tensor = to_tensor(seg_img)
    masked_area = green * alpha + img_tensor * (1 - alpha)
    # Blending with black only darkens the pixels, which makes the mask stand out.
    background = img_tensor * (1 - alpha)
    combined = torch.where(pred_mask.unsqueeze(0), masked_area, background)

    # Save masked frames. Always write "NNNN.jpg" so the files form the
    # numbered sequence the ffmpeg "%04d.jpg" input pattern below expects.
    dst_path = os.path.join(output_dir, f"{idx:04d}.jpg")
    to_pil(combined.clamp(0, 1)).save(dst_path)

# jpgs -> mp4
cmd = [
    "ffmpeg",
    "-y",
    "-framerate", str(fps),  # input frame rate of the image sequence
    "-i", os.path.join(output_dir, "%04d.jpg"),
    "-c:v", "libx264",
    "-crf", str(23),         # x264 quality setting (lower means higher quality)
    "-pix_fmt", "yuv420p",
    "-vf", f"fps={fps}",     # output frame rate matches the input; no frames dropped
    "updownfunk_mask.mp4",   # output video
]
subprocess.run(cmd, check=True)
138+
```
139+
59140
If you want to use this code to evaluate on a benchmark dataset (e.g. RefCOCO), then you need to use the following method:
60141
```python
61142
from transformers import AutoModel, AutoTokenizer

0 commit comments

Comments
 (0)