|
7 | 7 | ## Example: |
8 | 8 |
|
9 | 9 |  |
| 10 | +<video src="https://github.com/user-attachments/assets/380dee0d-47c4-4e01-8ff0-e69e62cccd7c" alt="output" width="1024"></video> |
10 | 11 |
|
11 | 12 |
|
12 | 13 | ## RefCOCO Segmentation Evaluation Results: |
@@ -56,6 +57,86 @@ pred_mask = model.seg(seg_img, seg_prompt, tokenizer, force_seg=False) |
56 | 57 |
|
57 | 58 | ``` |
58 | 59 |
|
| 60 | +To run segmentation on a video, refer to the sample below:
| 61 | +```python |
| 62 | +from transformers import AutoModel, AutoTokenizer |
| 63 | +from PIL import Image |
| 64 | +import torch |
| 65 | +from torchvision import transforms |
| 66 | +import subprocess |
| 67 | +import os |
| 68 | + |
| 69 | +# video path |
| 70 | +video_path = "updownfunk.mp4" |
| 71 | +input_dir = "frames" |
| 72 | +output_dir = "mask_frames" |
| 73 | +os.makedirs(input_dir, exist_ok=True) |
| 74 | +os.makedirs(output_dir, exist_ok=True) |
| 75 | + |
| 76 | +# assert you have ffmpeg installed, mp4 -> jpg |
| 77 | +cmd = [ |
| 78 | + "ffmpeg", |
| 79 | + "-i", video_path, |
| 80 | + "-vf", "fps=30", # 30FPS |
| 81 | + "-qscale:v", "1", |
| 82 | + os.path.join(input_dir, "frame_%04d.jpg") |
| 83 | +] |
| 84 | +subprocess.run(cmd) |
| 85 | + |
| 86 | +# model path |
| 87 | + |
| 88 | +model_path = "/DeepGlint-AI/MLCD-Seg/" # or use your local path |
| 89 | +mlcd_seg = AutoModel.from_pretrained( |
| 90 | + model_path, |
| 91 | + torch_dtype=torch.float16, |
| 92 | + trust_remote_code=True |
| 93 | +).cuda() |
| 94 | +tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) |
| 95 | + |
| 96 | + |
| 97 | +# read jpgs |
| 98 | +image_files = sorted([f for f in os.listdir(input_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]) |
| 99 | + |
| 100 | +for idx, filename in enumerate(image_files, start=1): |
| 101 | + |
| 102 | + src_path = os.path.join(input_dir, filename) |
| 103 | + seg_img = Image.open(src_path).convert('RGB') |
| 104 | + |
| 105 | + seg_prompt = "This <video> depicts a group of people dancing.\nCould you provide a segmentation mask for the man in pink suit?" |
| 106 | + pred_mask = mlcd_seg.predict_forward(seg_img, seg_prompt, tokenizer, force_seg=True) |
| 107 | + |
| 108 | + # Mask visualization |
| 109 | + pred_mask = pred_mask.squeeze(0).cpu() |
| 110 | + pred_mask = (pred_mask > 0.5).float() |
| 111 | + img_tensor = transforms.ToTensor()(seg_img) |
| 112 | + alpha = 0.2 # 20% transparency |
| 113 | + red_mask = torch.tensor([0.0, 1.0, 0.0]).view(3, 1, 1).to(img_tensor.device) # green mask |
| 114 | + black_bg = torch.zeros_like(img_tensor) # black background |
| 115 | + masked_area = red_mask * alpha + img_tensor * (1 - alpha) |
| 116 | + background = black_bg * alpha + img_tensor * (1 - alpha) |
| 117 | + combined = torch.where(pred_mask.unsqueeze(0).bool(), masked_area, background) |
| 118 | + combined = combined.cpu() # [3, H, W], CPU |
| 119 | + |
| 120 | + # Save masked jpgs |
| 121 | + new_name = f"{idx:04d}{os.path.splitext(filename)[1]}" |
| 122 | + dst_path = os.path.join(output_dir, new_name) |
| 123 | + transforms.ToPILImage()(combined.clamp(0, 1)).save(dst_path) |
| 124 | + |
| 125 | +cmd = [ |
| 126 | + "ffmpeg", |
| 127 | + "-y", |
| 128 | + "-framerate", str(30), # fps |
| 129 | + "-i", os.path.join(output_dir, "%04d.jpg"), |
| 130 | + "-c:v", "libx264", |
| 131 | + "-crf", str(23), |
| 132 | + "-pix_fmt", "yuv420p", |
| 133 | + "-vf", "fps=" + str(23), |
| 134 | + "updownfunk_mask.mp4" # output video |
| 135 | +] |
| 136 | +# jpgs -> mp4 |
| 137 | +subprocess.run(cmd, check=True) |
| 138 | +``` |
| 139 | + |
59 | 140 | To evaluate on a benchmark dataset (e.g., RefCOCO), use the following method:
60 | 141 | ```python |
61 | 142 | from transformers import AutoModel, AutoTokenizer |
|
0 commit comments