Skip to content

Commit 4510060

Browse files
authored
feat: add activity recognition (#435)
* Add activity recognition and Gemini image generation officially to tool list
1 parent 52cf82b commit 4510060

8 files changed

Lines changed: 121 additions & 363 deletions

File tree

tests/integ/test_tools.py

Lines changed: 35 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from PIL import Image
44

55
from vision_agent.tools import (
6-
activity_recognition,
6+
agentic_activity_recognition,
77
agentic_document_extraction,
88
agentic_object_detection,
99
agentic_sam2_instance_segmentation,
@@ -21,7 +21,6 @@
2121
florence2_ocr,
2222
florence2_sam2_instance_segmentation,
2323
florence2_sam2_video_tracking,
24-
flux_image_inpainting,
2524
gemini_image_generation,
2625
generate_pose_image,
2726
ocr,
@@ -263,16 +262,45 @@ def test_qwen2_vl_video_vqa():
263262
assert "cat" in result.strip()
264263

265264

266-
def test_activity_recognition():
265+
def test_agentic_activity_recognition_no_audio():
266+
frames = [
267+
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
268+
]
269+
result = agentic_activity_recognition(
270+
prompt="cat",
271+
frames=frames,
272+
with_audio=False
273+
)
274+
assert len(result) == 1
275+
assert isinstance(result[0]["start_time"], int)
276+
assert isinstance(result[0]["end_time"], int)
277+
assert result[0]["location"] is not None and len(result[0]["location"]) > 0
278+
assert result[0]["description"] is not None and len(result[0]["description"]) > 0
279+
assert result[0]["label"] == 0
280+
281+
282+
def test_agentic_activity_recognition_multiple_activities_low_specificity():
267283
frames = [
268284
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(5)
269285
]
270-
result = activity_recognition(
271-
prompt="Is it there a cat in this video?",
286+
result = agentic_activity_recognition(
287+
prompt="cat; animal",
272288
frames=frames,
273-
model="qwen2vl",
289+
fps=1,
290+
with_audio=False,
291+
specificity="low",
274292
)
275-
assert len(result) == 5
293+
assert len(result) == 2
294+
assert isinstance(result[0]["start_time"], int)
295+
assert isinstance(result[0]["end_time"], int)
296+
assert result[0]["location"] is not None and len(result[0]["location"]) > 0
297+
assert result[0]["description"] is not None and len(result[0]["description"]) > 0
298+
assert result[0]["label"] == 0
299+
assert isinstance(result[1]["start_time"], int)
300+
assert result[1]["end_time"] > 0
301+
assert result[1]["location"] is not None and len(result[0]["location"]) > 0
302+
assert result[1]["description"] is not None and len(result[0]["description"]) > 0
303+
assert result[1]["label"] == 1
276304

277305

278306
def test_ocr():
@@ -385,23 +413,6 @@ def test_countgd_visual_object_detection_empty():
385413
assert result == []
386414

387415

388-
def test_flux_image_inpainting():
389-
mask_image = np.zeros((32, 32), dtype=np.uint8)
390-
mask_image[:4, :4] = 1
391-
image = np.zeros((32, 32), dtype=np.uint8)
392-
393-
result = flux_image_inpainting(
394-
prompt="horse",
395-
image=image,
396-
mask=mask_image,
397-
)
398-
399-
assert result.shape[0] == 32
400-
assert result.shape[1] == 32
401-
assert result.shape[0] == image.shape[0]
402-
assert result.shape[1] == image.shape[1]
403-
404-
405416
def test_gemini_image_inpainting():
406417
image = np.zeros((32, 32), dtype=np.uint8)
407418

@@ -476,38 +487,6 @@ def test_siglip_classification():
476487
assert result["scores"][0] > result["scores"][2]
477488

478489

479-
def test_flux_image_inpainting_resizing_not_multiple_8():
480-
mask_image = np.zeros((37, 37), dtype=np.uint8)
481-
mask_image[:4, :4] = 1
482-
image = np.zeros((37, 37), dtype=np.uint8)
483-
484-
result = flux_image_inpainting(
485-
prompt="horse",
486-
image=image,
487-
mask=mask_image,
488-
)
489-
490-
assert result.shape[0] == 32
491-
assert result.shape[1] == 32
492-
assert result.shape[0] != image.shape[0]
493-
assert result.shape[1] != image.shape[1]
494-
495-
496-
def test_flux_image_inpainting_resizing_big_image():
497-
mask_image = np.zeros((1200, 500), dtype=np.uint8)
498-
mask_image[:100, :100] = 1
499-
image = np.zeros((1200, 500), dtype=np.uint8)
500-
501-
result = flux_image_inpainting(
502-
prompt="horse",
503-
image=image,
504-
mask=mask_image,
505-
)
506-
507-
assert result.shape[0] == 512
508-
assert result.shape[1] == 208
509-
510-
511490
def test_video_tracking_with_countgd():
512491
frames = [
513492
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(5)

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vision_agent/.sim_tools/df.csv

Lines changed: 39 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,29 @@ desc,doc,name
406406
[
407407
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
408408
]",ocr
409+
"'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: Optional[numpy.ndarray] = None) -> numpy.ndarray:
410+
'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
411+
It can be used to edit parts of an image or the entire image according to the prompt given.
412+
413+
Parameters:
414+
prompt (str): A detailed text description guiding what should be generated
415+
in the image. More detailed and specific prompts typically yield
416+
better results.
417+
image (np.ndarray, optional): The source image to be inpainted. The image will serve as
418+
the base context for the inpainting process.
419+
420+
Returns:
421+
np.ndarray: The generated image(s) as a numpy array in RGB format with values
422+
ranging from 0 to 255.
423+
424+
-------
425+
Example:
426+
>>> # Generate inpainting
427+
>>> result = gemini_image_generation(
428+
... prompt=""a modern black leather sofa with white pillows"",
429+
... image=image,
430+
... )
431+
>>> save_image(result, ""inpainted_room.png"")",gemini_image_generation
409432
'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
410433
'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
411434
images including regular images or images of documents or presentations. It can be
@@ -439,27 +462,28 @@ desc,doc,name
439462
-------
440463
>>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
441464
'Lionel Messi'",qwen25_vl_video_vqa
442-
'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen25vl', chunk_length_frames: int = 10) -> List[float]:
443-
'activity_recognition' is a tool that can recognize activities in a video given a
444-
text prompt. It can be used to identify where specific activities or actions
445-
happen in a video and returns a list of 0s and 1s to indicate the activity.
465+
"'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video. It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.","agentic_activity_recognition(prompt: str, frames: List[numpy.ndarray], fps: Optional[float] = 5, specificity: str = 'max', with_audio: bool = False) -> List[Dict[str, Any]]:
466+
'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video.
467+
It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.
446468
447469
Parameters:
448-
prompt (str): The event you want to identify, should be phrased as a question,
449-
for example, ""Did a goal happen?"".
450-
frames (List[np.ndarray]): The reference frames used for the question
451-
model (str): The model to use for the inference. Valid values are
452-
'claude-35', 'gpt-4o', 'qwen2vl'.
453-
chunk_length_frames (int): length of each chunk in frames
470+
prompt (str): The prompt for activity recognition. Multiple activities can be separated by a semicolon.
471+
frames (List[np.ndarray]): The list of frames corresponding to the video.
472+
fps (float, optional): The frame rate (frames per second) at which to extract the frames. Defaults to 5.
473+
specificity (str, optional): Specificity or precision level for activity recognition - low, medium, high, max. Default is max.
474+
with_audio (bool, optional): Whether to include audio processing in activity recognition. Set it to false if there is no audio in the video. Default is false.
454475
455476
Returns:
456-
List[float]: A list of floats with a value of 1.0 if the activity is detected in
457-
the chunk_length_frames of the video.
477+
List[Dict[str, Any]]: A list of dictionaries containing the start time, end time, location, description, and label for each detected activity.
478+
The start and end times are in seconds, the location is a string, the description is a string, and the label is an integer.
458479
459480
Example
460481
-------
461-
>>> activity_recognition('Did a goal happened?', frames)
462-
[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",activity_recognition
482+
>>> agentic_activity_recognition('Person gets on bike; Person gets off bike', frames)
483+
[
484+
{'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
485+
{'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
486+
]",agentic_activity_recognition
463487
'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
464488
'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
465489
depth image from a given RGB image. The returned depth image is monochrome and
@@ -514,59 +538,6 @@ desc,doc,name
514538
-------
515539
>>> vit_nsfw_classification(image)
516540
{""label"": ""normal"", ""scores"": 0.68},",vit_nsfw_classification
517-
"'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
518-
'flux_image_inpainting' performs image inpainting to fill the masked regions,
519-
given by mask, in the image, given image based on the text prompt and surrounding
520-
image context. It can be used to edit regions of an image according to the prompt
521-
given.
522-
523-
Parameters:
524-
prompt (str): A detailed text description guiding what should be generated
525-
in the masked area. More detailed and specific prompts typically yield
526-
better results.
527-
image (np.ndarray): The source image to be inpainted. The image will serve as
528-
the base context for the inpainting process.
529-
mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
530-
areas to be inpainted and 0 indicates areas to be preserved.
531-
532-
Returns:
533-
np.ndarray: The generated image(s) as a numpy array in RGB format with values
534-
ranging from 0 to 255.
535-
536-
-------
537-
Example:
538-
>>> # Generate inpainting
539-
>>> result = flux_image_inpainting(
540-
... prompt=""a modern black leather sofa with white pillows"",
541-
... image=image,
542-
... mask=mask,
543-
... )
544-
>>> save_image(result, ""inpainted_room.png"")
545-
",flux_image_inpainting
546-
"'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
547-
'gemini_image_generation' performs image inpainting given an image and text prompt.
548-
It can be used to edit parts of an image or the entire image according to the prompt given.
549-
550-
Parameters:
551-
prompt (str): A detailed text description guiding what should be generated
552-
in the image. More detailed and specific prompts typically yield
553-
better results.
554-
image (np.ndarray): The source image to be inpainted. The image will serve as
555-
the base context for the inpainting process.
556-
557-
Returns:
558-
np.ndarray: The generated image(s) as a numpy array in RGB format with values
559-
ranging from 0 to 255.
560-
561-
-------
562-
Example:
563-
>>> # Generate inpainting
564-
>>> result = gemini_image_generation(
565-
... prompt="a modern black leather sofa with white pillows",
566-
... image=image,
567-
... )
568-
>>> save_image(result, ""inpainted_room.png"")
569-
",gemini_image_generation
570541
'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
571542
'siglip_classification' is a tool that can classify an image or a cropped detection given a list
572543
of input labels or tags. It returns the same list of the input labels along with
@@ -718,4 +689,4 @@ desc,doc,name
718689
[0, 0, 0, ..., 0, 0, 0],
719690
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
720691
}],
721-
)",overlay_segmentation_masks
692+
)",overlay_segmentation_masks

vision_agent/.sim_tools/embs.npy

0 Bytes
Binary file not shown.

vision_agent/agent/vision_agent_planner_prompts_v2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ def countgd_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_len
519519
- "video object tracking" - tracking objects in a video.
520520
- "depth and pose estimation" - estimating the depth or pose of objects in an image.
521521
- "activity recognition" - identifying time period(s) an event occurs in a video.
522-
- "inpainting" - filling in masked parts of an image.
522+
- "image generation" - generating images from a text prompt.
523523
524524
Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
525525
"""

vision_agent/agent/vision_agent_prompts_v2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def test_detect_dogs():
5555
- Pose estimation
5656
- Visual question answering for both images and videos
5757
- Activity recognition in videos
58-
- Image inpainting
58+
- Image generation
5959
6060
How can I help you?</response>
6161
--- END EXAMPLE2 ---

vision_agent/tools/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .planner_tools import judge_od_results
88
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
99
from .tools import (
10-
activity_recognition,
10+
agentic_activity_recognition,
1111
agentic_document_extraction,
1212
agentic_object_detection,
1313
agentic_sam2_instance_segmentation,
@@ -30,7 +30,6 @@
3030
florence2_ocr,
3131
florence2_sam2_instance_segmentation,
3232
florence2_sam2_video_tracking,
33-
flux_image_inpainting,
3433
gemini_image_generation,
3534
generate_pose_image,
3635
get_tools,

0 commit comments

Comments
 (0)